### ingest data

In [1]:
import pandas as pd

In [2]:
# Load the preprocessed dataset and preview its first three rows.
# NOTE(review): the path ends in .parquet but is read with read_csv; the
# "Unnamed: 0" column in the preview suggests the file is really CSV text that
# was written together with its index. Confirm the on-disk format and align
# the extension (or the reader) with it.
data = pd.read_csv('../data/data_preprocessed.parquet', engine='pyarrow')
data.head(3)

Unnamed: 0,id,topic,question_title,question_content,best_answer,topic_name
0,0,4,why doesn t an optical mouse work on a glass t...,or even on some surfaces?,Optical mice use an LED and a camera to rapidl...,Computers & Internet
1,1,5,what is the best off road motorcycle trail,long-distance trail throughout CA,i hear that the mojave road is amazing!<br />\...,Sports
2,2,2,what is trans fat how to reduce that,I heard that tras fat is bad for the body. Wh...,Trans fats occur in manufactured foods during ...,Health


### setup

vectorizer

In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Bag-of-words vectorizer built with library defaults, displayed for inspection.
# NOTE(review): no later cell in this notebook references `count_vectorizer`.
count_vectorizer = CountVectorizer()
count_vectorizer

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"


tfidf

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

In [6]:
# TF-IDF re-weighting step with L2 normalisation and smoothed IDF, displayed for inspection.
# NOTE(review): `tfidf` is rebound to a TfidfVectorizer in the "build model"
# section before it is used, so this transformer instance is never consumed.
tfidf = TfidfTransformer(use_idf=True, norm="l2", smooth_idf=True)
tfidf

0,1,2
,norm,'l2'
,use_idf,True
,smooth_idf,True
,sublinear_tf,False


tokenizer

In [7]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

stemming

In [8]:
from nltk.stem.porter import PorterStemmer

In [9]:
# Shared Porter stemmer instance; tokenizer_porter below calls its .stem() on every token.
porter_stemming = PorterStemmer()
porter_stemming

<PorterStemmer>

In [10]:
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token.

    Relies on the notebook-level ``porter_stemming`` instance.
    """
    return list(map(porter_stemming.stem, text.split()))

stopwords

In [11]:
%%capture
# Fetch the NLTK stop-word corpus needed by the next cell; the %%capture magic
# above captures this cell's output instead of displaying it.
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Seu
[nltk_data]     Computador\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
from nltk.corpus import stopwords


# English stop-word list; offered as a candidate for `vect__stop_words` in the grid search.
stop = stopwords.words("english")
stop[:3]

['a', 'about', 'above']

### training model

split data

In [13]:
from sklearn.model_selection import train_test_split


# Features are the question titles; the target is the `topic` column.
X, y = data["question_title"], data["topic"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [14]:
# Sanity check: training features and labels must have the same length.
print(X_train.shape, y_train.shape)

(560000,) (560000,)


In [15]:
# Sanity check: test features and labels must have the same length.
print(X_test.shape, y_test.shape)

(140000,) (140000,)


build model

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [17]:
# Vectorizer used as the 'vect' step of the pipeline. Case folding is disabled;
# accent stripping and the preprocessor are explicitly left unset.
# NOTE(review): this rebinds `tfidf`, which an earlier cell assigned to a
# TfidfTransformer; a distinct name would avoid the shadowing.
tfidf = TfidfVectorizer(
    lowercase=False,
    preprocessor=None,
    strip_accents=None,
)

tfidf

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [18]:
# Two sub-grids, four candidates each (eight in total):
#   1. TF-IDF weighted features, comparing the plain and the Porter-stemming tokenizer.
#   2. IDF and normalisation switched off, comparing with and without stop-word removal.
# Both sub-grids try C = 1.0 and C = 10.0 with an L2 penalty.
small_param_grid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'clf__penalty': ['l2'],
        'clf__C': [1.0, 10.0],
    },
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer],
        'vect__use_idf': [False],
        'vect__norm': [None],
        'clf__penalty': ['l2'],
        'clf__C': [1.0, 10.0],
    },
]


small_param_grid[0]

{'vect__ngram_range': [(1, 1)],
 'vect__stop_words': [None],
 'vect__tokenizer': [<function __main__.tokenizer(text)>,
  <function __main__.tokenizer_porter(text)>],
 'clf__penalty': ['l2'],
 'clf__C': [1.0, 10.0]}

In [19]:
# Two-step pipeline: 'vect' turns raw titles into features, 'clf' classifies them.
# The step names are the prefixes used by the keys of `small_param_grid`.
# NOTE(review): `topic` has more than two classes; recent scikit-learn releases
# reportedly deprecate solver='liblinear' for multiclass targets — check for a
# FutureWarning on the installed version and consider OneVsRestClassifier or
# another solver.
estimator = Pipeline(
    steps=[
        ('vect', tfidf),
        ('clf', LogisticRegression(solver='liblinear')),
    ]
)

estimator

0,1,2
,steps,"[('vect', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [20]:
# Exhaustive search over `small_param_grid`, scored by accuracy with 5-fold
# cross-validation; n_jobs=-1 requests all available cores.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=small_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

grid_search

0,1,2
,estimator,Pipeline(step...liblinear'))])
,param_grid,"[{'clf__C': [1.0, 10.0], 'clf__penalty': ['l2'], 'vect__ngram_range': [(1, ...)], 'vect__stop_words': [None], ...}, {'clf__C': [1.0, 10.0], 'clf__penalty': ['l2'], 'vect__ngram_range': [(1, ...)], 'vect__norm': [None], ...}]"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [21]:
# Cross-validate every candidate on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits




0,1,2
,estimator,Pipeline(step...liblinear'))])
,param_grid,"[{'clf__C': [1.0, 10.0], 'clf__penalty': ['l2'], 'vect__ngram_range': [(1, ...)], 'vect__stop_words': [None], ...}, {'clf__C': [1.0, 10.0], 'clf__penalty': ['l2'], 'vect__ngram_range': [(1, ...)], 'vect__norm': [None], ...}]"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,<function tok...001DDC2D05BC0>
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [22]:
# Hyper-parameter combination with the highest mean cross-validated accuracy.
print(f'Best parameter set: {grid_search.best_params_}')

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer_porter at 0x000001DDC2D05BC0>}


In [23]:
# Mean cross-validated accuracy of the best candidate, rounded to two decimals.
print(f'CV Accuracy: {grid_search.best_score_:.2f}')

CV Accuracy: 0.64


In [24]:
# Pipeline configured with the best parameters found by the search.
best_estimator = grid_search.best_estimator_
best_estimator

0,1,2
,steps,"[('vect', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,<function tok...001DDC2D05BC0>
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [25]:
# Predict topics for the held-out test titles and preview the first three predictions.
predictions = best_estimator.predict(X_test)
predictions[:3]

array([8, 8, 9])

In [41]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out test split; this value is later logged to MLflow.
accuracy = accuracy_score(y_test, predictions)
accuracy

0.6477928571428572

mlflow

In [28]:
from mlflow.models import infer_signature

In [29]:
# Infer an MLflow input/output schema from the training titles and training labels
# (the outputs come from y_train, not from model predictions).
# NOTE(review): `signature` is not passed to any log_model call in this notebook —
# confirm whether it is meant to be attached to the logged model.
signature = infer_signature(X_train, y_train)
signature



inputs: 
  ['question_title': string (required)]
outputs: 
  ['topic': long (required)]
params: 
  None

In [None]:
import pickle
from pathlib import Path


# Persist the tuned pipeline to disk. The parent directory is created first so
# that a fresh checkout without ../model does not fail with FileNotFoundError.
# NOTE(review): pickle stores functions by reference, so if the selected
# tokenizer is one of the functions defined in this notebook, whoever loads the
# file must have that function available under the same module name.
model_path = Path("../model/model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open("wb") as f:
    pickle.dump(best_estimator, f)

In [None]:
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("initial experiment")

# MLflow stores parameter values as strings. A callable such as the tokenizer
# would be logged as "<function ... at 0x...>", whose memory address changes on
# every kernel start; log the callable's name instead so runs stay comparable.
loggable_params = {
    key: getattr(value, "__name__", repr(value)) if callable(value) else value
    for key, value in grid_search.best_params_.items()
}

with mlflow.start_run():
    mlflow.log_params(loggable_params)
    mlflow.log_metric("accuracy", accuracy)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # Register the model only when the tracking URI is not a plain file store
    # (presumably because a file store offers no model registry — confirm).
    # NOTE(review): the registered name mentions RandomForest, but the logged
    # pipeline's classifier is LogisticRegression. The name is kept unchanged so
    # existing registry entries still resolve; consider renaming it.
    log_model_kwargs = {}
    if tracking_url_type_store != "file":
        log_model_kwargs["registered_model_name"] = "Best RandomForest Model"

    mlflow.sklearn.log_model(best_estimator, "model", **log_model_kwargs)


2025/11/05 23:59:11 INFO mlflow.tracking.fluent: Experiment with name 'initial experiment' does not exist. Creating a new experiment.
