### ingest data

In [164]:
import pandas as pd

In [165]:
data = pd.read_parquet('../data/data_preprocessed.parquet', engine='pyarrow')
data.head(3)

Unnamed: 0,id,topic,question_title,question_content,best_answer,has_emoji,topic_name,clean_question_title,clean_question_content,clean_best_answer
0,700000,3,Are Monte verde golden toads extict or endange...,I have to do a report and model on an endanger...,"Since 1989, not a single Golden Toad has been ...",False,Education & Reference,monte verde golden toads extict endanger,report model endanger amphibiani want montever...,since 1989 single golden toad see anywhere wor...
1,700001,6,"looking for a book titled "" Medical Filing"" by...",,amazon.com \nhttp://www.amazon.com/gp/search/r...,False,Business & Finance,look book title medical file theresa claeys kn...,,amazoncom nhttpwwwamazoncomgpsearchrefbrsshs10...
2,700002,7,ShoulD i StoP?,Should i stop asking dumb questions? \n\nAm i ...,"why should you stop, it is your life do what y...",False,Entertainment & Music,stop,stop ask dumb question nnam bug youare annoy w...,stop life want love ask dumb question rock


In [166]:
# data = data.sample(10000)
# data.head(3)

### setup

In [167]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [168]:
count_vectorizer = CountVectorizer()
count_vectorizer

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"


tfidf

In [169]:
from sklearn.feature_extraction.text import TfidfTransformer

In [170]:
tfidf = TfidfTransformer(use_idf=True, norm="l2", smooth_idf=True)
tfidf

0,1,2
,norm,'l2'
,use_idf,True
,smooth_idf,True
,sublinear_tf,False


tokenizer

In [171]:
from nltk.tokenize import word_tokenize

def nltk_word_tokenizer(text):
    text = word_tokenize(text)
    return text

sample="As you can see, the word tokenizer."
nltk_word_tokenizer(sample)

['As', 'you', 'can', 'see', ',', 'the', 'word', 'tokenizer', '.']

### training model

split data

In [172]:
from sklearn.model_selection import train_test_split


X = data['clean_question_title'] + " " + data['clean_question_content'] + " " + data["clean_best_answer"]
y = data["topic"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [173]:
print(X_train.shape, y_train.shape)

(1120000,) (1120000,)


In [174]:
print(X_test.shape, y_test.shape)

(280000,) (280000,)


build model

In [175]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [176]:
tfidf = TfidfVectorizer(
    strip_accents=None, 
    lowercase=True, 
    preprocessor=None, 
    tokenizer=nltk_word_tokenizer
)

tfidf

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,<function nlt...0024FF38A77E0>
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [177]:
small_param_grid = [
    {
        'clf__alpha': [0.1],
        'clf__fit_prior': [True]
    }
]

small_param_grid

[{'clf__alpha': [0.1], 'clf__fit_prior': [True]}]

In [178]:
from sklearn.naive_bayes import MultinomialNB


estimator = Pipeline(
    [
        ('vect', tfidf), ('clf', MultinomialNB())
    ]
)

estimator

0,1,2
,steps,"[('vect', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,<function nlt...0024FF38A77E0>
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [179]:
grid_search = GridSearchCV(
    estimator, small_param_grid, scoring='accuracy', cv=3, verbose=1, n_jobs=-1
)

grid_search

0,1,2
,estimator,Pipeline(step...inomialNB())])
,param_grid,"[{'clf__alpha': [0.1], 'clf__fit_prior': [True]}]"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,<function nlt...0024FF38A77E0>
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [180]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits




0,1,2
,estimator,Pipeline(step...inomialNB())])
,param_grid,"[{'clf__alpha': [0.1], 'clf__fit_prior': [True]}]"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,<function nlt...0024FF38A77E0>
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,0.1
,force_alpha,True
,fit_prior,True
,class_prior,


In [181]:
print(f'Best parameter set: {grid_search.best_params_}')

Best parameter set: {'clf__alpha': 0.1, 'clf__fit_prior': True}


In [182]:
print(f'CV Accuracy: {grid_search.best_score_:.2f}')

CV Accuracy: 0.70


In [183]:
best_estimator = grid_search.best_estimator_
best_estimator

0,1,2
,steps,"[('vect', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,<function nlt...0024FF38A77E0>
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,0.1
,force_alpha,True
,fit_prior,True
,class_prior,


In [184]:
predictions = best_estimator.predict(X_test)
predictions[:3]

array([5, 0, 1])

In [185]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predictions)
accuracy

0.7013357142857143

In [186]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.59      0.56      0.57     27858
           1       0.73      0.73      0.73     28107
           2       0.73      0.80      0.77     27917
           3       0.60      0.43      0.50     27694
           4       0.84      0.86      0.85     28043
           5       0.88      0.84      0.86     27976
           6       0.58      0.50      0.54     28072
           7       0.72      0.68      0.70     28252
           8       0.60      0.82      0.69     27967
           9       0.73      0.78      0.75     28114

    accuracy                           0.70    280000
   macro avg       0.70      0.70      0.70    280000
weighted avg       0.70      0.70      0.70    280000



mlflow

In [28]:
from mlflow.models import infer_signature

In [29]:
signature = infer_signature(X_train, y_train)
signature



inputs: 
  ['question_title': string (required)]
outputs: 
  ['topic': long (required)]
params: 
  None

In [None]:
import pickle


with open("../model/model_bays.pkl", "wb") as f:
    pickle.dump(best_estimator, f)

In [42]:
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("initial experiment")

with mlflow.start_run():
    mlflow.log_params(grid_search.best_params_) 
    mlflow.log_metric("accuracy", accuracy)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(best_estimator, "model", registered_model_name="Best RandomForest Model")
    else:
        mlflow.sklearn.log_model(best_estimator, "model")


2025/11/05 23:59:11 INFO mlflow.tracking.fluent: Experiment with name 'initial experiment' does not exist. Creating a new experiment.
Successfully registered model 'Best RandomForest Model'.
2025/11/05 23:59:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best RandomForest Model, version 1
Created version '1' of model 'Best RandomForest Model'.


🏃 View run mercurial-vole-998 at: http://127.0.0.1:5000/#/experiments/416290560738877779/runs/ebd96163dc284cf4aa032bd34ed05965
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/416290560738877779
