# Model Training and Hyperparameter Tuning 

In [1]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("Toxic-Comment-Classification-Challenge-master/data/train.csv")
labels = data.iloc[:,2:]

In [10]:
"""
this cell splits the data into training test and validation
"""

import pickle
with open("project_variables/Xtrain.pickle",'rb') as file:
    comments = pickle.load(file)
Xtrain,XTest,Ytrain,YTest = train_test_split(comments,labels,test_size=0.2,random_state=42)
Xtest,Xval,Ytest,Yval = train_test_split(XTest,YTest,test_size=0.5,random_state=42)

In [11]:
Xtrain.shape,Ytrain.shape

((127656, 19109), (127656, 6))

In [12]:
Xtest.shape,Ytest.shape

((15957, 19109), (15957, 6))

In [13]:
Xval.shape,Yval.shape

((15958, 19109), (15958, 6))

In [14]:
NBModel = MultinomialNB()
SVCModel = LinearSVC()
LRModel = LogisticRegression()

In [15]:
cols = list(labels.columns)
cols

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [19]:
def modelTraining(model):
        for col in cols:
            model.fit(Xtrain,Ytrain[col])
            predictions = model.predict(Xtrain)
            print(f"""
            In {col}
            actual = {data[col][1900]}
            predicted = {predictions[1900]}
            """)

In [17]:
models = [NBModel,SVCModel,LRModel]

In [20]:
np.random.seed(42)
for model in models :
    print(f"in {model.__class__.__name__.split('.')[-1]}")
    modelTraining(model)

in MultinomialNB

            In toxic
            actual = 1
            predicted = 0
            

            In severe_toxic
            actual = 0
            predicted = 0
            

            In obscene
            actual = 1
            predicted = 0
            

            In threat
            actual = 0
            predicted = 0
            

            In insult
            actual = 1
            predicted = 0
            

            In identity_hate
            actual = 0
            predicted = 0
            
in LinearSVC





            In toxic
            actual = 1
            predicted = 0
            





            In severe_toxic
            actual = 0
            predicted = 0
            





            In obscene
            actual = 1
            predicted = 0
            





            In threat
            actual = 0
            predicted = 0
            





            In insult
            actual = 1
            predicted = 0
            





            In identity_hate
            actual = 0
            predicted = 0
            
in LogisticRegression

            In toxic
            actual = 1
            predicted = 0
            

            In severe_toxic
            actual = 0
            predicted = 0
            

            In obscene
            actual = 1
            predicted = 0
            

            In threat
            actual = 0
            predicted = 0
            

            In insult
            actual = 1
            predicted = 0
            

            In identity_hate
            actual = 0
            predicted = 0
            


## Logistic Regression Hyperparamter Tuning

In [28]:
np.random.seed(42)
parameter_grid = {'solver': ['newton-cg', 'lbfgs', 'liblinear'],
                  'class_weight': [None, 'balanced']}
cross_validation = StratifiedKFold(n_splits=5)
lrGrid = GridSearchCV(estimator=LRModel,
                      param_grid = parameter_grid,
                     cv= cross_validation,
                     scoring="f1")
for col in cols:
    print(f"in {col}")
    lrGrid.fit(Xtrain,Ytrain[col])
    print(f"Best paramters : {lrGrid.best_params_}")

in toxic
Best paramters : {'class_weight': 'balanced', 'solver': 'liblinear'}
in severe_toxic
Best paramters : {'class_weight': 'balanced', 'solver': 'newton-cg'}
in obscene
Best paramters : {'class_weight': 'balanced', 'solver': 'liblinear'}
in threat
Best paramters : {'class_weight': 'balanced', 'solver': 'newton-cg'}
in insult
Best paramters : {'class_weight': 'balanced', 'solver': 'liblinear'}
in identity_hate
Best paramters : {'class_weight': 'balanced', 'solver': 'newton-cg'}


## Linear SVC Hyperparameter Tuning

In [31]:
np.random.seed(42)
import warnings
warnings.filterwarnings('ignore')

parameter_grid_svc = {
    'C': [1,10,20],
    'penalty': ['l1','l2'],
    'loss':['hinge','squared_hinge']
}
svcRand = RandomizedSearchCV(
                            estimator = SVCModel,
                            param_distributions=parameter_grid_svc,
                            cv=cross_validation,
                            scoring="accuracy",
                            verbose=True
)
for col in cols:
    print(f"in {col}")
    svcRand.fit(Xtrain,Ytrain[col])
    print(f"Best paramters : {svcRand.best_params_}")

in toxic
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best paramters : {'penalty': 'l2', 'loss': 'hinge', 'C': 1}
in severe_toxic
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best paramters : {'penalty': 'l2', 'loss': 'squared_hinge', 'C': 1}
in obscene
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best paramters : {'penalty': 'l2', 'loss': 'squared_hinge', 'C': 1}
in threat
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best paramters : {'penalty': 'l2', 'loss': 'squared_hinge', 'C': 1}
in insult
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best paramters : {'penalty': 'l2', 'loss': 'hinge', 'C': 1}
in identity_hate
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best paramters : {'penalty': 'l2', 'loss': 'hinge', 'C': 1}


In [32]:
import pickle
with open("project_variables/models/MultinomialNB.pickle",'wb') as file:
    pickle.dump(NBModel,file)
with open("project_variables/models/LogisticRegression.pickle",'wb') as file:
    pickle.dump(LRModel,file)
with open("project_variables/models/LinearSVC.pickle",'wb') as file:
    pickle.dump(SVCModel,file)
with open("project_variables/models/RandomLinearSVC.pickle",'wb') as file:
    pickle.dump(lrGrid,file)
with open("project_variables/models/GridLogistic.pickle",'wb') as file:
    pickle.dump(svcRand,file)