In [1]:
import sklearn
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('cleaned_data.csv')

In [3]:
df.head()

Unnamed: 0,category,transformed text
0,4,tv futur hand viewer home theatr system plasma...
1,0,worldcom boss leav book alon former worldcom b...
2,3,tiger wari farrel gambl leicest say rush make ...
3,3,yead face newcastl fa cup premiership side new...
4,1,ocean twelv raid box offic ocean twelv crime c...


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features=2000)

In [5]:
x= cv.fit_transform(df['transformed text']).toarray()
y= df['category'].values

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2)

In [15]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier

In [16]:
model_params= {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 5, 10, 15, 20],
            'kernel': ['linear', 'poly', 'rbf']
        }
    },
    'random_forest':{
        'model': RandomForestClassifier(),
        'params':{
            'n_estimators': range(1,6),
            'criterion': ['gini', 'entropy', 'log_loss']
        }
    },
    'logistic_regression':{
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params':{
            'C': [1, 5, 10, 15, 10],
            'penalty': ['l1','l2'],
        }
    },
    'gaussian_nb':{
        'model': GaussianNB(priors=None, var_smoothing=1e-09),
        'params':{
            # No hyperparameters to tune for this model type yet...
        }
    },
    'multinomial_nb':{
        'model': MultinomialNB(alpha=1, force_alpha='warn', ),
        'params':{
            #'alpha' : np.linspace(.01,.99, num = 3)
        }
    },
    'bernoulli_nb':{
        'model': BernoulliNB(alpha=1, force_alpha='warn', binarize=0.0, fit_prior=True, class_prior=None),
        'params':{
            #'alpha' : np.linspace(.01,.99, num = 3)
        }
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(random_state= 0),
        'params':{
            'criterion': ['gini', 'entropy'],
            'max_depth':[1,2,3,4]
        }
    }
}

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
scores= []

for model_name, mp in model_params.items():
    clf= GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

In [19]:
df= pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.951968,"{'C': 5, 'kernel': 'rbf'}"
1,random_forest,0.837488,"{'criterion': 'log_loss', 'n_estimators': 5}"
2,logistic_regression,0.959165,"{'C': 1, 'penalty': 'l2'}"
3,gaussian_nb,0.881523,{}
4,multinomial_nb,0.962368,{}
5,bernoulli_nb,0.943158,{}
6,decision_tree,0.616472,"{'criterion': 'gini', 'max_depth': 4}"
