In [1]:
import sklearn
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('data/cleaned_text.csv')

In [3]:
df.head()

Unnamed: 0,category,transformed text
0,4,tv futur hand viewer home theatr system plasma...
1,0,worldcom boss leav book alon former worldcom b...
2,3,tiger wari farrel gambl leicest say rush make ...
3,3,yead face newcastl fa cup premiership side new...
4,1,ocean twelv raid box offic ocean twelv crime c...


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features=2000)

In [7]:
x= cv.fit_transform(df['transformed text']).toarray()
y= df['category'].values

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2, random_state = 42)

In [9]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier

In [20]:
model_params= {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 5, 10, 15, 20],
            'kernel': ['linear', 'poly', 'rbf']
        }
    },
    'random_forest':{
        'model': RandomForestClassifier(),
        'params':{
            'n_estimators': range(1,6),
            'criterion': ['gini', 'entropy', 'log_loss']
        }
    },
    'logistic_regression':{
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params':{
            'C': [1, 5, 10, 15, 10],
            'penalty': ['l1','l2'],
        }
    },
    'gaussian_nb':{
        'model': GaussianNB(priors=None, var_smoothing=1e-09),
        'params':{
            # No hyperparameters to tune for this model type yet...
        }
    },
    'multinomial_nb':{
        'model': MultinomialNB(alpha=1),
        'params':{
            #'alpha' : np.linspace(.01,.99, num = 3)
        }
    },
    'bernoulli_nb':{
        'model': BernoulliNB(alpha=1, binarize=0.0, fit_prior=True, class_prior=None),
        'params':{
            #'alpha' : np.linspace(.01,.99, num = 3)
        }
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(random_state= 0),
        'params':{
            'criterion': ['gini', 'entropy'],
            'max_depth':[1,2,3,4]
        }
    }
}

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
scores= []

for model_name, mp in model_params.items():
    clf= GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

25 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/joblib/parallel.py", line 1863, in __call__
    return output if self.return_generator else list(output)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/joblib/parallel.py", l

In [23]:
df= pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.963529,"{'C': 5, 'kernel': 'rbf'}"
1,random_forest,0.854118,"{'criterion': 'gini', 'n_estimators': 5}"
2,logistic_regression,0.967059,"{'C': 1, 'penalty': 'l2'}"
3,gaussian_nb,0.893529,{}
4,multinomial_nb,0.968824,{}
5,bernoulli_nb,0.952941,{}
6,decision_tree,0.615882,"{'criterion': 'gini', 'max_depth': 4}"
