In [1]:
import sklearn
import pandas as pd
import numpy as np

In [2]:
# Load the pre-cleaned corpus from disk into a DataFrame.
# NOTE(review): the recorded preview of this frame shows an 'Unnamed: 0'
# column, which suggests the CSV was saved together with its index -- confirm,
# and consider passing index_col=0 if that column is not needed.
csv_path = 'data/CleanedData.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows (the default for head) as the cell output.
df.head(n=5)

Unnamed: 0,subject,transformed text
0,1,donald trump wish american happi new year leav...
1,1,hous intellig committe chairman devin nune go ...
2,1,friday reveal former milwauke sheriff david cl...
3,1,christma day donald trump announc would back w...
4,1,pope franci use annual christma day messag reb...


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer whose vocabulary is capped at 2000 terms.
vocabulary_cap = 2000
cv = CountVectorizer(max_features=vocabulary_cap)

In [5]:
# Build the feature matrix and label vector from the loaded frame.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so its vocabulary is learned from rows that later end up
# in the test set -- confirm this is acceptable for the evaluation.
corpus = df['transformed text']
term_counts = cv.fit_transform(corpus)
# Densify the sparse count matrix into a regular NumPy array.
x = term_counts.toarray()
# Labels come from the 'subject' column.
y = df['subject'].values

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier

In [8]:
# Candidate classifiers and the hyperparameter grids to search for each one.
# Every entry maps a display name to:
#   'model'  -- an unfitted estimator instance, and
#   'params' -- the parameter grid handed to GridSearchCV for that estimator.
# An empty 'params' dict leaves nothing to tune, so that estimator is scored
# with the settings given in its constructor.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 5, 10, 15, 20],
            'kernel': ['linear', 'poly', 'rbf']
        }
    },
    'random_forest': {
        # Seeded (like the decision tree below) so repeated runs of the search
        # give the same scores.
        'model': RandomForestClassifier(random_state=0),
        'params': {
            'n_estimators': range(1, 6),
            # NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as
            # the same Shannon information-gain criterion -- confirm both are
            # wanted in the grid.
            'criterion': ['gini', 'entropy', 'log_loss']
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            # Same C values as the SVM grid; the list previously repeated 10
            # and never tried 20.
            'C': [1, 5, 10, 15, 20],
            'penalty': ['l1', 'l2'],
        }
    },
    'gaussian_nb': {
        'model': GaussianNB(priors=None, var_smoothing=1e-09),
        'params': {
            # No hyperparameters to tune for this model type yet...
        }
    },
    'multinomial_nb': {
        'model': MultinomialNB(alpha=1),
        'params': {
            #'alpha' : np.linspace(.01,.99, num = 3)
        }
    },
    'bernoulli_nb': {
        'model': BernoulliNB(alpha=1, binarize=0.0, fit_prior=True, class_prior=None),
        'params': {
            #'alpha' : np.linspace(.01,.99, num = 3)
        }
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(random_state=0),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [1, 2, 3, 4]
        }
    }
}

In [9]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Run a 5-fold cross-validated grid search for every candidate model and
# record its best mean CV score together with the winning parameters.
scores = []

for model_name, spec in model_params.items():
    clf = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        cv=5,
        return_train_score=False,
    )
    # Fit on the training split only; the held-out test split is not touched.
    clf.fit(x_train, y_train)

    summary_row = dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    )
    scores.append(summary_row)

In [None]:
# Tabulate the per-model search results, one row per model.
# NOTE(review): this rebinds `df`, which earlier held the loaded dataset, so
# re-running the vectorisation cell after this one will fail on a missing
# 'transformed text' column -- consider a distinct name if nothing else
# depends on `df` being the results table.
summary_columns = ['model', 'best_score', 'best_params']
df = pd.DataFrame(data=scores, columns=summary_columns)
df