In [1]:
import sklearn
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../Data/CleanedData.csv')

In [3]:
df.head()

Unnamed: 0,subject,transformed text
0,1,donald trump wish american happi new year leav...
1,1,hous intellig committe chairman devin nune go ...
2,1,friday reveal former milwauke sheriff david cl...
3,1,christma day donald trump announc would back w...
4,1,pope franci use annual christma day messag reb...


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features=5000)

In [5]:
x= cv.fit_transform(df['transformed text']).toarray()
y= df['subject'].values

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2, random_state = 42)

In [7]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [8]:
model_params = {
    'SVM with RBF Kernel': {
        'model': svm.SVC(),
        'params': {'C': [1.0,1.2], 'kernel': ['rbf'], 'gamma': ['auto']}
    },
    'Logistic Regression': {
        'model': LogisticRegression(),
        'params': {'C': [0.1,0.2], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}
    },
    'Random Forest': {
        'model': RandomForestClassifier(),
        'params': {'n_estimators': [200], 'max_depth': [20], 'max_features': ['auto'],
                   'min_samples_split': [2], 'min_samples_leaf': [2]}
    },
    'Decision Tree': {
        'model': DecisionTreeClassifier(),
        'params': {'criterion': ['entropy'], 'max_depth': [10], 'min_samples_split': [10],
                   'min_samples_leaf': [1]}
    },
    'Gaussian Naive Bayes': {
        'model': GaussianNB(),
        'params': {}  # No hyperparameters to tune for Gaussian Naive Bayes
    },
    'Bernoulli Naive Bayes': {
        'model': BernoulliNB(),
        'params': {'alpha': [0.1]}  # Adjust the alpha parameter for Bernoulli Naive Bayes
    },
    'MultiNomial Naive Bayes': {
        'model' : MultinomialNB(),
        'params' : {'alpha': [0.001]}
    }
    
}

In [9]:
from sklearn.model_selection import GridSearchCV
import time

In [10]:
scores= []
start = time.time()
for model_name, mp in model_params.items():
    clf= GridSearchCV(mp['model'], mp['params'], cv=2, verbose = 4)
    clf.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    print("1 epoch Done")

end = time.time()
print(start)
print(end)
print("The time of execution of above program is :",
      (end-start) * 10**3, "ms")

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2] END .....C=1.0, gamma=auto, kernel=rbf;, score=0.763 total time=13.3min
[CV 2/2] END .....C=1.0, gamma=auto, kernel=rbf;, score=0.766 total time=12.9min
[CV 1/2] END .....C=1.2, gamma=auto, kernel=rbf;, score=0.766 total time=13.2min
[CV 2/2] END .....C=1.2, gamma=auto, kernel=rbf;, score=0.770 total time=12.5min
1 epoch Done
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END C=0.1, penalty=l1, solver=liblinear;, score=0.770 total time=   1.6s
[CV 2/2] END C=0.1, penalty=l1, solver=liblinear;, score=0.778 total time=   1.6s
[CV 1/2] END C=0.1, penalty=l2, solver=liblinear;, score=0.751 total time=  10.2s




[CV 2/2] END C=0.1, penalty=l2, solver=liblinear;, score=0.759 total time=  11.3s
[CV 1/2] END C=0.2, penalty=l1, solver=liblinear;, score=0.764 total time=   1.9s
[CV 2/2] END C=0.2, penalty=l1, solver=liblinear;, score=0.771 total time=   1.9s




[CV 1/2] END C=0.2, penalty=l2, solver=liblinear;, score=0.745 total time=  11.9s




[CV 2/2] END C=0.2, penalty=l2, solver=liblinear;, score=0.755 total time=  11.0s
1 epoch Done
Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV 1/2] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.747 total time=  19.4s
[CV 2/2] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.752 total time=  19.4s
1 epoch Done
Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV 1/2] END criterion=entropy, max_depth=10, min_samples_leaf=1, min_samples_split=10;, score=0.715 total time=   5.3s
[CV 2/2] END criterion=entropy, max_depth=10, min_samples_leaf=1, min_samples_split=10;, score=0.718 total time=   5.5s
1 epoch Done
Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV 1/2] END ..................................., score=0.486 total time=   2.8s
[CV 2/2] END ..................................., score=0.499 total time=   2.8s
1 epoch Done
Fitting 2 fol

In [16]:
df= pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df =df.sort_values(by=['best_score'], ascending=False)
df

Unnamed: 0,model,best_score,best_params
1,Logistic Regression,0.773971,"{'C': 0.1, 'penalty': 'l1', 'solver': 'libline..."
0,SVM with RBF Kernel,0.768087,"{'C': 1.2, 'gamma': 'auto', 'kernel': 'rbf'}"
2,Random Forest,0.749623,"{'max_depth': 20, 'max_features': 'auto', 'min..."
3,Decision Tree,0.716493,"{'criterion': 'entropy', 'max_depth': 10, 'min..."
6,MultiNomial Naive Bayes,0.693101,{'alpha': 0.1}
5,Bernoulli Naive Bayes,0.692928,{'alpha': 0.1}
4,Gaussian Naive Bayes,0.492406,{}
