In [1]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [2]:
df = pd.read_csv('../Data/CleanedData.csv')

In [3]:
df.head()

Unnamed: 0,subject,transformed text
0,1,donald trump wish american happi new year leav...
1,1,hous intellig committe chairman devin nune go ...
2,1,friday reveal former milwauke sheriff david cl...
3,1,christma day donald trump announc would back w...
4,1,pope franci use annual christma day messag reb...


In [4]:
df.shape

(43125, 2)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=5000)

In [6]:
df.isnull().sum()

subject             0
transformed text    0
dtype: int64

In [7]:
X= tfidf.fit_transform(df['transformed text']).toarray()
y= df['subject'].values

In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=42)

In [9]:
classifiers = {
    'Multinomial Naive Bayes': {
        'model': MultinomialNB(),
        'param_grid': {
            'alpha': [0.001,0.01],      # Additive smoothing parameter
        }
    },
    'Gaussian Naive Bayes': {
        'model': GaussianNB(),
        'param_grid': {}  # Gaussian Naive Bayes has no hyperparameters to tune
    },
    'Bernoulli Naive Bayes': {
        'model': BernoulliNB(),
        'param_grid': {
            'alpha': [0.001, 0.01],      # Additive smoothing parameter
            'binarize': [0.0,0.00001]    # Threshold for binarizing input features
        }
    }
}

In [10]:
best_model_name = None
best_model = None
best_accuracy = 0.0

# Perform Grid Search CV for each classifier
for clf_name, clf_params in classifiers.items():
    model = clf_params['model']
    param_grid = clf_params['param_grid']
    
    # Create a GridSearchCV object with the model and parameter grid
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    
    # Fit the grid search to your training data
    grid_search.fit(X_train, y_train)
    
    # Get the best hyperparameters and corresponding accuracy score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    
    # Check if the current model is the best so far
    if best_score > best_accuracy:
        best_accuracy = best_score
        best_model_name = clf_name
        best_model = grid_search.best_estimator_
    
    # Print the results for the current classifier
    print(f"{clf_name} - Best Hyperparameters:", best_params)
    print(f"{clf_name} - Best Accuracy Score:", best_score)
    print("\n")



Multinomial Naive Bayes - Best Hyperparameters: {'alpha': 0.001}
Multinomial Naive Bayes - Best Accuracy Score: 0.710927536231884


Gaussian Naive Bayes - Best Hyperparameters: {}
Gaussian Naive Bayes - Best Accuracy Score: 0.5283188405797101


Bernoulli Naive Bayes - Best Hyperparameters: {'alpha': 0.001, 'binarize': 0.0}
Bernoulli Naive Bayes - Best Accuracy Score: 0.696695652173913




In [11]:
# Print the best model among all classifiers
print(f"Best Model: {best_model_name}")
print(f"Best Model Accuracy: {best_accuracy}")

# Use the best model to make predictions on the test data
y_pred = best_model.predict(X_test)

# Evaluate the best model's performance on the test data
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Best Model - Test Accuracy:", accuracy)
print(f"Best Model - Confusion Matrix:\n", conf_matrix)
print(f"Best Model - Classification Report:\n", class_report)

Best Model: Multinomial Naive Bayes
Best Model Accuracy: 0.710927536231884
Best Model - Test Accuracy: 0.713159420289855
Best Model - Confusion Matrix:
 [[   7   45    0   38  185   12]
 [   1 1432    0   87  351    5]
 [   2   16   91   11   35   10]
 [  10  219    0  282  300   24]
 [  19  355    1  340 2505  255]
 [   0    1    0   11  141 1834]]
Best Model - Classification Report:
               precision    recall  f1-score   support

           0       0.18      0.02      0.04       287
           1       0.69      0.76      0.73      1876
           2       0.99      0.55      0.71       165
           3       0.37      0.34      0.35       835
           4       0.71      0.72      0.72      3475
           5       0.86      0.92      0.89      1987

    accuracy                           0.71      8625
   macro avg       0.63      0.55      0.57      8625
weighted avg       0.70      0.71      0.70      8625



## Bernoulli Naive Bayes gives the best accuracy, so we will proceed with it