In [24]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import f1_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('../Data/CleanedData.csv')

In [3]:
df.head()

Unnamed: 0,subject,transformed text
0,1,donald trump wish american happi new year leav...
1,1,hous intellig committe chairman devin nune go ...
2,1,friday reveal former milwauke sheriff david cl...
3,1,christma day donald trump announc would back w...
4,1,pope franci use annual christma day messag reb...


In [4]:
df.shape

(43125, 2)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=5000)

In [6]:
df.isnull().sum()

subject             0
transformed text    0
dtype: int64

In [7]:
X= tfidf.fit_transform(df['transformed text'])
y= df['subject'].values

In [8]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)




In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X_resampled,y_resampled ,test_size=0.2,random_state=42)

In [15]:
from collections import Counter

label_distribution = Counter(y_resampled)

print(label_distribution)

Counter({1: 17560, 4: 17560, 0: 17560, 3: 17560, 2: 17560, 5: 17560})


In [26]:
def objective(params):
    if params['classifier']['type'] == 'MultinomialNB':
        classifier = MultinomialNB(alpha=params['classifier']['alpha'])
    elif params['classifier']['type'] == 'GaussianNB':
        classifier = GaussianNB(var_smoothing=params['classifier']['var_smoothing'])
    else:
        classifier = BernoulliNB(alpha=params['classifier']['alpha'], binarize=params['classifier']['binarize'])
    
    # Use cross-validation to evaluate the classifier
    scores = cross_val_score(classifier, X_train, y_train, cv=5, scoring='accuracy')
    
    # Return the negative accuracy (Hyperopt minimizes the objective function)
    return -scores.mean()


In [27]:
space = {
    'classifier': hp.choice('classifier', [
        {
            'type': 'MultinomialNB',
            'alpha': hp.loguniform('alpha_mnb', 1e-5, 1),
        },
        {
            'type': 'GaussianNB',
            'var_smoothing': hp.loguniform('var_smoothing', 1e-9, 1e-1),
        },
        {
            'type': 'BernoulliNB',
            'alpha': hp.loguniform('alpha_bnb', 1e-5, 1),
            'binarize': hp.uniform('binarize_bnb', 0.0, 1.0),
        }
    ])
}


In [28]:
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, rstate=np.random.default_rng(42))

print("Best Hyperparameters:")
print(best)

  8%|▋        | 4/50 [00:02<00:23,  1.95trial/s, best loss: -0.7339953514468988]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 16%|█▍       | 8/50 [00:04<00:23,  1.77trial/s, best loss: -0.7339953514468988]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 26%|██      | 13/50 [00:07<00:20,  1.80trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 30%|██▍     | 15/50 [00:08<00:17,  1.95trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 36%|██▉     | 18/50 [00:10<00:16,  1.96trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 52%|████▏   | 26/50 [00:15<00:13,  1.81trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 54%|████▎   | 27/50 [00:15<00:10,  2.18trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 56%|████▍   | 28/50 [00:16<00:08,  2.55trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 58%|████▋   | 29/50 [00:16<00:07,  2.92trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 60%|████▊   | 30/50 [00:16<00:06,  3.26trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 62%|████▉   | 31/50 [00:16<00:05,  3.53trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 64%|█████   | 32/50 [00:17<00:04,  3.77trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 66%|█████▎  | 33/50 [00:17<00:04,  3.93trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 68%|█████▍  | 34/50 [00:17<00:04,  3.93trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 70%|█████▌  | 35/50 [00:17<00:03,  4.05trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 72%|█████▊  | 36/50 [00:18<00:03,  4.14trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 74%|█████▉  | 37/50 [00:18<00:03,  4.23trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 76%|██████  | 38/50 [00:18<00:02,  4.27trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 78%|██████▏ | 39/50 [00:18<00:02,  4.28trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 80%|██████▍ | 40/50 [00:18<00:02,  4.32trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 82%|██████▌ | 41/50 [00:19<00:02,  4.32trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 84%|██████▋ | 42/50 [00:19<00:01,  4.30trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 86%|██████▉ | 43/50 [00:19<00:01,  4.32trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 88%|███████ | 44/50 [00:19<00:01,  4.34trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 90%|███████▏| 45/50 [00:20<00:01,  4.36trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 92%|███████▎| 46/50 [00:20<00:00,  4.37trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 94%|███████▌| 47/50 [00:20<00:00,  4.39trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 96%|███████▋| 48/50 [00:20<00:00,  4.39trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

 98%|███████▊| 49/50 [00:21<00:00,  4.36trial/s, best loss: -0.7340902639865547]

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

100%|████████| 50/50 [00:21<00:00,  2.35trial/s, best loss: -0.7340902639865547]
Best Hyperparameters:
{'alpha_mnb': 1.0421333860331026, 'classifier': 0}


5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 246, in fit
    X, y, np.unique(y), _refit=True, sample_weight=sample_weight
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
  File "/home/roronoa/.conda/envs/tf/lib/python3.7/site-packag

In [30]:
mnb = MultinomialNB( alpha = 1.0421333860331026)
mnb.fit(X_train,y_train)
y_pred = mnb.predict(X_test)
f1= f1_score(y_test,y_pred, average='weighted')
print(f1)

0.7300965272191351


## Multinomial Naive Bayes gives the best accuracy, so we will proceed with it

# Using Word2Vec vectors for training

In [26]:
df = pd.read_csv('word2vecData.csv')


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43125 entries, 0 to 43124
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  43125 non-null  int64 
 1   vec      43125 non-null  object
dtypes: int64(1), object(1)
memory usage: 674.0+ KB


In [31]:
import ast

# Convert the string representation to a list
df['vec'] = df['vec'].apply(ast.literal_eval)

# Verify the data type of the column
print(type(df['vec'].iloc[0]))

<class 'list'>


In [32]:
X = df['vec'].to_list()
y = df['subject'].to_list()

In [33]:
X[0]

[0.017181913731461863,
 0.054051674018471926,
 0.02501760364252295,
 0.1145188498631709,
 -0.07854115222133486,
 0.013064093508962857,
 0.03582854190115201,
 -0.06757971122439972,
 0.07362481025652697,
 0.06868105958410575,
 -0.04545136360125353,
 -0.07671412236272952,
 -0.0677699288405941,
 0.027317779885847018,
 -0.07981907041732875,
 0.07597549352268715,
 0.0445443708344368,
 0.0843538618357168,
 -0.014792016670528778,
 -0.10628513831876765,
 0.006775034349516961,
 0.018582618842690676,
 0.07434383489317813,
 -0.03250578971905897,
 0.025516833289194914,
 0.015695469527594787,
 -0.06637474641961566,
 0.05962376136564265,
 0.06179611292262535,
 -0.01286162899038886,
 0.005549328475348694,
 0.011451300928148172,
 -0.053575073931850285,
 -0.01066235903292726,
 0.017091136867717162,
 0.004068493169579802,
 0.01690352703891905,
 0.029450562040684587,
 0.033840976865951625,
 0.06556245836160951,
 0.09580701892658816,
 -0.0669507387667726,
 0.11927528165828037,
 0.04103657350701801,
 -0.020

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, classification_report

mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# Make predictions on the test data
y_pred = mnb.predict(X_test)

# Calculate weighted F1 score
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the classification report, including precision, recall, and F1 score for each class
class_report = classification_report(y_test, y_pred)

print("Weighted F1 Score:",f1)
print("Classification Report:\n", class_report)

Weighted F1 Score: 0.7303612414248749
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.64      0.65      3597
           1       0.72      0.78      0.75      3496
           2       0.94      0.89      0.92      3520
           3       0.55      0.62      0.58      3443
           4       0.65      0.52      0.58      3494
           5       0.86      0.95      0.90      3522

    accuracy                           0.73     21072
   macro avg       0.73      0.73      0.73     21072
weighted avg       0.73      0.73      0.73     21072

