In [130]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

In [131]:
def label_encoding(Dataframe:"pd.core.frame.DataFrame",column_name_list:list):
    for column_name in column_name_list:    
        unique_value_array=np.sort(Dataframe[column_name].unique())
        for index,value in enumerate(unique_value_array):
            Dataframe[column_name].replace(value,index,inplace=True)
    return Dataframe
def one_and_hot(Dataframe:"pd.core.frame.DataFrame",column_name_list:list):
    for column_name in column_name_list:
        for value in Dataframe[column_name].unique():
            Dataframe[column_name+'='+value]=[True if x==value else False for x in Dataframe[column_name]]
        Dataframe.drop(labels=[column_name],axis=1,inplace=True)
    return Dataframe
def confusion_matrix(y_true,y_pred):
    '''
    y_true: np.array of true labels
    y_pred: np.array of prediction labels
    returns a matrix of shape (2,2) in form [[TP,FP],
                                             [FN,TN]]
    '''
    conf_matrix=np.zeros((2,2),dtype=int)
    conf_matrix[0,0]=np.sum(np.logical_and(y_pred,y_true)) ##tp
    conf_matrix[1,1]=np.sum(np.logical_and(np.logical_not(y_pred),np.logical_not(y_true))) ## tn
    conf_matrix[0,1]=np.sum(np.where(y_pred-y_true>0,1,0)) ##fp
    conf_matrix[1,0]=len(y_true)-np.sum(conf_matrix) #fn
    return conf_matrix
def summary_report(y_true,y_pred):
    '''
    y_true: binary array of actual classes
    y_pred: binary array of predicted classes
    returns: a dataframe containing the classification report
    '''
    confusion_mat=confusion_matrix(y_true,y_pred)
    accuracy=np.sum(confusion_mat.diagonal())/np.sum(confusion_mat)
    N=np.sum(confusion_mat)
    count_1=sum(confusion_mat[:,0])
    count_0=sum(confusion_mat[:,1])
    support=[count_1,count_0,N,N,N]
    precision_1=confusion_mat[0,0]/sum(confusion_mat[0])
    precision_0=confusion_mat[1,1]/sum(confusion_mat[1])
    recall_1=confusion_mat[0,0]/sum(confusion_mat[:,0])
    recall_0=confusion_mat[1,1]/sum(confusion_mat[:,1])
    f1_score_1=(2*precision_1*recall_1)/(precision_1+recall_1)
    f1_score_0=(2*precision_0*recall_0)/(precision_0+recall_0)
    f1_scores=[f1_score_1,f1_score_0,accuracy,np.average([f1_score_1,f1_score_0]),(f1_score_1*count_0+f1_score_0*count_1)/N]
    precisions=[precision_1,precision_0,'',np.average([precision_0,precision_1]),(precision_0*count_0+precision_1*count_1)/N]
    recalls=[recall_1,recall_0,'',np.average([recall_0,recall_1]),(recall_0*count_0+recall_1*count_1)/N]
    return pd.DataFrame({
            '':['1','0','accuracy','macro average','weighted average'],
            'precision':precisions,
            'recall':recalls,
            'f1-score':f1_scores,
            'support':support
            })
class CustomRandomForest:
    def __init__(self, n_estimators=10, max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y,features:int=None):
        if(type(features))==type(None):
            features=X.shape[0]
        for _ in range(self.n_estimators):
            tree = DecisionTreeClassifier(max_depth=self.max_depth)
            indices=np.random.choice(X.shape[0],replace=True,size=X.shape[0])
            features_1=random.randint(3,features)
            column_indices=np.random.choice(X.shape[1],replace=False,size=features_1)
            tree.fit(X[indices][:,column_indices], y.reshape(y.size,1)[indices])
            self.trees.append((tree,column_indices))

    def predict(self, X):
        predictions = []
        for tree,column_indices in self.trees:
            predictions.append(tree.predict(X[:,column_indices]))
        # Handle potential voting strategies (e.g., majority vote)
        final_predictions = np.mean(predictions, axis=0).round()
        return final_predictions

In [119]:
data=pd.read_csv(r'E:\Current_Workspace\Programming\SVM\IMDB Dataset.csv')
data=label_encoding(data,column_name_list=['sentiment'])
sample_size=8000
data_1=data[data.sentiment==1].iloc[:sample_size//2]
data_0=data[data.sentiment==0].iloc[:sample_size//2]
data_new=pd.concat([data_0,data_1],axis=0)
X=data_new.review
y=data_new.sentiment
X_train, X_test, y_train, y_test = train_test_split(data_new.review,data_new.sentiment, test_size=0.2, random_state=42)

vectorizing the train and test data

In [120]:
vectorizer=TfidfVectorizer()
X_train=vectorizer.fit_transform(X_train)
transformed_test=vectorizer.transform(X_test)
X_test=vectorizer.transform(X_test)
y_test=y_test.to_numpy()

Storing the vectorizer

In [5]:
# pickle.dump(vectorizer,open('vectorizer_tfid.pkl','wb'))

# SVM

In [6]:
import sklearn.metrics

In [121]:
text_classifier=SVC(C=2.15,kernel='rbf')
text_classifier.fit(X=X_train,y=y_train)
# vectorizer=pickle.load(open(r'E:\Current_Workspace\Programming\SVM\Pickle\Main Models\vectorizer_tfid.pkl','rb'))
# text_classifier=pickle.load(open(r'E:\Current_Workspace\Programming\SVM\Pickle\Main Models\classifier_SVM.pkl','rb'))
y_pred=text_classifier.predict(X_test)

In [50]:
search_space={'C':[2.15,1],
              'kernel':['linear','rbf'],}

In [51]:
GS=GridSearchCV(estimator=SVC(),
                param_grid=search_space,
                scoring='accuracy',
                cv=5,
                n_jobs=8)

In [52]:
GS.fit(X_train,y_train)

In [53]:
GS.best_estimator_.get_params()

{'C': 2.15,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [56]:
best_SVM_2=GS.best_estimator_
y_pred=best_SVM_2.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[709, 116],
       [ 87, 688]])

In [16]:
confusion_matrix(y_test,y_pred)

23.2 µs ± 817 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [59]:
summary_report(y_test,y_pred)

Unnamed: 0,Unnamed: 1,precision,recall,f1-score,support
0,1,0.859394,0.890704,0.874769,796
1,0,0.887742,0.855721,0.871438,804
2,accuracy,,,0.873125,1600
3,macro average,0.873568,0.873212,0.873103,1600
4,weighted average,0.873639,0.873125,0.873111,1600


In [63]:
y_pred=best_SVM_2.predict(X_test)

In [65]:
print(confusion_matrix(y_test,y_pred))

[[709 116]
 [ 87 688]]


In [44]:
summary_report(y_test,y_pred).style.hide().to_excel('SVM.xlsx')
summary_report(y_train,y_pred).style.hide()

Unnamed: 0,precision,recall,f1-score,support
1,1.0,1.0,1.0,3204
0,1.0,1.0,1.0,3196
accuracy,,,1.0,6400
macro average,1.0,1.0,1.0,6400
weighted average,1.0,1.0,1.0,6400


In [66]:
pickle.dump(best_SVM_2,open('classifier_SVM.pkl','wb'))

# Naive Bayes

In [113]:
naive_bayes = MultinomialNB(alpha=1.17)
naive_bayes.fit(X_train, y_train)
y_pred = naive_bayes.predict(X_test)

In [114]:
print(confusion_matrix(y_test,y_pred))

[[628  76]
 [168 728]]


In [115]:
# summary_report(y_test,y_pred).style.hide().to_excel('NaiveBayes.xlsx')
summary_report(y_test,y_pred).style.hide()

Unnamed: 0,precision,recall,f1-score,support
1,0.892045,0.788945,0.837333,796
0,0.8125,0.905473,0.856471,804
accuracy,,,0.8475,1600
macro average,0.852273,0.847209,0.846902,1600
weighted average,0.852074,0.8475,0.846854,1600


In [116]:
pickle.dump(naive_bayes,open('classifier_NaiveBayes.pkl','wb'))

# Random Forest Built In

In [67]:
search_space={'n_estimators':np.array(range(50,150,10)),
              'random_state':[42],
              'max_depth':np.concatenate([[-1],range(2,21)]),
              'criterion':['gini','entropy']}

In [132]:
rf=RandomForestClassifier()
rf = RandomForestClassifier(n_estimators=130, random_state=42,max_depth=17)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [68]:
GS=GridSearchCV(estimator=RandomForestClassifier(),
                param_grid=search_space,
                scoring='accuracy',
                cv=5,
                n_jobs=6
                )

In [70]:
GS.fit(X_train,y_train)

100 fits failed out of a total of 2000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\admin\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\admin\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 340, in fit
    self._validate_params()
  File "c:\Users\admin\anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\admin\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidPar

In [74]:
GS.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 17,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 130,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [78]:
new_rf_best=GS.best_estimator_
y_pred=new_rf_best.predict(X_test)

In [79]:
print(confusion_matrix(y_test,y_pred))

[[658 141]
 [138 663]]


In [80]:
# summary_report(y_test,y_pred).style.hide().to_excel('RandomForest.xlsx')
summary_report(y_test,y_pred).style.hide()

Unnamed: 0,precision,recall,f1-score,support
1,0.823529,0.826633,0.825078,796
0,0.827715,0.824627,0.826168,804
accuracy,,,0.825625,1600
macro average,0.825622,0.82563,0.825623,1600
weighted average,0.825633,0.825625,0.825621,1600


In [133]:
pickle.dump(rf,open('classifier_RandomForestBuiltIn.pkl','w'))

# Custom Random Forest

In [18]:
y_train=y_train.to_numpy()
custrf=CustomRandomForest(130)
custrf.fit(X_train, y_train)
cust_pred=custrf.predict(X_test)

In [19]:
confusion_matrix(y_test,cust_pred)

array([[607, 141],
       [189, 663]])

In [20]:
# summary_report(y_test,cust_pred).to_excel('RandomForestCustom.xlsx')
summary_report(y_test,cust_pred)

Unnamed: 0,Unnamed: 1,precision,recall,f1-score,support
0,1,0.811497,0.762563,0.786269,796
1,0,0.778169,0.824627,0.800725,804
2,accuracy,,,0.79375,1600
3,macro average,0.794833,0.793595,0.793497,1600
4,weighted average,0.79475,0.79375,0.793461,1600


In [90]:
search_space={'n_estimators':np.array(range(100,150,10))}

In [91]:
GS=GridSearchCV(estimator=CustomRandomForest(),
                param_grid=search_space,
                scoring='accuracy',
                cv=5,
                n_jobs=6
                )

In [87]:
GS.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 130,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [86]:
new_custrf=GS.best_estimator_
y_pred=new_custrf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
summary_report(y_test,y_pred)

[[655 134]
 [141 670]]


Unnamed: 0,Unnamed: 1,precision,recall,f1-score,support
0,1,0.830165,0.822864,0.826498,796
1,0,0.826141,0.833333,0.829721,804
2,accuracy,,,0.828125,1600
3,macro average,0.828153,0.828099,0.82811,1600
4,weighted average,0.828143,0.828125,0.828102,1600


In [88]:
# pickle.dump(new_custrf,open('classifier_RandomForestCustom.pkl','wb'))

# Custom test input

In [128]:
review='The movie was uninteresting'

In [129]:
# review='This movie was not interesting. It could have been more entertaining. This was a total waste of time.'
# review='This movie was interesting. It was fun.'
print(f'The review was {"positive" if (text_classifier.predict(vectorizer.transform([review].__iter__()))) else "negative"}')

The review was negative
