In [None]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import label_binarize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import f1_score, ConfusionMatrixDisplay
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Load the competition's train and test splits from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/sentiment-prediction-on-movie-reviews/train.csv',
        '/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv',
    )
)

In [None]:
# Peek at the first ten raw training rows.
train.head(n=10)

In [None]:
# Show the dtype pandas inferred for each column of the training frame.
train.dtypes

In [None]:
# Check for class imbalance (the author notes it is mild): one bar per
# sentiment label showing how many rows carry it.
# groupby(...).size() yields a single row count per class; .count() would
# instead draw a separate, unlabelled bar for every remaining column's
# non-null count.
bar = train.groupby('sentiment').size().plot(kind='bar', title='Distribution', legend=False)

In [None]:
# Exact number of rows per sentiment label.
train.loc[:, 'sentiment'].value_counts()

In [None]:
# Count missing values per column (isnull is an alias of isna).
train.isnull().sum()

In [None]:
def cleaning(text):
    """Lower-case a review and strip noise that carries no sentiment signal.

    After lower-casing, every match of the following is deleted:

    * ``@mention`` handles (``@`` followed by letters/digits),
    * any character that is not a letter, digit, space or tab,
    * URLs of the form ``scheme://...``,
    * a leading ``rt`` at the very start of the text,
    * a leftover ``http`` together with the single character after it.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed tokens may
        leave consecutive spaces behind.
    """
    # Normalise case so tokens that differ only by capitalisation are merged.
    text = text.lower()
    # The mention pattern must use an unescaped character class: the previous
    # `@\[A-Za-z0-9]+` matched a literal "@[" and therefore never removed handles.
    text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    return text

In [None]:
# Keep only the review text and its label; these three columns are discarded.
# Reassigning (instead of inplace=True) keeps each step an explicit new frame.
train = train.drop(columns=['movieid', 'reviewerName', 'isFrequentReviewer'])
train = train.dropna()  # drop rows that have any missing value
# Raw string: '\d' inside a normal string literal is an invalid escape sequence.
train['reviewText'] = train['reviewText'].str.replace(r'\d+', '', regex=True)  # remove digits
# Lower-case and strip punctuation/URLs via cleaning(); str() guards non-string cells.
train['reviewText'] = train['reviewText'].apply(lambda review: cleaning(str(review)))

In [None]:
# Keep only the review text; these three columns are discarded.
test = test.drop(columns=['movieid', 'reviewerName', 'isTopCritic'])
# Rows cannot be dropped from the test set, so missing reviews are filled with
# the most frequent review text instead.
imp = SimpleImputer(strategy='most_frequent', copy=False)
# Impute the reviewText column explicitly and flatten the (n, 1) result;
# passing the whole frame only worked while `test` had exactly one column.
test['reviewText'] = imp.fit_transform(test[['reviewText']]).ravel()
# Raw string: '\d' inside a normal string literal is an invalid escape sequence.
test['reviewText'] = test['reviewText'].str.replace(r'\d+', '', regex=True)  # remove digits
# Same text normalisation as applied to the training reviews.
test['reviewText'] = test['reviewText'].apply(lambda review: cleaning(str(review)))

In [None]:
# Inspect the fill value(s) the most_frequent imputer learned when it was fitted on the test data.
imp.statistics_

In [None]:
# First ten test reviews after cleaning.
test.head(n=10)

In [None]:
# First ten training rows after cleaning.
train.head(n=10)

In [None]:
# Pie chart of the share of each sentiment label in the training data.
sentiment_counts = train['sentiment'].value_counts()

# Take the legend labels from the counts themselves: value_counts() orders by
# frequency, so a hard-coded label list can end up attached to the wrong wedge.
labels = sentiment_counts.index.tolist()
# Fixed colour per class so a label keeps its colour whatever the wedge order.
palette = {'POSITIVE': '#4E9CE4', 'NEGATIVE': '#F14C4C'}
colors = [palette[label] for label in labels]

plt.figure(figsize=(6, 6))
plt.pie(sentiment_counts, autopct='%0.2f%%', colors=colors)

plt.title('Distribution', size=12)
plt.legend(labels, ncol=2, loc=9)
plt.show()

In [None]:
# Split the label column off the feature frame, then encode it numerically.
sentiment_labels = train['sentiment']
train = train.drop(columns=['sentiment'])
# The order of `classes` fixes the encoding: NEGATIVE -> 0, POSITIVE -> 1.
target = label_binarize(sentiment_labels, classes=['NEGATIVE', 'POSITIVE'])

In [None]:
# Text -> dense features: TF-IDF over the 50,000 most frequent terms, row
# normalisation, then truncated SVD down to 5,000 components.
tfidfvec = TfidfVectorizer(max_features=50000)
# NOTE(review): TfidfVectorizer's default norm already L2-normalises each row,
# so this step is presumably a no-op; it is kept so the pipeline layout is unchanged.
norm = Normalizer(copy=False)
# The author reports TruncatedSVD runs fastest on large datasets (IncrementalPCA
# is an alternative). random_state pins its randomized solver so the extracted
# features are reproducible across runs.
tsvd = TruncatedSVD(n_components=5000, random_state=42)
preproc = Pipeline([('tfidf', tfidfvec), ('norm', norm), ('svd', tsvd)])

In [None]:
# Author's note: more components give better training scores, but memory
# fills up beyond 5000.
# Fit the pipeline on the training reviews only; `data` is evaluated before
# `columns`, so the SVD step is already fitted when its feature names are read.
train = pd.DataFrame(
    data=preproc.fit_transform(train['reviewText']),
    columns=preproc.named_steps['svd'].get_feature_names_out(),
)
# Project the test reviews with the already-fitted pipeline (no refit).
test = pd.DataFrame(
    data=preproc.transform(test['reviewText']),
    columns=preproc.named_steps['svd'].get_feature_names_out(),
)

In [None]:
# Fraction of variance explained by each SVD component. Author's observation:
# each component explains very little on its own, so more components help.
tsvd.explained_variance_ratio_

In [None]:
# First two SVD components plotted against each other, coloured by the binary target.
plot = plt.scatter(x=train.iloc[:, 0], y=train.iloc[:, 1], c=target.ravel())

In [None]:
# First ten rows of the SVD-transformed training features.
train.head(n=10)

In [None]:
# First ten rows of the SVD-transformed test features.
test.head(n=10)

In [None]:
# Summary statistics per SVD component; the author observes that the means
# and variances are very close to 0.
train.describe()

In [None]:
# Grid search over the inverse regularisation strength C for logistic
# regression, scored by micro-averaged F1 and refit on the best setting.
param_grid = {'C': [0.1, 1, 10]}
LogGCV = GridSearchCV(
    LogisticRegression(max_iter=10000),
    param_grid=param_grid,
    scoring='f1_micro',
    refit=True,
    pre_dispatch=3,
)
LogGCV.fit(train, target.ravel())

In [None]:
# Best C found by the grid search and its mean cross-validated score.
(LogGCV.best_params_, LogGCV.best_score_)

In [None]:
# Full per-candidate results recorded by the logistic-regression grid search.
LogGCV.cv_results_

In [None]:
# Logistic regression with C=10 and a slightly higher weight on class 0
# than on class 1.
LogMod = LogisticRegression(C=10, max_iter=10000, class_weight={0: 0.52, 1: 0.48})
LogMod.fit(train, target.ravel())
# Micro-averaged F1 on the same data the model was fitted on (a training
# score, not a held-out estimate).
f1_score(y_true=target.ravel(), y_pred=LogMod.predict(train), average='micro')

In [None]:
# Confusion matrix on the training data for the logistic regression model
# (the author's best estimator).
ConfusionMatrixDisplay.from_estimator(estimator=LogMod, X=train, y=target.ravel())

In [None]:
# 3-fold grid search over C for a linear SVM (primal formulation, L2 penalty).
param_grid = {'C': [0.1, 0.4, 0.6, 0.8]}
SvGCV = GridSearchCV(
    LinearSVC(dual=False, penalty='l2', random_state=42),
    param_grid=param_grid,
    scoring='f1_micro',
    refit=True,
    pre_dispatch=4,
    cv=3,
)
SvGCV.fit(train, target.ravel())
(SvGCV.best_score_, SvGCV.best_params_)

In [None]:
# Linear SVM with C=0.4, fitted on the full training set.
LinSVC = LinearSVC(C=0.4, dual=False, penalty='l2', random_state=42)
LinSVC.fit(train, target.ravel())
# Training-set micro-F1.
f1_score(y_true=target.ravel(), y_pred=LinSVC.predict(train), average='micro')

In [None]:
# Confusion matrix on the training data for the linear SVM.
ConfusionMatrixDisplay.from_estimator(estimator=LinSVC, X=train, y=target.ravel())

Comparison between a linear SVM and logistic regression, both trained with SGD

In [None]:
# SGD-trained linear model with hinge loss (linear-SVM objective), L2 penalty.
SGDC_Hinge = SGDClassifier(alpha=0.0001, penalty='l2', loss='hinge', random_state=42)
SGDC_Hinge.fit(train, target.ravel())
# Training-set micro-F1.
f1_score(y_true=target.ravel(), y_pred=SGDC_Hinge.predict(train), average='micro')

In [None]:
# SGD-trained linear model with log loss (logistic-regression objective), L2 penalty.
SGDC_Log = SGDClassifier(alpha=0.0001, penalty='l2', loss='log_loss', random_state=42)
SGDC_Log.fit(train, target.ravel())
# Training-set micro-F1.
f1_score(y_true=target.ravel(), y_pred=SGDC_Log.predict(train), average='micro')

In [None]:
# AdaBoost over 50 logistic-regression base learners, each configured like LogMod.
AdaBC = AdaBoostClassifier(
    estimator=LogisticRegression(C=10, max_iter=10000, class_weight={0: 0.52, 1: 0.48}),
    n_estimators=50,
    learning_rate=1.0,
    random_state=42,
)
AdaBC.fit(train, target.ravel())
# Training-set micro-F1.
f1_score(y_true=target.ravel(), y_pred=AdaBC.predict(train), average='micro')

In [None]:
# Confusion matrix on the training data for the AdaBoost ensemble.
ConfusionMatrixDisplay.from_estimator(estimator=AdaBC, X=train, y=target.ravel())

AdaBoost using a LogisticRegression estimator only predicted the positive class.

In [None]:
# Gradient-boosted trees: 20 boosting rounds, depth-10 trees, histogram tree method.
# NOTE(review): eval_metric=f1_score is passed but fit() gets no eval_set, so
# the metric is presumably never evaluated here — confirm before relying on it.
XGBC = XGBClassifier(
    n_estimators=20,
    max_depth=10,
    objective='binary:logistic',
    verbosity=0,
    eval_metric=f1_score,
    tree_method='hist',
)
XGBC.fit(train, target.ravel())
# Training-set micro-F1.
f1_score(y_true=target.ravel(), y_pred=XGBC.predict(train), average='micro')

XGBoost uses too much RAM, so it is impractical when the number of features exceeds 50,000.

In [None]:
# Small random forest: 25 trees, each limited to depth 4.
RFC = RandomForestClassifier(n_estimators=25, max_depth=4, random_state=42)
RFC.fit(train, target.ravel())
# Training-set micro-F1.
f1_score(y_true=target.ravel(), y_pred=RFC.predict(train), average='micro')

**Cross-validation using LogisticRegression (best training score yet)**

In [None]:
# 3-fold stratified cross-validation (no shuffling) of the chosen logistic
# regression configuration, scored by micro-averaged F1.
cv = StratifiedKFold(n_splits=3, shuffle=False)
cross_val = cross_validate(
    LogisticRegression(C=10, max_iter=10000, class_weight={0: 0.52, 1: 0.48}),
    train,
    target.ravel(),
    cv=cv,
    pre_dispatch=2,
    scoring='f1_micro',
)
# Per-fold test scores; the author reads these as showing neither
# overfitting nor underfitting.
cross_val['test_score']

In [None]:
# Same stratified folds, now for the linear SVM with C=0.4.
cross_val = cross_validate(
    LinearSVC(C=0.4, dual=False, penalty='l2', random_state=42),
    train,
    target.ravel(),
    cv=cv,
    pre_dispatch=2,
    scoring='f1_micro',
)
# Per-fold test scores.
cross_val['test_score']

In [None]:
# Same stratified folds, now for the hinge-loss SGD classifier.
cross_val = cross_validate(
    SGDClassifier(alpha=0.0001, penalty='l2', loss='hinge', random_state=42),
    train,
    target.ravel(),
    cv=cv,
    pre_dispatch=2,
    scoring='f1_micro',
)
# Per-fold test scores.
cross_val['test_score']

In [None]:
# Predict 0/1 sentiment labels for the test features with the logistic regression model.
test_pred = LogMod.predict(X=test)

In [None]:
# Submission frame: one row per test prediction, ids are 0-based row positions.
submission = pd.DataFrame(columns=['id', 'sentiment'])
# Size the id column from the predictions rather than a hard-coded row count,
# so the later `sentiment` assignment cannot fail on a length mismatch.
submission['id'] = np.arange(len(test_pred))

In [None]:
# Attach the model's 0/1 predictions; they are mapped to label strings in a later cell.
submission['sentiment'] = test_pred

In [None]:
# Translate the numeric predictions back to the competition's label strings
# (1 -> POSITIVE, 0 -> NEGATIVE), then preview the result.
submission['sentiment'] = submission['sentiment'].replace({1: 'POSITIVE', 0: 'NEGATIVE'})
submission.head(n=10)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)