# Import libraries

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Import test and train datasets

In [None]:
# Read the train and test sets from their processed CSV files (train first).
df_train, df_test = map(
    pd.read_csv,
    (
        '../data/processed/aclImdb/aclImdb_train.csv',
        '../data/processed/aclImdb/aclImdb_test.csv',
    ),
)

# Drop 1st unused column

In [None]:
# Remove the first column of each frame by its label; it is not used below.
df_train = df_train.drop(columns=df_train.columns[0])
df_test = df_test.drop(columns=df_test.columns[0])

# Shuffle train and test dataframes

In [None]:
# Shuffle all rows (frac=1 keeps every row). A fixed seed makes the row order,
# and therefore the order of the predictions saved at the end of the notebook,
# identical from one run to the next.
df_train = df_train.sample(frac=1, random_state=42)
df_test = df_test.sample(frac=1, random_state=42)

# Split train and test dataframes into X_train, y_train, X_test, y_test

In [None]:
# Inputs come from the 'text' column, targets from the 'sentiment' column.
X_train, y_train = df_train['text'], df_train['sentiment']
X_test, y_test = df_test['text'], df_test['sentiment']

# Transform the training pd objects into np objects

In [None]:
# Unwrap the training Series into their underlying arrays. X_test and y_test
# are not converted here and stay pandas Series.
X_train, y_train = X_train.values, y_train.values

# Vectorize texts

- [CountVectorizer](https://kavita-ganesan.com/how-to-use-countvectorizer/#.Yidh1hso8UE)
- [How to work with text data - Sklearn](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)

In [None]:
# Token-count features with English stop words removed. The vocabulary is
# learned from the training texts only and then reused for the test texts.
vec = CountVectorizer(stop_words='english')
X_train_trans, X_test_trans = vec.fit_transform(X_train), vec.transform(X_test)

### TF-IDF technique for text classification (term frequencies only, `use_idf=False`)

- [TF-IDF](https://medium.com/analytics-vidhya/tf-idf-term-frequency-technique-easiest-explanation-for-text-classification-in-nlp-with-code-8ca3912e58c3)

In [None]:
# NOTE(review): use_idf=False switches off inverse-document-frequency
# weighting, so this step only applies the transformer's term-frequency
# normalisation to the raw counts. Confirm that plain TF is intended.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_trans)
X_train_tf = tf_transformer.transform(X_train_trans)

In [None]:
# Apply the transformer fitted on the training counts to the test counts.
X_test_tf = tf_transformer.transform(X_test_trans)

# Naive Bayes classifier

- [Naive Bayes - Sklearn](https://scikit-learn.org/stable/modules/naive_bayes.html)
- [MultinomialNB - Sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)
- [Sentiment Analysis with Naive Bayes](https://www.analyticsvidhya.com/blog/2021/07/performing-sentiment-analysis-with-naive-bayes-classifier/)

In [None]:
# Multinomial Naive Bayes with default settings, trained on the transformed
# training features and their sentiment labels.
clf = MultinomialNB()
clf.fit(X_train_tf, y_train)

In [None]:
# Predict a label for every test document.
predicted = clf.predict(X_test_tf)
# Accuracy: the fraction of test rows whose predicted label equals the true
# one. Left as a bare expression so the notebook displays the value.
np.mean(y_test == predicted)

# Predicted dataframe

In [None]:
# Label each prediction with the index of the test row it was made for.
# df_test was shuffled before prediction, so a default 0..n-1 index would make
# it impossible to map the saved predictions back to the original test rows.
res = pd.DataFrame(data=predicted, columns=['target'], index=df_test.index)

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the results folder exists before writing the predictions.
out_path = Path('../data/processed/aclImdb/results/classifier_name.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
res.to_csv(out_path)