# Import datasets

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Import test and train datasets

In [2]:
df_train = pd.read_csv('../data/processed/aclImdb/aclImdb_train.csv')
df_test = pd.read_csv('../data/processed/aclImdb/aclImdb_test.csv')

# Drop 1st unused column

In [3]:
df_train = df_train.drop(df_train.columns[0], axis=1)
df_test = df_test.drop(df_test.columns[0], axis=1)

# Shuffle train and test dataframes

In [4]:
df_train = df_train.sample(frac=1)
df_test = df_test.sample(frac=1)

# Split train and test dataframes into X_train, y_train, X_test, y_test

In [5]:
X_train = df_train['text']
y_train = df_train['sentiment']
X_test = df_test['text']
y_test = df_test['sentiment']

# Transform pd objects into np objects

In [6]:
X_train = X_train.values
y_train = y_train.values

# Vectorize texts

- [CountVectorizer](https://kavita-ganesan.com/how-to-use-countvectorizer/#.Yidh1hso8UE)
- [How to work with text data - Sklearn](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)

In [7]:
vec = CountVectorizer(stop_words='english')
X_train_trans = vec.fit_transform(X_train)
X_test_trans = vec.transform(X_test)

### TF-IDF technique for text classification

- [TF-IDF](https://medium.com/analytics-vidhya/tf-idf-term-frequency-technique-easiest-explanation-for-text-classification-in-nlp-with-code-8ca3912e58c3)

In [8]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_trans)
X_train_tf = tf_transformer.transform(X_train_trans)

In [9]:
X_test_tf = tf_transformer.transform(X_test_trans)

# Naive Bayes classifier

- [Naive Bayes - Sklearn](https://scikit-learn.org/stable/modules/naive_bayes.html)
- [MultinomialNB - Sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)
- [Sentiment Analysis with Naive Bayes](https://www.analyticsvidhya.com/blog/2021/07/performing-sentiment-analysis-with-naive-bayes-classifier/)

In [10]:
print(type(X_train_tf), type(X_test_tf), type(y_test), type(y_train))
print("\n\n", X_train_tf.shape, X_test_tf.shape, y_test.shape, y_train.shape)


<class 'scipy.sparse._csr.csr_matrix'> <class 'scipy.sparse._csr.csr_matrix'> <class 'pandas.core.series.Series'> <class 'numpy.ndarray'>


 (25000, 74538) (25000, 74538) (25000,) (25000,)


In [11]:
y_test_np = y_test.values

In [12]:
clf = MultinomialNB().fit(X_train_tf, y_train)

In [13]:
predicted = clf.predict(X_test_tf)
np.mean(predicted == y_test)

0.84308

In [14]:
print(type(predicted), type(y_test), type())
print(predicted.shape, y_test.shape)

<class 'numpy.ndarray'> <class 'pandas.core.series.Series'>
(25000,) (25000,)


# Predicted dataframe

In [15]:
print(predicted)
y_pred = pd.DataFrame(data=predicted, columns=['y_pred'])
print(y_pred)
print(y_test)
# y_test = pd.DataFrame(data=y_test, columns=['y_true'])
y_test.reset_index(drop=True, inplace=True)
print(y_test)


res = pd.concat([y_pred, y_test], axis=1)

[0 0 1 ... 0 0 0]
       y_pred
0           0
1           0
2           1
3           0
4           0
...       ...
24995       0
24996       0
24997       0
24998       0
24999       0

[25000 rows x 1 columns]
5962     1
4452     1
10880    1
3832     1
11330    1
        ..
24075    0
20741    0
20345    0
15548    0
24969    0
Name: sentiment, Length: 25000, dtype: int64
0        1
1        1
2        1
3        1
4        1
        ..
24995    0
24996    0
24997    0
24998    0
24999    0
Name: sentiment, Length: 25000, dtype: int64


In [21]:
y_test.head(20)
y_test.tail(20)

24980    1
24981    0
24982    1
24983    1
24984    0
24985    0
24986    0
24987    0
24988    1
24989    1
24990    1
24991    1
24992    1
24993    0
24994    0
24995    0
24996    0
24997    0
24998    0
24999    0
Name: sentiment, dtype: int64

In [22]:
res = pd.concat([y_test, y_pred], axis=1)
print(res.tail(25))

       sentiment  y_pred
24975          0       0
24976          1       1
24977          0       0
24978          0       0
24979          0       0
24980          1       1
24981          0       0
24982          1       1
24983          1       0
24984          0       0
24985          0       0
24986          0       0
24987          0       0
24988          1       0
24989          1       1
24990          1       1
24991          1       0
24992          1       1
24993          0       0
24994          0       0
24995          0       0
24996          0       0
24997          0       0
24998          0       0
24999          0       0


In [17]:
# res = pd.DataFrame(data=predicted, columns=['target'])

In [18]:
# res.to_csv('../data/processed/aclImdb/results/classifier_name.csv')