# Import datasets

In [1]:
import pandas as pd
import numpy as np

# Read the pre-processed aclImdb train and test splits from their CSV files.
df_train, df_test = (
    pd.read_csv('../data/processed/aclImdb/aclImdb_train.csv'),
    pd.read_csv('../data/processed/aclImdb/aclImdb_test.csv'),
)

# Drop the unused first column

In [2]:
# Keep only the two columns used by the rest of the notebook. Selecting them
# by name (rather than dropping "whichever column is currently first") makes
# this cell safe to re-run: a second positional drop would remove `text`.
df_train = df_train[['text', 'sentiment']]
df_test = df_test[['text', 'sentiment']]

print(df_test, df_train)

                                                    text  sentiment
0      It's very easy to figure out why The New Profe...          1
1      I couldn't believe that the Adult Swim guys ca...          1
2      Sean Astin pulls off another amazing performan...          1
3      I used to watch this show when I was growing u...          1
4      Exclusively for Coop's lovers, though Clint Ea...          1
...                                                  ...        ...
24995  Cameron Diaz, James Marsden, Frank Langella: t...          0
24996  The story was disjointed, the acting was not o...          0
24997  I was only cautiously enthusiast when renting ...          0
24998  A made for television version of the Heart of ...          0
24999  I found this film completely and utterly incom...          0

[25000 rows x 2 columns]                                                     text  sentiment
0      Hard to categorize the film - perhaps it's an ...          1
1      After the Super

# Shuffle train and test dataframes

In [3]:
# Shuffle the rows of both splits (frac=1 returns every row, in random order).
# random_state pins the permutation so that Restart Kernel -> Run All yields
# the same row order on every run instead of a fresh shuffle each time.
df_train = df_train.sample(frac=1, random_state=42)
df_test = df_test.sample(frac=1, random_state=42)

# Sanity check: shuffling must not change the number of rows or columns.
print(df_test.shape, df_train.shape)

(25000, 2) (25000, 2)


# Split train and test dataframes into X_train, y_train, X_test, y_test

In [4]:
# Separate the review text (features) from the sentiment label (target)
# for each split.
X_train, y_train = df_train['text'], df_train['sentiment']
X_test, y_test = df_test['text'], df_test['sentiment']

# Both training objects should be one-dimensional and of equal length.
print(X_train.shape, y_train.shape)
print(type(X_train))

(25000,) (25000,)
<class 'pandas.core.series.Series'>


# Convert pandas objects to NumPy arrays

In [5]:
# Build the NumPy arrays directly from the dataframes rather than from the
# variables being overwritten: `X_train = X_train.values` fails on a second
# execution because an ndarray has no `.values` attribute.
X_train = df_train['text'].to_numpy()
y_train = df_train['sentiment'].to_numpy()
# Convert the test split as well so both splits have the same type and the
# final accuracy comparison is a plain array-vs-array comparison.
X_test = df_test['text'].to_numpy()
y_test = df_test['sentiment'].to_numpy()
print(y_train)

[0 0 0 ... 1 0 0]


# Add a dimension to X_train and X_test

In [6]:
# X_train = np.expand_dims(X_train, axis=1)
# X_test = np.expand_dims(X_test, axis=1)

# print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Vectorize texts

In [7]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words encoder; stop_words='english' selects scikit-learn's built-in
# English stop-word list.
vec = CountVectorizer(stop_words='english')
# Disabled toy example (French sample sentences) showing fit / transform on a
# tiny corpus:
# # ret = vec.fit(["salut, je suis cool", "je le suis aussi"])
# # print(ret.get_feature_names_out())
# # ret_trans = ret.transform(["Bonjour je suis cool, tres cool"])
# # print(ret_trans.toarray())
# The vocabulary is fitted on the training texts only; the test texts are then
# encoded with that same fitted vectorizer.
X_train_trans = vec.fit_transform(X_train)
X_test_trans = vec.transform(X_test)
# Prints the sparse matrix entries as "(row, column)  count" lines.
print(X_train_trans)

  (0, 15728)	1
  (0, 19956)	2
  (0, 16750)	1
  (0, 52456)	1
  (0, 63194)	2
  (0, 35657)	1
  (0, 33007)	1
  (0, 58107)	1
  (0, 35648)	5
  (0, 69483)	1
  (0, 50019)	1
  (0, 63603)	2
  (0, 8672)	16
  (0, 72317)	1
  (0, 38099)	1
  (0, 14461)	1
  (0, 9609)	4
  (0, 50387)	1
  (0, 61145)	2
  (0, 57674)	2
  (0, 53636)	2
  (0, 21716)	1
  (0, 65002)	1
  (0, 57511)	1
  (0, 21691)	1
  :	:
  (24999, 14598)	1
  (24999, 29588)	1
  (24999, 4523)	1
  (24999, 8488)	1
  (24999, 11724)	1
  (24999, 63855)	1
  (24999, 69476)	1
  (24999, 58647)	1
  (24999, 6694)	1
  (24999, 12102)	1
  (24999, 26462)	1
  (24999, 24833)	1
  (24999, 51497)	1
  (24999, 37396)	1
  (24999, 32844)	1
  (24999, 69917)	1
  (24999, 14005)	1
  (24999, 56498)	1
  (24999, 7226)	1
  (24999, 69490)	1
  (24999, 60695)	1
  (24999, 2122)	1
  (24999, 69228)	1
  (24999, 38720)	1
  (24999, 54251)	1


In [9]:
# Rescale the raw counts with TfidfTransformer. use_idf=False turns off the
# inverse-document-frequency reweighting, so only the transformer's
# normalisation of each document row is applied.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_trans)
X_train_tf = tf_transformer.transform(X_train_trans)

In [14]:
# Apply the transformer fitted on the training counts to the test counts, so
# both splits go through the same scaling.
X_test_tf = tf_transformer.transform(X_test_trans)

In [12]:
# Inspect the scaled training matrix; entries print as "(row, column)  value".
print(X_train_tf)

  (0, 1652)	0.04315318520021031
  (0, 1667)	0.08630637040042062
  (0, 3387)	0.08630637040042062
  (0, 3689)	0.04315318520021031
  (0, 4343)	0.04315318520021031
  (0, 5368)	0.04315318520021031
  (0, 6507)	0.04315318520021031
  (0, 6569)	0.04315318520021031
  (0, 6682)	0.04315318520021031
  (0, 6998)	0.04315318520021031
  (0, 8168)	0.12945955560063094
  (0, 8672)	0.690450963203365
  (0, 9444)	0.04315318520021031
  (0, 9609)	0.17261274080084124
  (0, 11557)	0.04315318520021031
  (0, 11577)	0.04315318520021031
  (0, 11748)	0.04315318520021031
  (0, 13439)	0.04315318520021031
  (0, 14461)	0.04315318520021031
  (0, 14678)	0.12945955560063094
  (0, 14710)	0.08630637040042062
  (0, 15278)	0.04315318520021031
  (0, 15728)	0.04315318520021031
  (0, 16478)	0.04315318520021031
  (0, 16496)	0.04315318520021031
  :	:
  (24999, 54573)	0.09901475429766744
  (24999, 55096)	0.09901475429766744
  (24999, 56067)	0.09901475429766744
  (24999, 56098)	0.09901475429766744
  (24999, 56108)	0.09901475429766744


In [13]:
# Multinomial naive Bayes classifier trained on the scaled term frequencies
# of the training reviews and their sentiment labels.
clf = MultinomialNB()
clf.fit(X_train_tf, y_train)

In [17]:
# Predict a sentiment label for every test review.
predicted = clf.predict(X_test_tf)
# Accuracy: the fraction of test reviews whose predicted label matches the
# true label (mean of the element-wise match mask).
is_correct = predicted == y_test
np.mean(is_correct)

0.84308

In [None]:
# from sklearn.pipeline import Pipeline
# text_clf = Pipeline([
# ('vect', CountVectorizer()),
# ('tfidf', TfidfTransformer()),
# ('clf', MultinomialNB()),
# ])

# text_clf.fit(X_train, y_train)

# predicted = text_clf.predict(X_test)

# np.mean(predicted == y_test)



# Model training

In [None]:
# import sklearn
# from sklearn.naive_bayes import GaussianNB

# gnb = GaussianNB()
# y_pred = gnb.fit(X_train, y_train)