# Import datasets

In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Import dataset

In [37]:
df = pd.read_csv('../src/tests/dataset_allocine_100.csv')

In [38]:
X = df[['text']]
y = df[['Positive', 'Negative', 'Neutral']]
print(X, '\n\n\n', y)

                                                 text
0   Magnifique épopée, une belle histoire, touchan...
1   Je n'ai pas aimé mais pourtant je lui mets 2 é...
2   Un dessin animé qui brille par sa féerie et se...
3   Si c'est là le renouveau du cinéma français, c...
4   Et pourtant on s’en Doutait !Second volet très...
..                                                ...
95  Cette satire sur le fuhrer est drôle, un humou...
96  Ce documentaire m'a complètement retournée. Un...
97  Nul. Vraiment nul. Même si le synopsis peut êt...
98  Le point négatif c'est que ce film n'est pas u...
99  02 / 20 Mon dieux ! Un film vraiment hideux , ...

[100 rows x 1 columns] 


     Positive  Negative  Neutral
0          1         0        0
1          0         1        0
2          1         0        0
3          0         1        0
4          0         1        0
..       ...       ...      ...
95         0         1        0
96         1         0        0
97         0         1        0
98  

# Split into train and test

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

In [40]:
print(X_train.head(), X_test.head(), y_train.head(), y_test.head())
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

                                                 text
54  Je vais très rarement voir des films français....
19  (...) chaque petite lueur d’espoir engendrée p...
84  Alors que j'avais été déçu par monument's men,...
95  Cette satire sur le fuhrer est drôle, un humou...
36  Jan de Bont signe un film d'action survitaminé...                                                  text
67  ce film est pour moi a moitié un documentaire ...
1   Je n'ai pas aimé mais pourtant je lui mets 2 é...
62  Si le début suscite notre intérêt, la suite ne...
25  Ce film est superbe. J'ai hésité à le regarder...
71  Film inabouti qui ne vaut que pour son actrice...     Positive  Negative  Neutral
54         1         0        0
19         0         1        0
84         1         0        0
95         0         1        0
36         1         0        0     Positive  Negative  Neutral
67         1         0        0
1          0         1        0
62         0         1        0
25         1         0        0


# Transform pd objects into np objects

In [41]:
X_train = X_train.values
X_test = X_test.values

In [42]:
y_train_1d = y_train['Positive'] + (y_train['Negative'] * (-1))
y_test_1d = y_test['Positive'] + (y_test['Negative'] * (-1))
y_train_1d = y_train_1d.values
y_test_1d = y_test_1d.values

In [43]:
X_train = X_train.squeeze()
X_test = X_test.squeeze()

In [44]:
print(type(X_train), X_train.shape)
print(type(X_test), X_test.shape)
print(type(y_train_1d), y_train_1d.shape)
print(type(y_test_1d), y_test_1d.shape)
y_test_1d

<class 'numpy.ndarray'> (50,)
<class 'numpy.ndarray'> (50,)
<class 'numpy.ndarray'> (50,)
<class 'numpy.ndarray'> (50,)


array([ 1, -1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1, -1,
       -1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1,
       -1,  1,  1, -1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1])

# Vectorize texts

- [CountVectorizer](https://kavita-ganesan.com/how-to-use-countvectorizer/#.Yidh1hso8UE)
- [How to work with text data - Sklearn](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)

In [45]:
vec = CountVectorizer()
X_train_trans = vec.fit_transform(X_train)
X_test_trans = vec.transform(X_test)

### TF-IDF technique for text classification

- [TF-IDF](https://medium.com/analytics-vidhya/tf-idf-term-frequency-technique-easiest-explanation-for-text-classification-in-nlp-with-code-8ca3912e58c3)

In [46]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_trans)
X_train_tf = tf_transformer.transform(X_train_trans)

In [47]:
X_test_tf = tf_transformer.transform(X_test_trans)

# Naive Bayes classifier

- [Naive Bayes - Sklearn](https://scikit-learn.org/stable/modules/naive_bayes.html)
- [MultinomialNB - Sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)
- [Sentiment Analysis with Naive Bayes](https://www.analyticsvidhya.com/blog/2021/07/performing-sentiment-analysis-with-naive-bayes-classifier/)

In [48]:
print(type(X_train_tf), type(X_test_tf), type(y_test_1d), type(y_train_1d))
print("\n\n", X_train_tf.shape, X_test_tf.shape, y_test_1d.shape, y_train_1d.shape)


<class 'scipy.sparse.csr.csr_matrix'> <class 'scipy.sparse.csr.csr_matrix'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>


 (50, 1413) (50, 1413) (50,) (50,)


In [49]:
clf = MultinomialNB().fit(X_train_tf, y_train_1d)

In [50]:
predicted = clf.predict(X_test_tf)
np.mean(predicted == y_test_1d)

0.58

# Predicted dataframe

In [53]:
print(predicted)
# print(y_test)
# # y_test = pd.DataFrame(data=y_test, columns=['y_true'])
# y_test.reset_index(drop=True, inplace=True)
# print(y_test)


# res = pd.concat([y_pred, y_test], axis=1)

[ 1  1  1  1 -1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1 -1  1  1  1 -1  1 -1 -1  1 -1  1  1  1  1  1 -1  1 -1  1  1  1  1  1
  1  1]


In [21]:
y_test.head(20)
y_test.tail(20)

24980    1
24981    0
24982    1
24983    1
24984    0
24985    0
24986    0
24987    0
24988    1
24989    1
24990    1
24991    1
24992    1
24993    0
24994    0
24995    0
24996    0
24997    0
24998    0
24999    0
Name: sentiment, dtype: int64

In [22]:
res = pd.concat([y_test, y_pred], axis=1)
print(res.tail(25))

       sentiment  y_pred
24975          0       0
24976          1       1
24977          0       0
24978          0       0
24979          0       0
24980          1       1
24981          0       0
24982          1       1
24983          1       0
24984          0       0
24985          0       0
24986          0       0
24987          0       0
24988          1       0
24989          1       1
24990          1       1
24991          1       0
24992          1       1
24993          0       0
24994          0       0
24995          0       0
24996          0       0
24997          0       0
24998          0       0
24999          0       0


In [17]:
# res = pd.DataFrame(data=predicted, columns=['target'])

In [18]:
# res.to_csv('../data/processed/aclImdb/results/classifier_name.csv')