In [24]:
import os
for dirname, _, filenames in os.walk('/Users/donor/PycharmProjects/Reviews/tuto_data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/Users/donor/PycharmProjects/Reviews/tuto_data/test.csv
/Users/donor/PycharmProjects/Reviews/tuto_data/train.csv
/Users/donor/PycharmProjects/Reviews/tuto_data/sample_submission.csv


In [25]:
import numpy as np 
import pandas as pd

In [26]:
#READING INPUT
data = pd.read_csv("/Users/donor/PycharmProjects/Reviews/tuto_data/train.csv")
data.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


we map "EAP" to 0 "HPL" to 1 and "MWS" to 2 as it will be more convenient for our classifier. 
In other words we are just telling our computer that if classifier predicts 0 for the text then it means that it is preicting "EAP", if 1 then it means that it is predicting "HPL", if 2 then it means that it is predicting "MWS".

In [27]:
data['author_num'] = data["author"].map({'EAP':0, 'HPL':1, 'MWS':2})
data.head()

Unnamed: 0,id,text,author,author_num
0,id26305,"This process, however, afforded me no means of...",EAP,0
1,id17569,It never once occurred to me that the fumbling...,HPL,1
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,0
3,id27763,How lovely is spring As we looked from Windsor...,MWS,2
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,1


## Define X and y

In [28]:
X = data['text']
y = data['author_num']

## Split training and test data

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Vectorisation

#### Count Vectorizer: builds a dictionary of features and transforms documents to feature vectors.



In [30]:
from sklearn.feature_extraction.text import CountVectorizer

* example
* below: the word "life" has been found 2 times in sentence 0 and in sentence 1
* the word paul has been found 1 time in sentence 0 and 0 times in sentence 1
* and so on...

In [10]:
text=["My name is Paul my life is Jane! And we live our life together" , "My name is Guido my life is Victoria! And we live our life together"]
toy = CountVectorizer(stop_words = 'english')
toy.fit_transform(text)
matrix = toy.transform(text)
features = toy.get_feature_names()
df_res = pd.DataFrame(matrix.toarray(), columns=features)
df_res



Unnamed: 0,guido,jane,life,live,paul,victoria
0,0,1,2,1,1,0
1,1,0,2,1,0,1


In [31]:
vect = CountVectorizer(stop_words = 'english')

In [32]:
X_train_matrix = vect.fit_transform(X_train)
X_train_matrix

<13705x21557 sparse matrix of type '<class 'numpy.int64'>'
	with 154054 stored elements in Compressed Sparse Row format>

## Model 1 with count vectorizer

In [33]:
from sklearn.naive_bayes import MultinomialNB

clf=MultinomialNB()
clf.fit(X_train_matrix, y_train)
print(clf.score(X_train_matrix, y_train))

X_test_matrix = vect.transform(X_test) 
print (clf.score(X_test_matrix, y_test))

0.9173294418095586
0.8207354443309499


In [15]:
predicted_result=clf.predict(X_test_matrix)
from sklearn.metrics import classification_report
print(classification_report(y_test,predicted_result))

              precision    recall  f1-score   support

           0       0.83      0.79      0.81      2353
           1       0.84      0.82      0.83      1694
           2       0.79      0.85      0.82      1827

    accuracy                           0.82      5874
   macro avg       0.82      0.82      0.82      5874
weighted avg       0.82      0.82      0.82      5874



#### Tf-idf: 

* Since longer documents will have higher average count values than shorter documents, even though they might talk about the same topics, we can divide the number of occurrences of each word in a document by the total number of words in the document: **tf** for Term Frequencies.

* **idf** for “Term Frequency times Inverse Document Frequency” : Downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.

* CountVectorizer and TfidTransformer steps into one using [TfidVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html):

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words = 'english')

X_train_tfidf = vectorizer.fit_transform(X_train) 
X_train_tfidf.shape

(13705, 21564)

## Model 2 with TfidVectorizer

In [17]:
from sklearn.naive_bayes import MultinomialNB
clf2=MultinomialNB()
clf2.fit(X_train_tfidf, y_train)
print(clf2.score(X_train_tfidf, y_train))
X_test_tfidf = vectorizer.transform(X_test) 
print (clf2.score(X_test_tfidf, y_test))

0.9161619846771252
0.8094994892747702


* it doesn't perform better in term of accuracy

In [18]:
predicted_result_2=clf2.predict(X_test_tfidf)
from sklearn.metrics import classification_report
print(classification_report(y_test,predicted_result_2))

              precision    recall  f1-score   support

           0       0.75      0.88      0.81      2353
           1       0.90      0.72      0.80      1694
           2       0.84      0.80      0.82      1827

    accuracy                           0.81      5874
   macro avg       0.83      0.80      0.81      5874
weighted avg       0.82      0.81      0.81      5874



* there might be something to learn from the predictions on class 2

# Submission

In [20]:
sample = pd.read_csv("/Users/donor/PycharmProjects/Reviews/tuto_data/sample_submission.csv")
sample.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.403494,0.287808,0.308698
1,id24541,0.403494,0.287808,0.308698
2,id00134,0.403494,0.287808,0.308698
3,id27757,0.403494,0.287808,0.308698
4,id04081,0.403494,0.287808,0.308698


In [21]:
test = pd.read_csv("/Users/donor/PycharmProjects/Reviews/tuto_data/test.csv")
test_matrix = vect.transform(test["text"])
predicted_result = clf.predict_proba(test_matrix)

In [22]:
result=pd.DataFrame()
result["id"]=test["id"]
result["EAP"]=predicted_result[:,0]
result["HPL"]=predicted_result[:,1]
result["MWS"]=predicted_result[:,2]
result.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.004487,0.000462,0.9950503
1,id24541,0.999986,1.3e-05,1.200479e-06
2,id00134,0.181945,0.817808,0.0002470055
3,id27757,0.235618,0.764382,3.242965e-07
4,id04081,0.961043,0.031715,0.007242041


In [23]:
result.to_csv("submission_v1.csv", index=False)