# Team Work 3 : ...

**Authors:** CHRETIEN Jérémy, DAVIDSON Colin, LAFAGE Adrien, REMBUSCH Gabrielle and WILBRINK Aurore.

In [1]:
import nltk
from nltk.corpus import stopwords as stpw
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import numpy as np
import pandas as pd
import re
import string 

import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/eisti/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/eisti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/eisti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eisti/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Get the dataset



In [2]:
train_set = pd.read_csv("train_dataset.csv")
test_set = pd.read_csv("test_dataset.csv")

## Preprocessing n°1

We used lemmatization to reduce words to their base form.


In [3]:
def preprocess_v1(content) : 

    # remove upper letters
    content = content.lower()
    
    # remove punctuation 
    content = content.translate(str.maketrans("","", string.punctuation))
    
    # remove isolated letters
    content = re.sub(r'<.*?>', '', content)
    content = re.sub(r'\d+ *|\b[a-z]\b *', "", content) 
    content = content.strip()
    
    tokens = word_tokenize(content)

    # lemmatization 
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # clean stop words 
    stopwords = set(stpw.words("english"))
 
    # removes stopwords and duplicates
    content = " ".join(
        list(dict.fromkeys([t for t in tokens if not t in stopwords]))
    )  
    
    return content 




## Preprocessing n°2


For this preprocessing, we used tags to remove proper nouns and verbs but removed the lemmatization. 

The first step in our preprocessing is to remove proper nouns and verbs to be sure that our models won't be biaised by actors'names, countries, cities or some verbs... 

- NNP: proper noun, singular (Harrison)
- NNPS: proper noun, plural (Americans)


- VB: verb base form (take)
- VBD: verb past tense (took)
- VBG: verb gerund/present participle (taking)
- VBN: verb past participle (taken)
- VBP: verb sing. present, non-3d (take)
- VBZ: verb 3rd person sing. present (takes)



In [4]:
def preprocess_v2(content) : 
    # remove proper nouns 
    content = nltk.tag.pos_tag(content.split())
    content = [word for word,tag in content if tag != 'NNP' and tag != 'NNPS' and tag != 'VB' and tag != 'VBD' and tag != 'VBG' and tag != 'VBN' and tag != 'VBP' and tag != 'VBZ']
    content = " ".join(content)
    
    # remove upper letters
    content = content.lower()
    
    # remove punctuation 
    content = content.translate(str.maketrans("","", string.punctuation))
    
    # remove isolated letters
    content = re.sub(r'<.*?>', '', content)
    content = re.sub(r'\d+ *|\b[a-z]\b *', "", content) 
    content = content.strip()
    
    tokens = word_tokenize(content)

    # clean stop words 
    stopwords = set(stpw.words("english")) 
    stopwords = stopwords.union(['movie'])
 
    # removes stopwords and duplicates
    content = " ".join(
        list(dict.fromkeys([t for t in tokens if not t in stopwords]))
    )  
    
    return content 


## Vectorizes data, apply preprocessing methods and trains classifiers

### Statistics functions

In [5]:
def eval(test_label, predicted):
    """
    Computes confusion matrix and outputs the classifier's statistic
    Parameters
    ----------
    test_label : array-like of shape (n_samples,)

    predicted : array-like of shape (n_samples,)
    """
    tn, fp, fn, tp = confusion_matrix(test_label, predicted).ravel() # Binary case
    print(f"tn : {tn}, fp : {fp}, fn : {fn}, tp : {tp}")
    log_stats(tn, fp, fn, tp)

def log_stats(tn, fp, fn, tp):
    """
    Computes accuracy, precision, recall, f1 score
    Parameters
    ----------
    tn : int

    fp : int

    fn : int

    tp : int
    """
    # Accuracy

    acc = (tn+tp)/(tn+fp+fn+tp)
    pre = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1_score = 2*(recall * pre) / (recall + pre)
    
    # Log

    print(f"Accuracy : {acc}")
    print(f"Precision : {pre}")
    print(f"Recall : {recall}")
    print(f"F1 Score : {f1_score}")


### Preprocessing n°1

In [6]:
start = time.time()

# use preprocessor attribute to preprocess the data, and max_features to set a boundary on the maximum number of words used to create the bag of words.
matrix = CountVectorizer(max_features=1000, preprocessor=preprocess_v1)
# learns the vocabulary dictionnary and returns document-term matrix as a Numpy Array.
X_train = matrix.fit_transform(train_set.features).toarray()
# transforms data to data-term matrix as a Numpy Array.
X_test = matrix.transform(test_set.features).toarray()
# gets train set labels.
y_train = train_set.label.to_numpy()
# gets test set labels.
y_test = test_set.label.to_numpy()

firstCVTime = time.time()
print(f"First Count Vectorizer : {firstCVTime - start}s")

First Count Vectorizer : 94.08407068252563s


### Classifiers' trainings

In [7]:
def classifier_training(estimator, X_train, y_train):
    cv_result = cross_validate(estimator, X_train, y_train, cv=5, return_estimator=True)
    return cv_result['estimator'][np.argmax(cv_result['test_score'])]

#### Naives Bayes Classifier

In [8]:
gnb = GaussianNB()
clf = classifier_training(gnb, X_train, y_train)
predicted = clf.predict(X_test)
firstNBCTime = time.time()
print(f"First Naives Bayes Clf : {firstNBCTime - firstCVTime}s")
eval(y_test, predicted)

First Naives Bayes Clf : 2.769439697265625s
tn : 10216, fp : 2284, fn : 2351, tp : 10149
Accuracy : 0.8146
Precision : 0.8162953430386873
Recall : 0.81192
F1 Score : 0.8141017928047166


#### Random Forest Classifier

In [9]:
rfc = RandomForestClassifier()
clf = classifier_training(rfc, X_train, y_train)
predicted = clf.predict(X_test)
firstRFCTime = time.time()
print(f"First Random Forest Clf : {firstRFCTime - firstNBCTime}s")
eval(y_test, predicted)

First Random Forest Clf : 59.742379665374756s
tn : 10319, fp : 2181, fn : 2256, tp : 10244
Accuracy : 0.82252
Precision : 0.8244668008048289
Recall : 0.81952
F1 Score : 0.8219859578736207


#### Support Vector Classifier

In [10]:
svc = LinearSVC()
feature_map_nystroem = Nystroem(
    random_state=1,
    n_components=1500
)
transformed_X_train = feature_map_nystroem.fit_transform(X_train)
transformed_X_test = feature_map_nystroem.transform(X_test)
clf = classifier_training(svc, transformed_X_train, y_train)
predicted = clf.predict(transformed_X_test)
firstSVCTime = time.time()
print(f"First Support Vector Clf : {firstSVCTime - firstRFCTime}s")
eval(y_test, predicted)

First Support Vector Clf : 31.922729969024658s
tn : 10462, fp : 2038, fn : 1676, tp : 10824
Accuracy : 0.85144
Precision : 0.8415487482506608
Recall : 0.86592
F1 Score : 0.8535604447598769


#### K-Nearest Neighbors Classifier

In [11]:
# knc = KNeighborsClassifier()
# clf = classifier_training(knc, X_train, y_train)
# predicted = clf.predict(X_test)
# eval(y_test, predicted)

### Preprocessing n°2

In [12]:
# use preprocessor attribute to preprocess the data, and max_features to set a boundary on the maximum number of words used to create the bag of words.
matrix = CountVectorizer(max_features=1000, preprocessor=preprocess_v2)
# learns the vocabulary dictionnary and returns document-term matrix as a Numpy Array.
X_train = matrix.fit_transform(train_set.features).toarray()
# transforms data to data-term matrix as a Numpy Array.
X_test = matrix.transform(test_set.features).toarray()
# gets train set labels.
y_train = train_set.label.to_numpy()
# gets test set labels.
y_test = test_set.label.to_numpy()

secondCVTime = time.time()
print(f"Second Count Vectorizer : {secondCVTime - firstSVCTime}s")

Second Count Vectorizer : 446.4436094760895s


### Classifiers' trainings

#### Naives Bayes Classifier

In [13]:
gnb = GaussianNB()
clf = classifier_training(gnb, X_train, y_train)
predicted = clf.predict(X_test)
secondNBCTime = time.time()
print(f"Second Naives Bayes Clf : {secondNBCTime - secondCVTime}s")
eval(y_test, predicted)

Second Naives Bayes Clf : 2.664599657058716s
tn : 10234, fp : 2266, fn : 2728, tp : 9772
Accuracy : 0.80024
Precision : 0.8117627512875893
Recall : 0.78176
F1 Score : 0.7964789306381939


#### Random Forest Classifier

In [14]:
rfc = RandomForestClassifier()
clf = classifier_training(rfc, X_train, y_train)
predicted = clf.predict(X_test)
secondRFTime = time.time()
print(f"Second Random Forest Clf : {secondRFTime - secondNBCTime}s")
eval(y_test, predicted)

Second Random Forest Clf : 68.84842467308044s
tn : 10203, fp : 2297, fn : 2374, tp : 10126
Accuracy : 0.81316
Precision : 0.8151010222973517
Recall : 0.81008
F1 Score : 0.8125827548850459


#### Support Vector Classifier

In [15]:
svc = LinearSVC()
feature_map_nystroem = Nystroem(
    random_state=1,
    n_components=1500
)
transformed_X_train = feature_map_nystroem.fit_transform(X_train)
transformed_X_test = feature_map_nystroem.transform(X_test)
clf = classifier_training(svc, transformed_X_train, y_train)
predicted = clf.predict(transformed_X_test)
secondSVCTime = time.time()
print(f"Second Support Vector Clf : {secondSVCTime - secondRFTime}s")
eval(y_test, predicted)

Second Support Vector Clf : 34.499735593795776s
tn : 10295, fp : 2205, fn : 1747, tp : 10753
Accuracy : 0.84192
Precision : 0.829834851057262
Recall : 0.86024
F1 Score : 0.8447639248959069


#### K-Nearest Neighbors Classifier

In [16]:
# knc = KNeighborsClassifier()
# clf = classifier_training(knc, X_train, y_train)
# predicted = clf.predict(X_test)
# eval(y_test, predicted)