In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

import spacy

# better display of review text in dataframes
pd.set_option('display.max_colwidth', None) 

# Seaborn options
sns.set(style="whitegrid", font_scale=1.4)

%load_ext autoreload
%autoreload 2

In [None]:
# !python3 -m spacy download fr_core_news_sm
# Load the small French spaCy pipeline. The result is not bound to a name
# here, so this cell only verifies that the model is installed.
spacy.load("fr_core_news_sm")

In [None]:
# French spaCy pipeline; `preprocess` uses it to tokenise and lemmatise text.
nlp = spacy.load("fr_core_news_sm")
from nltk.corpus import stopwords
from nltk import download
# Fetch the NLTK stop-word corpus, then keep the French list as a set so that
# membership tests in `preprocess` are cheap.
download("stopwords")
stopWords = set(stopwords.words("french"))
print(stopWords)

# Load data

## Dataset

In [None]:
# import pickle

# PICKLE_PATH = "allocine_dataset/data/allocine_dataset.pickle"

# with open(PICKLE_PATH, 'rb') as reader:
#     data = pickle.load(reader)

# X_train, y_train = np.array(data["train_set"]['review']), np.array(data["train_set"]['polarity'])
# X_val, y_val = np.array(data["val_set"]['review']), np.array(data["val_set"]['polarity'])
# X_test, y_test = np.array(data["test_set"]['review']), np.array(data["test_set"]['polarity'])
# class_names = data['class_names']

# print("LEN TRAIN: "+ str(len(X_train)))
# print("LEN VAL: "+ str(len(X_val)))
# print("LEN TEST: "+ str(len(X_test)))

In [None]:
import string

def preprocess(sentence):
    """Normalise a French text: drop stop words and punctuation, then lemmatise.

    The text is tokenised with the module-level spaCy pipeline ``nlp``. Tokens
    whose lower-cased text is in ``stopWords``, or that consist solely of ASCII
    punctuation characters, are discarded. Every remaining token is replaced
    by its lower-cased lemma.

    Parameters
    ----------
    sentence : str
        Raw text. Any non-string value (for instance the float NaN that pandas
        uses for an empty CSV field) is treated as an empty text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces; ``""`` when nothing is kept.
    """
    # Missing CSV fields arrive as float NaN, which cannot be tokenised.
    if not isinstance(sentence, str):
        return ""

    lemmas = []
    for token in nlp(sentence):
        text = token.text
        if text.lower() in stopWords:
            continue
        # `text in string.punctuation` is a substring test against one long
        # string, so multi-character marks such as "..." or "!!" were kept.
        # Test character by character instead.
        if all(char in string.punctuation for char in text):
            continue
        lemmas.append(token.lemma_.lower())
    return " ".join(lemmas)

In [None]:
from sklearn.model_selection import train_test_split

# One seed for the split, the sampler and the final shuffle, so that a
# Restart & Run All reproduces the same data.
SEED = 12

# Training data: keep only the text columns and the target.
train_data_complete = pd.read_csv("../data/allocine_genres_train.csv", sep=",")
train_data = train_data_complete[["titre", "synopsis", "genre"]]

X = train_data.drop('genre', axis=1)
y = train_data['genre']

# Clean the text once, before any splitting or resampling. `assign` builds a
# new frame instead of writing into one derived from a slice.
X = X.assign(
    titre=X["titre"].apply(preprocess),
    synopsis=X["synopsis"].apply(preprocess),
)

# Split BEFORE resampling. Oversampling first duplicates rows, and the split
# then puts copies of the same synopsis in both training and validation data,
# which inflates every validation score.
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,  # Random seed for shuffle
    shuffle=True,
)

# Balance the genres of the training portion only.
ros = RandomOverSampler(random_state=SEED)
#ros = RandomUnderSampler(random_state=SEED)
X_resampled, y_resampled = ros.fit_resample(X_train_raw, y_train_raw)

# Normalise the sampler output to pandas objects with a fresh 0..n-1 index.
X_resampled = pd.DataFrame(X_resampled, columns=X.columns).reset_index(drop=True)
y_resampled = pd.Series(np.ravel(y_resampled), name="genre")

# Shuffle the resampled rows so that prefixes such as `X_train[:size]` are not
# biased by the order in which the sampler emitted them.
order = np.random.RandomState(SEED).permutation(len(X_resampled))

# Only the synopsis is fed to the model. Targets are kept as 1-D Series (what
# scikit-learn expects), and every index is reset so positional and
# label-based access agree.
X_train = X_resampled["synopsis"].iloc[order].reset_index(drop=True)
y_train = y_resampled.iloc[order].reset_index(drop=True)
X_val = X_val_raw["synopsis"].reset_index(drop=True)
y_val = y_val_raw.reset_index(drop=True)

In [None]:
# Sanity check: sizes of the training inputs and of the training targets.
print(X_train.shape)
print(y_train.shape)

In [None]:
# Map every genre label to its display name (the label itself). A dict
# supports both `class_names[label]` lookups and `class_names.values()`; the
# per-row Series of genres supported neither (`Series.values` is an attribute,
# not a method). Keys are sorted to match scikit-learn's sorted label order.
class_names = {genre: genre for genre in sorted(train_data['genre'].dropna().unique())}

# Model selection

## First model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF bag of words fed to a logistic regression, all defaults.
tfidf_clf = Pipeline([
    ('tfidf', TfidfVectorizer()), # Default parameters
    ('clf', LogisticRegression(n_jobs=-1, verbose=1)),
])

# scikit-learn expects a 1-D target. `np.ravel` flattens a one-column
# DataFrame (and leaves a Series or 1-D array unchanged), which avoids the
# column-vector DataConversionWarning.
tfidf_clf.fit(X_train, np.ravel(y_train))

## Prediction

In [None]:
some_index = 0
# `X_val[0]` is a label-based lookup on a Series whose index may have been
# shuffled by the split; use positional access so the first validation row is
# always found.
some_review = X_val.iloc[some_index]
print(some_review)
print()
# Targets are the genre strings themselves, so no lookup table is needed.
# `np.ravel` makes this work for a Series as well as a one-column DataFrame.
print("True genre:", np.ravel(y_val)[some_index])

In [None]:
some_prediction = tfidf_clf.predict([some_review, ])
# The pipeline predicts genre labels directly; print the label as is instead
# of indexing `class_names` with it.
print("Predicted genre:", some_prediction[0])

In [None]:
from sklearn import metrics

# Accuracy on the data the model was fitted on.
y_pred = tfidf_clf.predict(X_train)
print("Training Accuracy:", metrics.accuracy_score(y_train, y_pred))

# Accuracy and per-genre report on the held-out validation split.
y_pred = tfidf_clf.predict(X_val)
print("Validation Accuracy:", metrics.accuracy_score(y_val, y_pred))
print()
# The labels are already the genre strings, so scikit-learn can name the
# report rows itself. Passing no `target_names` also avoids a length mismatch
# when a genre is absent from the split.
print(metrics.classification_report(y_val, y_pred))

## Grid Search

In [None]:
from sklearn.model_selection import PredefinedSplit

# Grid-search on validation set.
# Stack the training and validation data into one dataset, then mark which
# rows belong to the single validation fold: -1 for the training rows, 0 for
# the validation rows.
X = np.concatenate((X_train, X_val), axis=0)
y = np.concatenate((y_train, y_val), axis=0)
validation_indexes = [-1]*len(X_train) + [0]*len(X_val)
ps = PredefinedSplit(test_fold=validation_indexes)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameters explored for the vectoriser (`tfidf__*`) and for the
# classifier (`clf__*`).
param_grid = {
     'tfidf__lowercase': (True, False),
     'tfidf__ngram_range': [(1, 1), (1, 2)],
     # Floats are document-frequency proportions. The last value must be the
     # float 1.0 ("never drop a term"): the integer 1 is read as an absolute
     # count and discards every term that occurs in more than one document.
     'tfidf__max_df': [0.60, 0.65, 0.70, 0.75, 0.85, 1.0],
     'clf__C': np.logspace(-4, 4, 10),
}

tfidf_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(penalty='l2', n_jobs=-1, verbose=1)),
])

# `cv=ps` scores every candidate on the predefined validation fold only.
grid_search = GridSearchCV(
    tfidf_clf, param_grid, cv=ps,
    scoring='accuracy', return_train_score=True,
    n_jobs=-1, verbose=1
)

grid_search.fit(X, y)

In [None]:
# Best hyper-parameter combination found and its cross-validated score.
print(grid_search.best_params_) # pprint ?
print(grid_search.best_score_)

# Keep the estimator that the grid search selected.
best_clf = grid_search.best_estimator_

Thanks to grid search, validation accuracy is now about 2 percentage points higher than before!

# Training best model

In [None]:
# Rebuild the pipeline with fixed hyper-parameters and rebind `best_clf` to
# it, replacing the estimator taken from the grid search.
# NOTE(review): C=1300 is not one of the np.logspace(-4, 4, 10) grid values;
# presumably rounded from the closest one (about 1291.5) — confirm.
best_clf = Pipeline([
    ('tfidf', TfidfVectorizer(
        lowercase=True, ngram_range=(1, 2),
        max_df=0.75
    )),
    ('clf', LogisticRegression(
        C=1300, penalty='l2', 
        n_jobs=-1, verbose=1
    )),
])

# Fit on the training split only, so the validation split stays unseen.
best_clf.fit(X_train, y_train)

## Results

In [None]:
from sklearn import metrics

y_pred = best_clf.predict(X_val)

print("Val Accuracy: {:.2f}".format(100 * metrics.accuracy_score(y_val, y_pred)))
# Genre prediction is multi-class: the default `average='binary'` raises a
# ValueError, so report the unweighted mean of the per-genre F1 scores.
print("Val F1-Score: {:.2f}".format(100 * metrics.f1_score(y_val, y_pred, average="macro")))
print()

# The labels are already the genre strings, so scikit-learn names the report
# rows itself; no `target_names` needed.
report = metrics.classification_report(y_val, y_pred)
print(report)

## Learning curves

In [None]:
from sklearn.model_selection import learning_curve

# Create CV training and test scores for various training set sizes.
# `ps` is the predefined train/validation split, so every size is scored on
# the same validation rows.
train_sizes, train_scores, test_scores = learning_curve(
    best_clf, X, y, cv=ps, 
    scoring='accuracy',n_jobs=-1,verbose=1,
    # 50 different sizes of the training set
    train_sizes=np.linspace(0.01, 1.0, 50)
)

In [None]:
#import matplotlib.pyplot as plt
from sklearn.datasets import load_digits

def plot_learning_curves(train_sizes, train_scores, test_scores, figsize=(10,7), fontsize=14, show_std=False):
    """Plot mean training and validation scores against training-set size.

    Parameters
    ----------
    train_sizes : array-like
        X coordinates: one training-set size per row of the score arrays.
    train_scores, test_scores : array-like, 2-D
        Scores with one row per training-set size; they are averaged over
        axis 1 before plotting.
    figsize : tuple, default (10, 7)
        Figure size handed to matplotlib.
    fontsize : int, default 14
        NOTE(review): accepted but not applied to the figure. Kept so that
        existing calls keep working; text sizes come from the global
        matplotlib/seaborn settings.
    show_std : bool, default False
        When True, draw a grey band of one standard deviation (over axis 1)
        around each curve.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the plot, so the caller can finalise and save it.
    """
    # Average the scores obtained for each training-set size.
    train_mean = np.mean(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)

    fig, ax = plt.subplots(figsize=figsize)

    # Draw lines
    ax.plot(train_sizes, train_mean, '--', color="r", label="Training score")
    ax.plot(train_sizes, test_mean, color="g", label="Validation score")

    # Draw bands (opt-in; the spread is only computed when it is displayed)
    if show_std:
        for scores, mean in ((train_scores, train_mean), (test_scores, test_mean)):
            std = np.std(scores, axis=1)
            ax.fill_between(train_sizes, mean - std, mean + std, color="#DDDDDD")

    # Titles, axis labels and legend
    ax.set_title("Learning Curve")
    ax.set_xlabel("Training Set Size")
    ax.set_ylabel("Accuracy Score")
    ax.legend(loc="lower right")

    return fig

In [None]:
# Draw the learning curves from the scores computed by `learning_curve`.
fig = plot_learning_curves(train_sizes, train_scores, test_scores)

# Finalize the plot
sns.despine(bottom=True)
plt.tight_layout(h_pad=2)

# Saving plot
# NOTE(review): assumes the `img/tf-idf/` directory already exists.
fig.savefig('img/tf-idf/learning_curves.png', dpi=200)

The training score clearly stays close to its maximum, while the validation score could still be increased with more training samples.

## Error analysis

### Confusion matrix

In [None]:
from utils import print_confusion_matrix
from sklearn.metrics import confusion_matrix

# Fix the label order explicitly so the matrix rows/columns and the tick
# labels are guaranteed to agree. `np.ravel` accepts a Series as well as a
# one-column DataFrame.
y_val_flat = np.ravel(y_val)
genre_labels = np.unique(np.concatenate((y_val_flat, np.ravel(y_pred))))

conf_mx = confusion_matrix(y_val_flat, y_pred, labels=genre_labels)

fig = print_confusion_matrix(
    conf_mx,
    list(genre_labels),
    figsize=(7,5)
)

# Finalize the plot
sns.despine(bottom=True)
plt.tight_layout(h_pad=2)

# Saving plot
fig.savefig('img/tf-idf/val_confusion_mx.png', dpi=200)

### False Positive / Negative

In [None]:
# Select validation texts by comparing true and predicted labels.
# NOTE(review): these masks compare against the integer labels 0 and 1, but
# the targets in this notebook are read from the `genre` column — confirm the
# label encoding, otherwise these selections cannot match anything.
false_pos = X_val[(y_val == 0) & (y_pred == 1)]
false_neg = X_val[(y_val == 1) & (y_pred == 0)]

In [None]:
# Show the first five entries of `false_pos` as a table.
pd.DataFrame(false_pos[:5])

In [None]:
# Show the first five entries of `false_neg` as a table.
pd.DataFrame(false_neg[:5])

## Save

In [None]:
import pickle

# Persist the fitted pipeline so that later sections can reload it.
# NOTE(review): assumes the `data/tf-idf/` directory already exists.
with open('data/tf-idf/best_clf.pickle', 'wb') as f:
    pickle.dump(best_clf, f)

# Testing best model

In [None]:
import pickle

# Reload the pipeline written by the "Save" section. Only unpickle files you
# produced yourself: loading a pickle can execute arbitrary code.
with open('data/tf-idf/best_clf.pickle', 'rb') as f:
    best_clf = pickle.load(f)

In [None]:
from sklearn import metrics

# NOTE(review): `X_test` / `y_test` are only assigned in the commented-out
# loading cell near the top — confirm that a test set is defined before
# running this section.
y_pred = best_clf.predict(X_test)

print("Test Accuracy: {:.2f}".format(100 * metrics.accuracy_score(y_test, y_pred)))
# Genre prediction is multi-class: the default `average='binary'` raises a
# ValueError, so report the unweighted mean of the per-genre F1 scores.
print("Test F1-Score: {:.2f}".format(100 * metrics.f1_score(y_test, y_pred, average="macro")))
print()

# The labels are already the genre strings, so scikit-learn names the report
# rows itself; no `target_names` needed.
report = metrics.classification_report(y_test, y_pred)
print(report)

In [None]:
from utils import print_confusion_matrix
from sklearn.metrics import confusion_matrix

# Fix the label order explicitly so the matrix rows/columns and the tick
# labels are guaranteed to agree. `np.ravel` accepts a Series as well as a
# one-column DataFrame.
y_test_flat = np.ravel(y_test)
genre_labels = np.unique(np.concatenate((y_test_flat, np.ravel(y_pred))))

conf_mx = confusion_matrix(y_test_flat, y_pred, labels=genre_labels)

fig = print_confusion_matrix(
    conf_mx,
    list(genre_labels),
    figsize=(7,5)
)

# Finalize the plot
sns.despine(bottom=True)
plt.tight_layout(h_pad=2)

# Saving plot
fig.savefig('img/tf-idf/test_confusion_mx.png', dpi=200)

## Accuracy vs Training data

In [None]:
from sklearn import metrics 

# Training-subset sizes (number of samples) to evaluate.
# NOTE(review): the subsets are taken with `[:size]`, so any size larger than
# the training set silently uses all of it — confirm these values suit the
# current dataset.
sizes = [1000, 5000, 10000, 20000, 40000, 80000, 120000, 160000]

def accuracy_vs_train_size(model, X_train, y_train, X_test, y_test, sizes):
    """Measure test accuracy after fitting `model` on growing training prefixes.

    For every entry of `sizes`, in order, `model` is refitted in place on the
    first `size` training samples and scored on the full test set. The model
    is therefore left fitted on the last subset.

    Returns a list with one accuracy per entry of `sizes`.
    """
    def _fit_and_score(subset_size):
        # Refit on the leading `subset_size` samples only...
        model.fit(X_train[:subset_size], y_train[:subset_size])
        # ...then score the predictions made for the whole test set.
        predictions = model.predict(X_test)
        return metrics.accuracy_score(y_test, predictions)

    return [_fit_and_score(subset_size) for subset_size in sizes]

In [None]:
# Refit `best_clf` on each training-subset size and collect the test accuracy.
# `best_clf` is refitted in place by this call.
test_accuracies = accuracy_vs_train_size(
    best_clf, X_train, y_train,
    X_test, y_test, sizes
)

In [None]:
import pickle

OUTPUT_PATH = 'data/tf-idf/tfidf_accuracies.pickle'

# Store the sizes next to the accuracies so the file is self-describing.
output_dict = {
    "sizes": sizes,
    "test_accuracies": test_accuracies
}

with open(OUTPUT_PATH, 'wb') as writer:
    pickle.dump(output_dict, writer)

## Inference time

In [None]:
import pickle

# Reload the saved pipeline: the in-memory `best_clf` was refitted on
# training subsets by the "Accuracy vs Training data" section.
with open('data/tf-idf/best_clf.pickle', 'rb') as f:
    best_clf = pickle.load(f)

In [None]:
import time

# Upper bound on the number of single-sample predictions to time.
N_TIMED_SAMPLES = 1000

inference_times = []

# Never index past the end of the test set.
for i in range(min(N_TIMED_SAMPLES, len(X_test))):
    # Build the one-element batch outside the timed region.
    x = np.array([X_test[i], ])
    # `perf_counter` is a monotonic, high-resolution clock intended for
    # measuring short durations; `time.time()` is a wall clock that can be
    # coarse or jump when the system time is adjusted.
    start_time = time.perf_counter()
    y_pred = best_clf.predict(x)
    stop_time = time.perf_counter()

    inference_times.append(stop_time - start_time)

In [None]:
# Rebinds OUTPUT_PATH (previously the accuracies file) to the timings file.
OUTPUT_PATH = 'data/tf-idf/tfidf_times.pickle'

# Persist the list of per-sample prediction durations, in seconds.
with open(OUTPUT_PATH, 'wb') as writer:
    pickle.dump(inference_times, writer)

## Generalizability

In [None]:
import pickle

# Reload the saved pipeline so this section does not depend on kernel state.
with open('data/tf-idf/best_clf.pickle', 'rb') as f:
    best_clf = pickle.load(f)

In [None]:
import os
from utils_acl import get_data

# One sub-folder per product category of the French ACL dataset.
ACL_FOLDER = 'data/cls-acl10-unprocessed/fr'
BOOKS_FOLDER = os.path.join(ACL_FOLDER, 'books')
DVD_FOLDER = os.path.join(ACL_FOLDER, 'dvd')
MUSIC_FOLDER = os.path.join(ACL_FOLDER, 'music')

# Only the last two values returned by `get_data` are kept, as test inputs
# and test targets.
# NOTE(review): presumably the first two are the training portion, and the
# targets use a different label set than the `genre` values the classifier
# was trained on — confirm before interpreting the scores below.
_, _, X_test_b, y_test_b = get_data(BOOKS_FOLDER)
_, _, X_test_d, y_test_d = get_data(DVD_FOLDER)
_, _, X_test_m, y_test_m = get_data(MUSIC_FOLDER)

In [None]:
from sklearn import metrics 

def evaluate(model, X, y, average=None):
    """Print the accuracy and F1 score (both in percent) of `model` on (X, y).

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X : inputs passed unchanged to ``model.predict``
    y : true labels for `X`
    average : str or None, default None
        Averaging mode forwarded to ``metrics.f1_score``. ``None`` selects it
        automatically: ``"binary"`` (the previous behaviour) when at most two
        distinct labels occur in `y` and the predictions, ``"macro"``
        otherwise. The binary default raises ValueError on multi-class
        targets.

    Returns
    -------
    None
        The two scores are printed, not returned.
    """
    y_pred = model.predict(X)

    if average is None:
        # Count the distinct labels seen in either the truth or the predictions.
        seen_labels = set(np.ravel(y).tolist()) | set(np.ravel(y_pred).tolist())
        average = "binary" if len(seen_labels) <= 2 else "macro"

    print("Accuracy: {:.2f}".format(100 * metrics.accuracy_score(y, y_pred)))
    print("F1-Score: {:.2f}".format(100 * metrics.f1_score(y, y_pred, average=average)))

### Books

In [None]:
# Score the saved pipeline on the books test data.
evaluate(best_clf, X_test_b, y_test_b)

### DVD

In [None]:
# Score the saved pipeline on the DVD test data.
evaluate(best_clf, X_test_d, y_test_d)

### Music

In [None]:
# Score the saved pipeline on the music test data.
evaluate(best_clf, X_test_m, y_test_m)