# TME 01 - Bag of Words

Auteur : LUONG Ethan
Auteur : PHAM Louis-Antoine

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import codecs
import re
import os.path
import string

import nltk
from nltk.corpus import stopwords

from functools import reduce

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

#from wordcloud import WordCloud

## Chargement des données

In [None]:
# Path of the labelled presidents corpus read by load_pres (opened as UTF-8).
PRESIDENTS_FILENAME = './datasets/AFDpresidentutf8/corpus.tache1.learn.utf8'
# Root directory of the movie reviews; load_movies treats each sub-directory as one class.
MOVIES_DIRNAME = './datasets/movies/movies1000/'

# Présidents
def load_pres(fname):
    """Load the labelled presidents corpus.

    Every line is expected to start with a ``<digits:digits:X>`` tag, where
    ``X`` is a single character, followed by the sentence itself.

    Args:
        fname: path of the UTF-8 encoded corpus file.

    Returns:
        A pair ``(alltxts, alllabs)``: the sentences with their tag removed
        (the trailing newline is kept) and the matching labels, ``-1`` when
        the extracted tag contains ``'M'`` and ``1`` otherwise.

    Reading stops at end of file or at the first line shorter than five
    characters, whichever comes first.
    """
    alltxts = []
    alllabs = []
    # codecs.open is kept so line splitting behaves exactly as before; the
    # `with` block guarantees the handle is closed, even on error.
    with codecs.open(fname, 'r', 'utf-8') as s:
        for line in s:
            if len(line) < 5:
                break
            # Keep only the tag character, then only the text after the tag.
            lab = re.sub(r"<[0-9]*:[0-9]*:(.)>.*", "\\1", line)
            txt = re.sub(r"<[0-9]*:[0-9]*:.>(.*)", "\\1", line)
            alllabs.append(-1 if lab.count('M') > 0 else 1)
            alltxts.append(txt)
    return alltxts, alllabs

# Films
def load_movies(path2data): # one class per sub-directory
    """Load a corpus stored as one sub-directory per class.

    Args:
        path2data: directory whose sub-directories each hold the documents
            of one class (with or without a trailing path separator).

    Returns:
        A pair ``(alltxts, labs)``: the content of every file and the
        integer id of the class it belongs to. Class ids are assigned in
        alphabetical order of the sub-directory names, starting at 0.
    """
    alltxts = []
    labs = []
    # os.listdir returns entries in arbitrary order: sort them so that a
    # given class always receives the same id. Entries that are not
    # directories (stray files) are ignored instead of crashing listdir.
    class_dirs = sorted(
        name for name in os.listdir(path2data)
        if os.path.isdir(os.path.join(path2data, name))
    )
    for label, cl in enumerate(class_dirs):
        class_path = os.path.join(path2data, cl)
        for f in sorted(os.listdir(class_path)):
            # `with` closes each file right after it has been read.
            with open(os.path.join(class_path, f)) as handle:
                alltxts.append(handle.read())
            labs.append(label)

    return alltxts, labs

In [None]:
# Load every movie review together with its integer class label.
movtxts, movlabs = load_movies(MOVIES_DIRNAME)

print(f'Data size: {len(movtxts)}, {len(movlabs)}')

# Show the last 100 characters and the label of the first and last documents.
print(f'{movtxts[0][-100:]} {movlabs[0]}')
print(f'{movtxts[-1][-100:]} {movlabs[-1]}')

# Number of documents per label (indexing counts[1] assumes at least two classes).
classes, counts = np.unique(movlabs, return_counts=True)
print(f'Class 0: {counts[0]} examples, Class 1: {counts[1]} examples')

## Pré-traitement des données textuelles

In [None]:
def preprocess(text):
    """Normalise an English document before vectorisation.

    Steps, in order: lower-casing, replacement of ASCII punctuation and of
    newline/tab/carriage-return characters by spaces, removal of digit
    runs, Porter stemming of every token, whitespace normalisation.

    Args:
        text: raw document.

    Returns:
        The stemmed tokens joined by single spaces, without leading or
        trailing whitespace.
    """
    # Lower case
    text = text.lower()

    # Replace punctuation and line breaks by spaces (one C-level pass)
    punc = string.punctuation + '\n\t\r'
    text = text.translate(str.maketrans(punc, ' ' * len(punc)))

    # Remove numbers
    text = re.sub('[0-9]+', '', text)

    # Stemming. split() without argument cuts on any whitespace, so every
    # token reaches the stemmer on its own, and join() builds the result in
    # linear time (the previous reduce re-copied the string for each word).
    ps = nltk.stem.PorterStemmer()
    text = ' '.join(ps.stem(word) for word in text.split())

    # Remove duplicate whitespaces
    return ' '.join(text.split())

In [None]:
# Sanity check: the first 100 characters of one review before and after preprocess().
print(f'Before preprocessing:\n{movtxts[100][:100]}\n')
print(f'After preprocessing:\n{preprocess(movtxts[100][:100])}')

In [None]:
#nltk.download('stopwords')
# The corpus reader is reached through nltk.corpus on purpose: the name
# `stopwords` is rebound to a plain list just below, so relying on the
# imported `stopwords` object would make this cell fail when run twice.
stopwords_list = nltk.corpus.stopwords.words('english') + nltk.corpus.stopwords.words('french')

# Apply the same normalisation to the stop words as to the documents.
stopwords = [preprocess(stopword) for stopword in stopwords_list]

# Stemmed variant of the preprocessed stop words (Porter stemmer)
ps = nltk.stem.PorterStemmer()
stopwords_stem = [ps.stem(x) for x in stopwords]

## Exploration des données

Reste :
<ul>
    <li>voc original</li>
    <li>fréq. documentaire</li>
    <li>odds ratio</li>
    <li>n grams</li>
</ul>

In [None]:
def get_vocabulary(alltxts):
    """Collect the distinct whitespace-separated tokens of a corpus.

    Args:
        alltxts: iterable of documents (strings).

    Returns:
        A dict mapping every token to 1, keyed in order of first appearance.
    """
    vocabulary = {}
    for document in alltxts:
        # Tokens that are already known keep their position in the dict.
        vocabulary.update(dict.fromkeys(document.split(), 1))
    return vocabulary

# Vocabulary size of the raw (not preprocessed) movie reviews.
print(f'|V| = {len(get_vocabulary(movtxts).keys())}')

In [None]:
# Side-by-side word clouds (100 most frequent words, raw stop words excluded).
# NOTE(review): the `from wordcloud import WordCloud` line of the import cell is
# commented out, so WordCloud is undefined unless that import is restored.
# NOTE(review): prestxts is only assigned further down (presidents section);
# this cell fails when the notebook is executed from top to bottom.
plt.subplots(1, 2, figsize=(15, 12))

# Left panel: presidents corpus
plt.subplot(1, 2, 1)
wordcloud = WordCloud(background_color='white', max_words=100, stopwords=stopwords_list).generate(" ".join(prestxts))
plt.imshow(wordcloud)
plt.title('Presidents Word Cloud')
plt.axis('off')

# Right panel: movie reviews
plt.subplot(1, 2, 2)
wordcloud = WordCloud(background_color='white', max_words=100, stopwords=stopwords_list).generate(" ".join(movtxts))
plt.imshow(wordcloud)
plt.title('Movies Word Cloud')
plt.axis('off')

plt.show()

In [None]:
# Zipf-style view: the 50 terms with the largest total TF-IDF weight
vectorizer_tfidf = TfidfVectorizer(preprocessor=preprocess, stop_words=stopwords)
X_tfidf = vectorizer_tfidf.fit_transform(movtxts)
# Sum the columns directly on the sparse matrix: the full documents x terms
# matrix is never materialised as a dense array (it used to be, twice).
# np.asarray(...).ravel() flattens the 1 x n_terms result into a vector.
term_weights = np.asarray(X_tfidf.sum(axis=0)).ravel()
indices = np.argsort(-term_weights)
plt.plot(term_weights[indices[:50]])
plt.xticks(range(50), vectorizer_tfidf.get_feature_names_out()[indices[:50]], rotation=90)
plt.show()

## Modèles de Machine Learning

Reste :
<ul>
    <li>optimisation de min_df et max_df</li>
    <li>bigrammes et trigrammes</li>
    <li>comparaisons subplots</li>
    <li>avec/sans équilibrage des classes pour les présidents</li>
</ul>

In [None]:
# CountVectorizer led to convergence problems with logistic regression, for similar performance
# vectorizer_count = CountVectorizer(preprocessor=preprocess, stop_words=stopwords)
# X_count = vectorizer_count.fit_transform(movtxts)

# TF-IDF features of the movie reviews, filtered with the stemmed stop-word list.
vectorizer_tfidf = TfidfVectorizer(preprocessor=preprocess, stop_words=stopwords_stem)
X_tfidf = vectorizer_tfidf.fit_transform(movtxts)

In [None]:
def test_cross_val(X, labels, cv=5, seed=0):
    """Cross-validate three classifiers and plot their mean scores.

    Naive Bayes, a linear SVM and a class-weighted logistic regression are
    each evaluated with accuracy, ROC AUC and F1, averaged over the folds,
    and the nine values are drawn as a grouped bar chart.

    Args:
        X: feature matrix (one row per document).
        labels: binary class labels aligned with the rows of ``X``.
        cv: cross-validation strategy forwarded to scikit-learn.
        seed: value used to seed NumPy's global generator during the run.

    Returns:
        None. The function displays a figure and, as before, re-seeds
        NumPy's global generator from fresh entropy before returning.
    """
    # Local import: the import cell only brings in cross_val_score.
    from sklearn.model_selection import cross_validate

    np.random.seed(seed)

    classifiers = [
        ('NB', MultinomialNB()),
        ('SVM', LinearSVC(random_state=0, dual='auto')),
        ('LR', LogisticRegression(random_state=0, solver='liblinear', max_iter=100,
                                  tol=1e-8, C=10.0, class_weight="balanced")),
    ]

    accuracies = []
    roc_auc_scores = []
    f1_scores = []
    for _, clf in classifiers:
        # One call computes the three metrics on the same fitted folds; the
        # previous code re-ran the whole cross-validation once per metric.
        scores = cross_validate(clf, X, labels, cv=cv, scoring=('accuracy', 'roc_auc', 'f1'))
        accuracies.append(scores['test_accuracy'].mean())
        roc_auc_scores.append(scores['test_roc_auc'].mean())
        f1_scores.append(scores['test_f1'].mean())

    # Grouped bars: one group per classifier, one bar per metric
    x_axis = np.arange(len(classifiers))
    width = 0.2
    gap = width + 0.05
    plt.figure()
    plt.title('Cross Validation Scores')
    acc_bar = plt.bar(x_axis - gap, accuracies, width=width, label='Accuracy', color='steelblue')
    roc_auc_bar = plt.bar(x_axis, roc_auc_scores, width=width, label='ROC AUC', color='mediumseagreen')
    f1_bar = plt.bar(x_axis + gap, f1_scores, width=width, label='F1 Score', color='lightcoral')
    plt.xticks(x_axis, [name for name, _ in classifiers])
    # Print the value on top of every bar
    for rect in acc_bar + roc_auc_bar + f1_bar:
        height = rect.get_height()
        plt.text(rect.get_x() + rect.get_width() / 2.0, height, f'{height:.2f}', ha='center', va='bottom')
    plt.legend(loc='lower right')
    plt.show()
    np.random.seed()

test_cross_val(X_tfidf, movlabs)

In [None]:
def test_train_test_split(X, labels, test_size=0.2, seed=0):
    """Evaluate three classifiers on a single train/test split and plot the scores.

    Naive Bayes, a linear SVM and a logistic regression are fitted on the
    training part and scored on the held-out part with accuracy, ROC AUC
    and F1; the nine values are drawn as a grouped bar chart.

    Args:
        X: feature matrix (one row per document).
        labels: binary class labels aligned with the rows of ``X``.
        test_size: share of the data kept for testing.
        seed: value used to seed NumPy's global generator, which drives the
            shuffling done by ``train_test_split``.

    Returns:
        None. The function displays a figure and, as before, re-seeds
        NumPy's global generator from fresh entropy before returning.
    """
    np.random.seed(seed)
    X_train, X_test, Y_train, Y_test = train_test_split(X, labels, test_size=test_size)

    classifiers = [
        ('NB', MultinomialNB()),
        ('SVM', LinearSVC(random_state=0, dual='auto')),
        ('LR', LogisticRegression(random_state=0, solver='liblinear', max_iter=100,
                                  tol=1e-8, C=100.0)),
    ]

    accuracies = []
    roc_auc_scores = []
    f1_scores = []
    for _, clf in classifiers:
        clf.fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)
        accuracies.append(accuracy_score(Y_test, Y_pred))
        # ROC AUC ranks examples, so it needs continuous scores; feeding it
        # the hard 0/1 predictions collapses the curve to a single point.
        roc_auc_scores.append(roc_auc_score(Y_test, _positive_class_scores(clf, X_test)))
        f1_scores.append(f1_score(Y_test, Y_pred))

    # Grouped bars: one group per classifier, one bar per metric
    x_axis = np.arange(len(classifiers))
    width = 0.2
    gap = width + 0.05
    plt.figure()
    plt.title('Train-Test Split Scores')
    acc_bar = plt.bar(x_axis - gap, accuracies, width=width, label='Accuracy', color='steelblue')
    roc_auc_bar = plt.bar(x_axis, roc_auc_scores, width=width, label='ROC AUC', color='mediumseagreen')
    f1_bar = plt.bar(x_axis + gap, f1_scores, width=width, label='F1 Score', color='lightcoral')
    plt.xticks(x_axis, [name for name, _ in classifiers])
    # Print the value on top of every bar
    for rect in acc_bar + roc_auc_bar + f1_bar:
        height = rect.get_height()
        plt.text(rect.get_x() + rect.get_width() / 2.0, height, f'{height:.2f}', ha='center', va='bottom')
    plt.legend(loc='lower right')
    plt.show()
    np.random.seed()


def _positive_class_scores(clf, X):
    """Return continuous scores for the class with the greater label."""
    if hasattr(clf, "decision_function"):
        return clf.decision_function(X)
    # Classifiers without a decision function (e.g. naive Bayes) expose
    # probabilities; column 1 belongs to the second entry of clf.classes_.
    return clf.predict_proba(X)[:, 1]

test_train_test_split(X_tfidf, movlabs)

In [None]:
# Binary variant: binary=True is passed to TfidfVectorizer (per the scikit-learn
# documentation, non-zero term counts are then set to 1 before weighting).
vectorizer_tfidf_bin = TfidfVectorizer(preprocessor=preprocess, stop_words=stopwords_stem, binary=True)
X_tfidf_bin = vectorizer_tfidf_bin.fit_transform(movtxts)

In [None]:
# Evaluate the binary TF-IDF features with both protocols.
test_cross_val(X_tfidf_bin, movlabs)
test_train_test_split(X_tfidf_bin, movlabs)

In [None]:
def export_data(X, alllabs, tests, out_path='results.txt'):
    """Train a linear SVM on the full training set and export test predictions.

    Args:
        X: training feature matrix.
        alllabs: training labels aligned with the rows of ``X``.
        tests: feature matrix of the documents to label, built with the
            same vectorizer as ``X``.
        out_path: file that receives the predictions; it is overwritten.
            Defaults to ``'results.txt'``, the path used so far.

    Returns:
        None. One line is written per test document: ``P`` when the
        predicted label equals 1, ``N`` otherwise.
    """
    np.random.seed(0)
    svm_clf = LinearSVC(random_state=0, dual='auto')
    svm_clf.fit(X, alllabs)
    pred = svm_clf.predict(tests)
    with open(out_path, 'w') as f:
        # A single writelines call instead of one write per prediction
        f.writelines('P\n' if yhat == 1 else 'N\n' for yhat in pred)

# One test review per line (readlines keeps the trailing newline of each line).
with open('./datasets/movies/testSentiment.txt', 'r', encoding='utf-8') as f:
    movtests = f.readlines()

# Reuse the vectorizer fitted on the training reviews: transform only, no refit.
X_test = vectorizer_tfidf_bin.transform(movtests)
export_data(X_tfidf_bin, movlabs, X_test)

In [None]:
# N-gram test (unigrams and bigrams) combined with binary term counts.
# This rebinds vectorizer_tfidf and X_tfidf defined in earlier cells.
vectorizer_tfidf = TfidfVectorizer(preprocessor=preprocess, stop_words=stopwords_stem,ngram_range=(1,2),binary=True)
X_tfidf = vectorizer_tfidf.fit_transform(movtxts)
print(X_tfidf.shape)
test_train_test_split(X_tfidf, movlabs)
test_cross_val(X_tfidf, movlabs)

In [None]:
def export_data(X, alllabs, tests, out_path='results.txt'):
    """Train a linear SVM on the full training set and export test predictions.

    Args:
        X: training feature matrix.
        alllabs: training labels aligned with the rows of ``X``.
        tests: feature matrix of the documents to label, built with the
            same vectorizer as ``X``.
        out_path: file that receives the predictions; it is overwritten.
            Defaults to ``'results.txt'``, the path used so far.

    Returns:
        None. One line is written per test document: ``P`` when the
        predicted label equals 1, ``N`` otherwise.
    """
    np.random.seed(0)
    svm_clf = LinearSVC(random_state=0, dual='auto')
    svm_clf.fit(X, alllabs)
    pred = svm_clf.predict(tests)
    with open(out_path, 'w') as f:
        # A single writelines call instead of one write per prediction
        f.writelines('P\n' if yhat == 1 else 'N\n' for yhat in pred)

# One test review per line (readlines keeps the trailing newline of each line).
with open('./datasets/movies/testSentiment.txt', 'r', encoding='utf-8') as f:
    movtests = f.readlines()

# Same export as before, this time with the n-gram vectorizer and features.
X_test = vectorizer_tfidf.transform(movtests)
export_data(X_tfidf, movlabs, X_test)

## Dataset Présidents

In [None]:
# Preprocessing for the speech documents
def preprocess_pres(text):
    """Normalise a French speech sentence before vectorisation.

    Steps, in order: replacement of ASCII punctuation and of
    newline/tab/carriage-return characters by spaces, removal of digit
    runs, French Snowball stemming of every token, whitespace
    normalisation. Unlike ``preprocess``, no explicit lower-casing is done
    here.

    Args:
        text: raw sentence.

    Returns:
        The stemmed tokens joined by single spaces, without leading or
        trailing whitespace.
    """
    # Replace punctuation and line breaks by spaces (one C-level pass)
    punc = string.punctuation + '\n\t\r'
    text = text.translate(str.maketrans(punc, ' ' * len(punc)))

    # Remove numbers
    text = re.sub('[0-9]+', '', text)

    # Stemming. split() without argument cuts on any whitespace, so every
    # token reaches the stemmer on its own, and join() builds the result in
    # linear time (the previous reduce re-copied the string for each word).
    ps = nltk.stem.SnowballStemmer("french")
    text = ' '.join(ps.stem(word) for word in text.split())

    # Remove duplicate whitespaces
    return ' '.join(text.split())

In [None]:
# Load the labelled speech sentences (load_pres maps the 'M' tag to -1, everything else to 1).
prestxts, preslabs = load_pres(PRESIDENTS_FILENAME)

print(f'Data size: {len(prestxts)}, {len(preslabs)}')

# First and last sentence with their labels
print(f'{prestxts[0]} {preslabs[0]}')
print(f'{prestxts[-1]} {preslabs[-1]}')

# np.unique sorts the labels, so counts[0] belongs to -1 and counts[1] to 1.
classes, counts = np.unique(preslabs, return_counts=True)
print(f'Miterrand: {counts[0]} examples, Chirac: {counts[1]} examples')

# Sanity check of preprocess_pres on the first 100 characters of one sentence
print(f'Before preprocessing:\n{prestxts[100][:100]}\n')
print(f'After preprocessing:\n{preprocess_pres(prestxts[100][:100])}')

In [None]:
# Rebuild stopwords_stem with the French Snowball stemmer for the speech corpus.
# The input is the `stopwords` list produced earlier with preprocess().
ps = nltk.stem.SnowballStemmer("french")
stopwords_stem = [ps.stem(x) for x in stopwords]

In [None]:
# Vectorizer variants compared on the speech corpus

# TF-IDF weighting
vectorizer_tfidf = TfidfVectorizer(preprocessor=preprocess_pres, stop_words=stopwords_stem)

# Term frequency only (IDF disabled)
vectorizer_tf = TfidfVectorizer(preprocessor=preprocess_pres, stop_words=stopwords_stem,use_idf=False)

# binary=True variant
vectorizer_bin = TfidfVectorizer(preprocessor=preprocess_pres, stop_words=stopwords_stem,binary=True)

# Unigrams to trigrams
vectorizer_tfidf_gram = TfidfVectorizer(preprocessor=preprocess_pres, stop_words=stopwords_stem,ngram_range=(1,3))

# Unigrams to trigrams with max_df=2000 and at most 200000 features
vectorizer_tfidf_gram_max_df = TfidfVectorizer(preprocessor=preprocess_pres, stop_words=stopwords_stem,ngram_range=(1,3),max_df=2000,max_features=200000)

# Fit every vectorizer on the original (unbalanced) corpus
X_tfidf = vectorizer_tfidf.fit_transform(prestxts)
X_tf = vectorizer_tf.fit_transform(prestxts)
X_bin = vectorizer_bin.fit_transform(prestxts)
X_gram = vectorizer_tfidf_gram.fit_transform(prestxts)
X_gram_max = vectorizer_tfidf_gram_max_df.fit_transform(prestxts)

# TF-IDF
test_cross_val(X_tfidf,preslabs)
# TF
test_cross_val(X_tf,preslabs)
# binary
test_cross_val(X_bin,preslabs)
# 1- to 3-grams
test_cross_val(X_gram,preslabs)

# 1- to 3-grams with the feature count capped at 200000
test_cross_val(X_gram_max,preslabs)

In [None]:
# Class balancing: collect the sentences labelled -1
idx_mitterrand = np.argwhere(np.array(preslabs) == -1)

# argwhere returns an (n, 1) array. Reading column 0 yields scalar indices,
# which avoids calling int() on a 1-element array (a conversion deprecated
# by recent NumPy releases).
txt_mitterrand = [prestxts[i] for i in idx_mitterrand[:, 0]]

# Six extra copies of each of those sentences
txt_mitterrand_dup = txt_mitterrand*6

In [None]:
# Duplicate the minority-class sentences to balance the two classes
print(len(txt_mitterrand_dup))

# Append the copies, each labelled -1, to the original corpus.
# NOTE(review): exact duplicates can end up in both the training and the test
# folds of a later cross-validation, which may inflate the scores; to confirm.
prestxts_dup = prestxts + txt_mitterrand_dup
preslabs_dup = preslabs + list(-np.ones(len(txt_mitterrand_dup),dtype = int))



In [None]:
# Refit the existing vectorizer objects on the balanced corpus. From here on
# their vocabulary and IDF weights come from prestxts_dup.
X_tfidf_dup = vectorizer_tfidf.fit_transform(prestxts_dup)
X_tf_dup = vectorizer_tf.fit_transform(prestxts_dup)
X_bin_dup = vectorizer_bin.fit_transform(prestxts_dup)
X_gram_dup = vectorizer_tfidf_gram.fit_transform(prestxts_dup)


# Same cross-validation as before, on the balanced data
test_cross_val(X_tfidf_dup,preslabs_dup)
test_cross_val(X_tf_dup,preslabs_dup)
test_cross_val(X_bin_dup,preslabs_dup)
test_cross_val(X_gram_dup,preslabs_dup)

### Post-traitement

In [None]:
def gaussian_kernel(size):
    """Build a normalised 1-D Gaussian kernel.

    Args:
        size: half-width of the kernel; the kernel covers the offsets
            ``-size .. size`` and uses ``sigma = size / 3``.

    Returns:
        An array of ``2 * size + 1`` weights summing to 1. ``size == 0``
        yields the identity kernel ``[1.]`` (it used to yield NaN because
        of the division by ``sigma == 0``).
    """
    if size == 0:
        return np.ones(1)
    sigma = size/3
    x = np.arange(-size, size+1)
    kern = np.exp(-(x**2)/(2*sigma*sigma))
    return kern / kern.sum()

def gaussian_smoothing(pred, size):
    """Smooth a 1-D sequence of predictions with a Gaussian kernel.

    Args:
        pred: 1-D sequence of numbers; it is not modified.
        size: half-width forwarded to ``gaussian_kernel``.

    Returns:
        An array with exactly as many values as ``pred``. The sequence is
        implicitly zero-padded, so values near both ends are pulled towards
        zero. An empty input gives an empty array.
    """
    predictions = np.asarray(pred)
    if predictions.size == 0:
        # np.convolve rejects empty inputs; nothing to smooth anyway.
        return predictions.astype(float)
    kernel = gaussian_kernel(size)
    # mode='same' returns max(len(pred), len(kernel)) values, i.e. more than
    # the input when the kernel is the longer one. Cropping the full
    # convolution around its centre always keeps len(pred) values and is
    # identical to mode='same' whenever the input is at least kernel-sized.
    full = np.convolve(predictions, kernel, mode='full')
    start = (len(kernel) - 1) // 2
    return full[start:start + len(predictions)]

In [None]:
# Raw labels of the first 100 sentences, in corpus order
plt.figure()
plt.plot(list(range(len(preslabs[0:100]))), preslabs[0:100])
plt.show()

In [None]:
# Same 100 labels after Gaussian smoothing with size 1
plt.figure()
plt.plot(list(range(len(preslabs[0:100]))), gaussian_smoothing(preslabs[0:100], 1))
plt.show()

In [None]:
# Export of the predictions for the speeches
def export_data_pres(X, alllabs, tests, clf, out_path='results_pres.txt', smoothing_size=3):
    """Fit ``clf`` and export smoothed class probabilities for the test set.

    Args:
        X: training feature matrix.
        alllabs: training labels aligned with the rows of ``X``.
        tests: feature matrix of the test sentences, in corpus order.
        clf: unfitted classifier exposing ``fit`` and ``predict_proba``.
        out_path: file that receives the scores; it is overwritten.
            Defaults to ``'results_pres.txt'``, the path used so far.
        smoothing_size: half-width of the Gaussian kernel applied to the
            scores. Defaults to 3, the value used so far.

    Returns:
        None. One smoothed score is written per line.
    """
    np.random.seed(0)
    clf.fit(X, alllabs)
    pred = clf.predict_proba(tests)
    # Post-processing happens here: column 0 of predict_proba (the first
    # entry of clf.classes_) is smoothed along the order of the test set.
    pred = gaussian_smoothing(pred[:, 0], smoothing_size)
    with open(out_path, 'w') as f:
        # A single writelines call instead of one write per score
        f.writelines(f'{yhat}\n' for yhat in pred)


In [None]:
# Path of the unlabelled presidents test corpus (opened as UTF-8).
PRESIDENTS_FILENAME_TEST = './datasets/AFDpresidentutf8/corpus.tache1.test.utf8'

def load_pres_test(fname):
    """Load the unlabelled presidents test corpus.

    Every line is expected to start with a ``<digits:digits>`` tag followed
    by the sentence itself.

    Args:
        fname: path of the UTF-8 encoded corpus file.

    Returns:
        The list of sentences with their tag removed (the trailing newline
        is kept). Reading stops at end of file or at the first line shorter
        than five characters, whichever comes first.
    """
    alltxts = []
    # codecs.open is kept so line splitting behaves exactly as before; the
    # `with` block guarantees the handle is closed, even on error.
    with codecs.open(fname, 'r', 'utf-8') as s:
        for line in s:
            if len(line) < 5:
                break
            alltxts.append(re.sub(r"<[0-9]*:[0-9]*>(.*)", "\\1", line))
    return alltxts
# Load the test sentences and display the first one as a quick check.
text_test = load_pres_test(PRESIDENTS_FILENAME_TEST)

print(text_test[0])

In [None]:
# Final export: logistic regression trained on the balanced 1- to 3-gram features.
lr_clf = LogisticRegression(random_state=0, solver='liblinear', max_iter=100, tol=1e-8, C=10)
# vectorizer_tfidf_gram was last fitted on the balanced corpus, matching X_gram_dup.
X_tfidf_test = vectorizer_tfidf_gram.transform(text_test)
export_data_pres(X_gram_dup,preslabs_dup,X_tfidf_test,lr_clf)

In [None]:
# Alternative setup: class-weighted logistic regression with test features from
# the max_df / max_features n-gram vectorizer (fitted on the unbalanced corpus).
# NOTE(review): neither lr_clf nor X_tfidf_test is used after this point in the
# visible source; the matching export call may be missing.
lr_clf = LogisticRegression(random_state=0, solver='liblinear', max_iter=100, tol=1e-8, C=10,class_weight="balanced")
X_tfidf_test = vectorizer_tfidf_gram_max_df.transform(text_test)
