PHO Vinh-Son 3802052 <br>
CHOI Esther 3800370

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import codecs
import re
import os.path

import sklearn.naive_bayes as nb
from sklearn import svm
from sklearn import linear_model as lin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import string

#df,mf,b,smooth = (0.18, 10000, 0.134, [1, 2, 3, 4, 5, 6, 7, 8, 2])

def conf(Y_Test,Y_hat):
    confusion = np.zeros((2,2))
    Y_hat = (Y_hat+1)//2
    Y_Test = (Y_Test+1)//2
    for i in range(Y_Test.size):
        confusion[Y_Test[i],Y_hat[i]] +=1
    return confusion
    
def f1(conf):
    tp = conf[0,0]
    false = conf[0,1] + conf[1,0]
    return (tp/(tp+0.5*false))

def load_movies(dirname):
    """
    str -> list(str) * list(str) * list(str)
    dirname = contient le dossier des avis positifs et negatifs (supposés appelés pos et neg)
        et le fichier test supposé appelé "testSentiment.txt"
    Retourne la liste des avis négatifs, la liste des avis positifs et la liste test
    """
    postxts = []
    negtxts = []
    testtxts = []
    
    punc = string.punctuation  # recupération de la ponctuation
    punc += '\n\r\t'
    
    # avis positifs
    dirpos = dirname+"/pos"
    for filename in os.listdir(dirpos):
        fpath = os.path.join(dirpos, filename) #dirname/pos/filename
        f = codecs.open(fpath, 'r','utf-8') # pour régler le codage
        t = f.read()
        #t = t.translate(str.maketrans(punc, ' ' * len(punc))) #suppression de la ponctuation
        postxts.append(t)
        f.close()
        
    # avis négatifs
    dirneg = dirname+"/neg"
    for filename in os.listdir(dirneg):
        fpath = os.path.join(dirneg, filename) #dirname/pos/filename
        f = codecs.open(fpath, 'r','utf-8') # pour régler le codage
        t = f.read()
        #t = t.translate(str.maketrans(punc, ' ' * len(punc))) #suppression de la ponctuation
        negtxts.append(t)
        f.close()
    
    # fichier test
    ftest = dirname+"/testSentiment.txt"
    f = codecs.open(ftest, 'r','utf-8') # pour régler le codage        
    testtxts = f.readlines()
    
    while True:
        t = f.readline()
        if not t: # empty line = eof
            break
        #t = t.translate(str.maketrans(punc, ' ' * len(punc))) #suppression de la ponctuation
        testtxts.append(t)
    
    return postxts,negtxts,testtxts

def load_pres_T(fname):
    alltxts_T = []
    s=codecs.open(fname, 'r','utf-8') # pour régler le codage
    while True:
        txt = s.readline()
        if(len(txt))<5:
            break
        txt = re.sub(r"<[0-9]*:[0-9]*>(.*)","\\1",txt)
        alltxts_T.append(txt)
    return alltxts_T

def crossval(alltxts,alllabs,cv = 5,shuffle = False, seed = 42):
    ind = np.array([i for i in range(alllabs.size)])
    if shuffle:
        np.random.seed(seed)
        np.random.shuffle(ind)
    splitind = np.array_split(ind, cv)
    lcrossval = []
    for indices in splitind:
        lcrossval.append((alltxts[indices],np.delete(alltxts,indices),alllabs[indices],np.delete(alllabs,indices)))
    return lcrossval
    

## Mise en forme des données

In [2]:
def trainset(postxts, negtxts, mode):
    """
    list(str) * list(str) -> list(str) * list(str)
    mode = concat, blocs, random
    Retourne l'ensemble de train défini selon le mode passé en paramètre
    """
    if mode == "concat":
        alltxts = postxts + negtxts
        alllabs = [1 for i in range(len(postxts))] + [-1 for i in range(len(negtxts))]
    
    elif mode == "blocs":
        n = int((len(postxts)//10)*2) #nombre de blocs de 10
        alltxts = []
        alllabs = []
        for i in range(n):
            if i%2 == 0:
                alltxts = alltxts + postxts[i:i+10]
                alllabs = alllabs + [1 for j in range(10)]
            else:
                alltxts = alltxts + negtxts[i:i+10]
                alllabs = alllabs + [-1 for j in range(10)]
                
    elif mode == "random":
        t = postxts + negtxts
        l = [1 for i in range(len(postxts))] + [-1 for i in range(len(negtxts))]
        tmp = np.array([t,l])
        np.random.shuffle(tmp.T)
        alltxts = tmp[0,:]
        alllabs = tmp[1,:].astype(int)
                
    else:
        print("mode invalide")
        return
    
    return np.array(alltxts), np.array(alllabs)

## Prédiction

In [3]:
def score_pred_param(df,mf,fname = "movies1000", trainsetmode = "random",cv_fold = 5,vectorizer = TfidfVectorizer, classifier = lin.LogisticRegression):
    postxts,negtxts,testtxts = load_movies(fname)
    alltxts, alllabs = trainset(postxts,negtxts,trainsetmode)
    
    lcrossval = crossval(alltxts,alllabs,cv = cv_fold,shuffle = True, seed = 42)
    liste_score_bonneclassif = []
    
    for X_Test, X_Train, Y_Test, Y_Train in lcrossval:

        #init
        vectorizer2 = vectorizer(analyzer='word',max_df = df, max_features = mf,stop_words='english', lowercase = True, token_pattern=r"(?u)\b\w\w+\b|!|.|\?|\"|\'")
        X_VTr = vectorizer2.fit_transform(X_Train)
        X_VTe = vectorizer2.transform(X_Test)
        clf = classifier(max_iter = 300)
        
        #fit
        clf.fit(X_VTr, Y_Train)
        
        #predict
        Y_hat_b = clf.predict(X_VTe)

        #scoring taux de bonne classif
        c = 0
        for i in range(Y_Test.size):
            if Y_Test[i] == Y_hat_b[i]:
                c += 1
        liste_score_bonneclassif.append(c/Y_Test.size)
        
    liste_score_bonneclassif = np.array(liste_score_bonneclassif)
    return liste_score_bonneclassif

In [5]:
df, mf = (0.28, 9500)
l = score_pred_param(df,mf)
print(l.mean())

0.825
