PHO Vinh-Son 3802052 <br>
CHOI Esther 3800370

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import codecs
import re
import os.path

import sklearn.naive_bayes as nb
from sklearn import svm
from sklearn import linear_model as lin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


def conf(Y_Test,Y_hat):
    """Build the 2x2 confusion matrix for labels encoded as -1 / +1.

    Labels are mapped to matrix indices with ``(label + 1) // 2``, i.e.
    -1 -> 0 and +1 -> 1. Rows index the true class, columns the predicted
    class.

    Args:
        Y_Test: true labels, any 1-D array-like of -1 / +1 values.
        Y_hat: predicted labels, same length as ``Y_Test``.

    Returns:
        A ``(2, 2)`` float array where ``result[t, p]`` is the number of
        samples whose true index is ``t`` and predicted index is ``p``.

    Raises:
        ValueError: if the two inputs do not have the same number of elements.
    """
    # np.asarray lets callers pass plain lists as well as numpy arrays.
    truth = (np.asarray(Y_Test).ravel() + 1) // 2
    pred = (np.asarray(Y_hat).ravel() + 1) // 2
    if truth.size != pred.size:
        raise ValueError(
            "Y_Test and Y_hat must have the same length "
            "(got %d and %d)" % (truth.size, pred.size)
        )

    confusion = np.zeros((2, 2))
    # np.add.at accumulates correctly when the same (row, col) pair occurs
    # several times, unlike ``confusion[rows, cols] += 1``.
    np.add.at(confusion, (truth.astype(int), pred.astype(int)), 1)
    return confusion
    
def f1(conf):
    """Return the F1 score of the class stored at index 0 of a confusion matrix.

    The score is ``tp / (tp + 0.5 * (conf[0, 1] + conf[1, 0]))`` with
    ``tp = conf[0, 0]``. With the matrices built elsewhere in this file,
    index 0 corresponds to label -1.

    Args:
        conf: a 2x2 confusion matrix indexable as ``conf[i, j]``
            (note: inside this function the name shadows the ``conf`` helper).

    Returns:
        The F1 score, or ``0.0`` when the matrix has no true positive and no
        error, where the ratio would otherwise be 0/0.
    """
    tp = conf[0,0]
    errors = conf[0,1] + conf[1,0]
    denominator = tp + 0.5*errors
    if denominator == 0:
        # Undefined score: return 0.0 instead of NaN so that averaging the
        # per-fold scores stays meaningful.
        return 0.0
    return tp/denominator

def load_pres(fname):
    """Load the labelled corpus: one ``<id:id:L>text`` record per line.

    Reading stops at the first line shorter than 5 characters (this includes
    end of file). A record is labelled -1 when its label part contains ``M``
    and +1 otherwise. The returned texts keep their trailing newline.

    Args:
        fname: path to a UTF-8 encoded corpus file.

    Returns:
        A tuple ``(alltxts, alllabs)`` of two lists of equal length: the texts
        with the leading tag removed, and the -1 / +1 labels.
    """
    # Compiled once instead of being looked up on every line.
    label_re = re.compile(r"<[0-9]*:[0-9]*:(.)>.*")
    text_re = re.compile(r"<[0-9]*:[0-9]*:.>(.*)")

    alltxts = []
    alllabs = []
    # The context manager closes the file even if a line fails to decode.
    with codecs.open(fname, 'r', 'utf-8') as s:  # force the encoding
        for txt in s:
            if len(txt) < 5:
                break
            lab = label_re.sub("\\1", txt)
            alllabs.append(-1 if lab.count('M') > 0 else 1)
            alltxts.append(text_re.sub("\\1", txt))
    return alltxts,alllabs

def load_pres_T(fname):
    """Load the unlabelled test corpus: one ``<id:id>text`` record per line.

    Reading stops at the first line shorter than 5 characters (this includes
    end of file). The returned texts keep their trailing newline.

    Args:
        fname: path to a UTF-8 encoded corpus file.

    Returns:
        The list of texts with the leading ``<id:id>`` tag removed.
    """
    # Compiled once instead of being looked up on every line.
    tag_re = re.compile(r"<[0-9]*:[0-9]*>(.*)")

    alltxts_T = []
    # The context manager closes the file even if a line fails to decode.
    with codecs.open(fname, 'r', 'utf-8') as s:  # force the encoding
        for txt in s:
            if len(txt) < 5:
                break
            alltxts_T.append(tag_re.sub("\\1", txt))
    return alltxts_T

def crossval(alltxts,alllabs,cv = 5,shuffle = False, seed = 42):
    """Split a corpus into ``cv`` cross-validation folds.

    Sample indices are cut into ``cv`` chunks with ``np.array_split`` (chunk
    sizes may differ by one). Each chunk is used once as the test part, the
    remaining samples forming the training part.

    Args:
        alltxts: array-like of samples.
        alllabs: array-like of labels, same length as ``alltxts``.
        cv: number of folds.
        shuffle: when True, sample indices are shuffled before splitting;
            when False, folds are contiguous slices in the original order.
        seed: seed of the shuffle.

    Returns:
        A list of ``cv`` tuples ``(X_test, X_train, Y_test, Y_train)``.
    """
    alltxts = np.asarray(alltxts)
    alllabs = np.asarray(alllabs)

    ind = np.arange(alllabs.size)
    if shuffle:
        # A private RandomState yields the same permutation as seeding the
        # global generator, without overwriting the caller's global RNG state.
        np.random.RandomState(seed).shuffle(ind)

    return [
        (alltxts[fold], np.delete(alltxts, fold), alllabs[fold], np.delete(alllabs, fold))
        for fold in np.array_split(ind, cv)
    ]

def gaussianKernel(n):
    """Return an unnormalised, symmetric bell-shaped kernel of length ``2*n + 1``.

    The centre weight is 1 and the weight at distance ``d`` from the centre is
    ``exp(-d**2 / (2*sig))`` with ``sig = (n + 1) / 3``. The weights are not
    rescaled to sum to 1, so the result is only proportional to a "true"
    Gaussian kernel.
    """
    sig = (n+1)/3
    # Weights for distances 1..n on one side of the centre.
    tail = [np.exp((-(dist**2))/(2*sig)) for dist in range(1, n+1)]
    # Mirror the tail around the centre weight.
    return np.array(tail[::-1] + [1.0] + tail)

def smoothing(y_hat,smooth,mode = "hard"):
    """Smooth a sequence of -1 / +1 predictions.

    Args:
        y_hat: sequence of predictions.
        smooth: list of smoothing parameters. In "FFT" mode only ``smooth[0]``
            is read, as a cut-off on the absolute ``np.fft.fftfreq`` value.
            In the other modes each entry ``k`` is a window half-width and one
            pass over the signal is made per entry, in order.
        mode: "FFT", "hard", "soft" or "Gaussian". Any other value leaves the
            signal untouched, so only the final sign is applied.

    Returns:
        An integer array of smoothed predictions.
    """
    if mode == "FFT":
        # Work in the 0 / 1 representation.
        bits = (np.array(y_hat)+1)//2
        spectrum = np.fft.fft(bits)
        freqs = np.fft.fftfreq(bits.size)

        # Low-pass: drop every component above the cut-off frequency.
        spectrum[np.abs(freqs) > smooth[0]] = 0
        rebuilt = np.rint(np.abs(np.fft.ifft(spectrum))).astype(int)

        # Guard against reconstruction errors that overshoot 1.
        rebuilt[rebuilt > 1] = 1
        return 2*rebuilt - 1  # back to the -1 / +1 representation

    signal = np.array(y_hat).astype('float64')
    for half in smooth:
        ker = gaussianKernel(half) if mode == "Gaussian" else None
        # The first and last `half` samples are never modified. Updates are
        # made in place, so a window sees the values already smoothed to its
        # left during the same pass.
        for pos in range(half, signal.size-half):
            window = signal[pos-half:pos+half+1]
            if mode == "hard":
                # Sign of the sum, equivalent to the sign of the mean.
                signal[pos] = np.sign(np.sum(window))
            elif mode == "soft":
                # Keep the mean itself; the sign is only taken at the end.
                signal[pos] = np.mean(window)
            elif mode == "Gaussian":
                signal[pos] = np.sign(np.sum((window*ker)))
    return np.sign(signal).astype('int32')



In [2]:
def score_pred_param(df,mf,b,smooth,smoothmode = "hard", fname = "corpus.tache1.learn.utf8",cv_fold = 5,vectorizer = CountVectorizer, classifier = lin.LogisticRegression):
    """Cross-validate one hyper-parameter setting and return per-fold F1 scores.

    For every fold, a vectorizer and a classifier are fitted on the training
    part, the test part is labelled -1 when the probability in column 0 of
    ``predict_proba`` exceeds ``b`` (and +1 otherwise), and the F1 score is
    computed before and after smoothing the predictions.

    Args:
        df: ``max_df`` passed to the vectorizer.
        mf: ``max_features`` passed to the vectorizer.
        b: decision threshold applied to column 0 of ``predict_proba``.
        smooth: smoothing parameters forwarded to ``smoothing``.
        smoothmode: smoothing mode forwarded to ``smoothing``.
        fname: path of the labelled corpus read with ``load_pres``.
        cv_fold: number of cross-validation folds (not shuffled).
        vectorizer: vectorizer class, instantiated once per fold.
        classifier: classifier class, instantiated with ``max_iter=300``; it
            must provide ``predict_proba``.

    Returns:
        A tuple ``(liste_score, liste_score_untouched)`` of two arrays holding,
        for each fold, the F1 score after and before smoothing.
    """
    # extraction
    alltxts,alllabs = load_pres(fname)
    alltxts = np.array(alltxts)
    alllabs = np.array(alllabs)

    liste_score = []
    liste_score_untouched = []
    for X_Test, X_Train, Y_Test, Y_Train in crossval(alltxts,alllabs,cv = cv_fold):
        # The vocabulary is learnt on the training part only.
        vectorizer2 = vectorizer(analyzer='word',max_df = df, max_features = mf, lowercase = False)
        X_VTr = vectorizer2.fit_transform(X_Train)
        X_VTe = vectorizer2.transform(X_Test)

        clf = classifier(max_iter = 300)
        clf.fit(X_VTr, Y_Train)
        proba = clf.predict_proba(X_VTe)

        # Biased decision rule: column 0 is the first entry of clf.classes_,
        # which scikit-learn keeps sorted, so it is label -1 provided both
        # labels occur in the training part.
        Y_hat_b = np.where(proba[:, 0] > b, -1, 1)

        # Score before smoothing (shared helper instead of an inlined copy).
        liste_score_untouched.append(f1(conf(Y_Test, Y_hat_b)))

        # Score after smoothing.
        Y_hat_b = smoothing(Y_hat_b,smooth, mode = smoothmode)
        liste_score.append(f1(conf(Y_Test, Y_hat_b)))

    return np.array(liste_score), np.array(liste_score_untouched)

In [3]:
#Modified KZF: default "hard" smoothing, one pass per window half-width 1..8 then 2
#(KZF presumably stands for Kolmogorov-Zurbenko filter -- TODO confirm)
df,mf,b,smooth = (0.11, 10000, 0.144, [1, 2, 3, 4, 5, 6, 7, 8, 2])
l1, l2 = score_pred_param(df,mf,b,smooth)
print(l1.mean()) #mean over the folds of the F1 score after smoothing

0.8067196267531151


In [4]:
#Scores obtained with the modified KZF (parameters, then mean F1 score)
#(0.18, 10000, 0.134, [1, 2, 3, 4, 5, 6, 7, 8, 2])
#0.8034365297795751

#(0.11, 10000, 0.144, [1, 2, 3, 4, 5, 6, 7, 8, 2])
#0.8067196267531151

In [5]:
#KZF -> use smoothmode = "soft": repeated passes, each with window half-width 1
df,mf,b,smooth = (0.11, 10000, 0.144, [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1])
l1, l2 = score_pred_param(df,mf,b,smooth,smoothmode = "soft")
print(l1.mean()) #mean over the folds of the F1 score after smoothing

0.7969941427268449


In [6]:
#Gaussian: a single pass with window half-width 14 (kernel of 2*14+1 = 29 weights)
df,mf,b,smooth = (0.18, 10000, 0.134, [14])
l1, l2 = score_pred_param(df,mf,b,smooth,smoothmode = "Gaussian")
print(l1.mean()) #mean over the folds of the F1 score after smoothing

0.7650996393937654


In [7]:
#FFT: low-pass filter, smooth[0] = 0.035 is the cut-off on the absolute fftfreq value
df,mf,b,smooth = (0.18, 10000, 0.134, [0.035])
l1, l2 = score_pred_param(df,mf,b,smooth,smoothmode = "FFT")
print(l1.mean()) #mean over the folds of the F1 score after smoothing

0.7801624462719565


In [8]:
def prediction(df,mf,b,smooth,smoothmode = "hard", fnameL = "corpus.tache1.learn.utf8",fnameT = "corpus.tache1.test.utf8",fnameP = "pred_presidents.txt",vectorizer = CountVectorizer, classifier = lin.LogisticRegression):
    """Train on the whole labelled corpus and write test predictions to a file.

    The test texts are labelled -1 when the probability in column 0 of
    ``predict_proba`` exceeds ``b`` (and +1 otherwise); the predictions are
    then smoothed and written one per line to ``fnameP``.

    Args:
        df: ``max_df`` passed to the vectorizer.
        mf: ``max_features`` passed to the vectorizer.
        b: decision threshold applied to column 0 of ``predict_proba``.
        smooth: smoothing parameters forwarded to ``smoothing``.
        smoothmode: smoothing mode forwarded to ``smoothing``.
        fnameL: path of the labelled corpus read with ``load_pres``.
        fnameT: path of the test corpus read with ``load_pres_T``.
        fnameP: path of the output file (overwritten).
        vectorizer: vectorizer class.
        classifier: classifier class, instantiated with ``max_iter=300``; it
            must provide ``predict_proba``.

    Returns:
        None.
    """
    alltxts_T = np.array(load_pres_T(fnameT))

    alltxts,alllabs = load_pres(fnameL)
    alltxts = np.array(alltxts)
    alllabs = np.array(alllabs)

    # The vocabulary is learnt on the labelled corpus only.
    vectorizer2 = vectorizer(analyzer='word',max_df = df, max_features = mf, lowercase = False)
    X_VTr = vectorizer2.fit_transform(alltxts)
    X_VTe = vectorizer2.transform(alltxts_T)

    clf = classifier(max_iter = 300)
    clf.fit(X_VTr, alllabs)
    proba = clf.predict_proba(X_VTe)

    # Biased decision rule: column 0 is the first entry of clf.classes_,
    # which scikit-learn keeps sorted, so it is label -1 provided both
    # labels occur in the labelled corpus.
    Y_hat_b = np.where(proba[:, 0] > b, -1, 1)

    Y_hat_b = smoothing(Y_hat_b,smooth, mode = smoothmode)

    # The context manager closes (and flushes) the file even if a write fails.
    with open(fnameP, "w") as f:
        f.writelines(str(label)+"\n" for label in Y_hat_b)

In [9]:
#Final run: train on the whole learning corpus and write the smoothed predictions
#for the test corpus to the default output file "pred_presidents.txt"
df,mf,b,smooth = (0.19,13250,0.145,[1,2,3,4,5,6,7,8,2])
prediction(df,mf,b,smooth)