In [1]:
import pandas as pd
import numpy as np

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torch

import pickle

import time

from gensim.models import Word2Vec


In [2]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device available for running: ")
print(device)

Device available for running: 
cuda


In [3]:
# #Load word2vec of train
# train_word2vec_filename = 'word2vec/' + 'train_review_word2vec.csv'
# train_word2vec_df = pd.read_csv(train_word2vec_filename)
# train_word2vec_df

In [19]:
from sklearn.model_selection import train_test_split

# Read the stemmed training reviews (semicolon-separated, first column is the index).
train = pd.read_csv("train_clean_stem.csv", index_col=0, sep=';')

# Longest `commentaire_stemmed` entry, measured on the whole file *before* the
# subset below is taken.
# NOTE(review): values read back from a CSV are presumably strings, in which
# case len() counts characters rather than tokens -- confirm.
max_sen_len = train['commentaire_stemmed'].map(len).max()

# Keep only the first 10000 rows.
train = train[:10000]
# train, X_test = train_test_split(train, train_size=0.33, random_state=42)

In [20]:
# Number of training rows kept after the subset.
train.shape[0]

10000

In [65]:
# Word2Vec hyper-parameters; each one is forwarded under the same keyword name
# to the Word2Vec constructor by make_word2vec_model.
sg = 1
size = 200        # embedding dimensionality, also part of the saved model's file name
window = 3
min_count = 1
workers = 3

# Function to train word2vec model
def make_word2vec_model(train, padding=True, sg=1, min_count=1, size=500, workers=3, window=3):
    """Train a Word2Vec model on ``train['commentaire_stemmed']`` and save it.

    Args:
        train: table-like object exposing a ``commentaire_stemmed`` column.
        padding: when true, extra ``['pad']`` sentences are added to the corpus
            and the model is saved under the ``_PAD`` file name.
        sg, min_count, size, workers, window: forwarded unchanged to ``Word2Vec``.

    Returns:
        Tuple ``(model, path)`` with the trained model and the file it was saved to.
    """
    if padding:
        print(len(train))
        corpus = list(pd.Series(train['commentaire_stemmed']).values)
        # One ['pad'] sentence per unit of min_count -- presumably so that the
        # 'pad' token is not dropped by the min_count threshold.
        corpus.extend(['pad'] for _ in range(min_count))
        suffix = '_PAD.model'
    else:
        corpus = train['commentaire_stemmed']
        suffix = '.model'
    word2vec_file = 'model/' + 'word2vec_' + str(size) + suffix

    w2v_model = Word2Vec(
        corpus,
        min_count=min_count,
        size=size,
        workers=workers,
        window=window,
        sg=sg,
    )
    w2v_model.save(word2vec_file)
    return w2v_model, word2vec_file

# Train Word2vec model (padded variant) with the hyper-parameters configured above
w2vmodel, word2vec_file = make_word2vec_model(
    train,
    padding=True,
    size=size,
    window=window,
    min_count=min_count,
    workers=workers,
    sg=sg,
)

10000


In [66]:
# Rebuild the path of the padded word2vec model from the configured `size`
# and reload the model from disk.
word2vec_file = 'model/' + 'word2vec_'+str(size)+'_PAD.model'
w2vmodel = Word2Vec.load(word2vec_file)

In [67]:
# Vocabulary index of the 'pad' token; used to fill unused positions.
padding_idx = w2vmodel.wv.vocab['pad'].index
def make_word2vec_vector_cnn(sentence):
    """Encode a sentence as a fixed-length row of vocabulary indices.

    The result always has ``max_sen_len`` positions. Positions past the end of
    ``sentence`` hold ``padding_idx``. Items of ``sentence`` beyond
    ``max_sen_len`` are dropped, so an input longer than the padded length no
    longer raises ``IndexError``.

    Args:
        sentence: iterable of tokens to look up in the word2vec vocabulary.

    Returns:
        ``torch.long`` tensor of shape ``(1, max_sen_len)`` on ``device``.
    """
    vocab = w2vmodel.wv.vocab
    padded_X = [padding_idx for _ in range(max_sen_len)]
    # zip() stops at the shorter of the two, which truncates over-long input.
    for i, word in zip(range(max_sen_len), sentence):
        if word in vocab:
            padded_X[i] = vocab[word].index
        else:
            # Out-of-vocabulary tokens are mapped to index 0.
            # NOTE(review): index 0 is itself a regular vocabulary entry;
            # consider a dedicated unknown token -- confirm intent.
            padded_X[i] = 0
    return torch.tensor(padded_X, dtype=torch.long, device=device).view(1, -1)

In [68]:
# Function to get the output tensor
def make_target(label):
    """Wrap a single class label in a one-element ``torch.long`` tensor on ``device``."""
    target = torch.tensor([label], device=device, dtype=torch.long)
    return target

In [69]:
# window_sizes=(1,2,3,5)

In [70]:
import gensim

# Number of convolution filters created for each window size.
NUM_FILTERS = 10
# Width of the convolution kernels; follows the word2vec `size` setting.
EMBEDDING_SIZE = size

class CnnTextClassifier(nn.Module):
    """Convolutional text classifier on top of pretrained word2vec embeddings.

    One ``Conv2d`` layer is built per entry of ``window_sizes``. Each feature
    map is max-pooled over the sequence axis, the pooled features are
    concatenated and passed through a linear layer followed by a softmax.
    """

    def __init__(self, vocab_size, num_classes, window_sizes=(1,2)):
        """Build the layers.

        Args:
            vocab_size: only referenced by the commented-out non-pretrained
                embedding variant; unused otherwise.
            num_classes: number of output classes of the final linear layer.
            window_sizes: heights of the convolution kernels.
        """
        super().__init__()
        # Reload the saved word2vec model from the module-level `word2vec_file`.
        w2vmodel = gensim.models.KeyedVectors.load(word2vec_file)
        keyed_vectors = w2vmodel.wv
        pad_index = keyed_vectors.vocab['pad'].index
        # With pretrained embeddings
        # NOTE(review): no `freeze=` argument is passed, so the library default
        # applies -- confirm whether the embeddings are meant to be fine-tuned.
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(keyed_vectors.vectors),
            padding_idx=pad_index,
        )
        # Without pretrained embeddings
        # self.embedding = nn.Embedding(vocab_size, EMBEDDING_SIZE)

        print(EMBEDDING_SIZE)
        conv_layers = []
        for window_size in window_sizes:
            # Kernel spans `window_size` tokens and the full embedding width;
            # the sequence axis is padded by window_size - 1 on both sides.
            conv_layers.append(
                nn.Conv2d(1, NUM_FILTERS, [window_size, EMBEDDING_SIZE], padding=(window_size - 1, 0))
            )
        self.convs = nn.ModuleList(conv_layers)

        self.fc = nn.Linear(NUM_FILTERS * len(window_sizes), num_classes)

    def forward(self, x):
        """Return class probabilities for a batch of index sequences.

        The output has already been through softmax, so it must not be fed
        directly to a loss that expects raw logits.
        """
        embedded = self.embedding(x)  # [B, T, E]

        # Add the single input channel expected by Conv2d: [B, 1, T, E]
        embedded = embedded.unsqueeze(1)

        # Apply a convolution + max_pool layer for each window size
        pooled = []
        for conv in self.convs:
            feature_map = torch.tanh(conv(embedded))
            feature_map = feature_map.squeeze(-1)
            # Pool over the whole sequence axis, leaving one value per filter.
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)))
        features = torch.cat(pooled, 2)

        # FC
        features = features.view(features.size(0), -1)
        logits = self.fc(features)

        return F.softmax(logits, dim=1)


In [71]:
def eval(pred,Y):
    """Score predicted class indices against string-encoded ratings.

    A prediction ``p`` is counted as correct when ``(p + 1) / 2`` equals the
    rating, where the rating is parsed from a string that may use a decimal
    comma (e.g. ``'4,5'``).

    Args:
        pred: sequence of predicted class indices.
        Y: ratings as strings, in the same order as ``pred``. It is read
            positionally, so a pandas Series with a non-default index stays
            aligned with the predictions.

    Returns:
        Tuple ``(accuracy, dratio)``. ``accuracy`` is the fraction of correct
        predictions (``0.0`` for empty input). ``dratio`` maps each rating seen
        to the fraction of its samples that were predicted correctly.
    """
    true = 0
    false = 0

    dtrue = {}
    dfalse = {}

    # Materialise once so that lookups below are positional, not label-based.
    labels = list(Y)

    for i in range(len(pred)):
        label = float(labels[i].replace(',', '.'))
        if (pred[i] + 1) / 2 == label:
            true += 1
            # The first hit counts as 1, not 0.
            dtrue[label] = dtrue.get(label, 0) + 1
        else:
            false += 1
            dfalse[label] = dfalse.get(label, 0) + 1
        if i % 20000 == 0:
            print('Eval',i)

    print(dtrue)
    print(dfalse)

    # Per-rating accuracy, keyed by rating. Ratings that were never predicted
    # correctly are included with a ratio of 0.0.
    dratio = {}
    for label in list(dtrue) + [k for k in dfalse if k not in dtrue]:
        hits = dtrue.get(label, 0)
        dratio[label] = hits / (hits + dfalse.get(label, 0))

    total = true + false
    accuracy = true / total if total else 0.0
    return accuracy, dratio
        

In [72]:
def pred(corp,cnn):
    """Predict a class for every row of ``corp`` and score the predictions.

    The model is switched to evaluation mode for the duration of the call and
    then put back into whatever mode it was in, so calling this between
    training epochs does not leave the model in eval mode.

    Args:
        corp: DataFrame with ``commentaire_stemmed`` and ``note`` columns.
        cnn: model called on the encoded sentence; its output is reduced with
            an arg-max over dimension 1.

    Returns:
        Whatever ``eval(predictions, corp['note'])`` returns.
    """
    start_time_predict = time.time()
    bow_cnn_predictions = []
    was_training = cnn.training
    cnn.eval()
    try:
        with torch.no_grad():
            for index, row in corp.iterrows():
                if index % 10000 == 0:
                    print('Prediction index',index)
                bow_vec = make_word2vec_vector_cnn(row['commentaire_stemmed'])
                probs = cnn(bow_vec)
                _, predicted = torch.max(probs.data, 1)
                bow_cnn_predictions.append(predicted.cpu().numpy()[0])
    finally:
        # Restore the caller's train/eval mode even if prediction fails.
        cnn.train(was_training)
    print("Time taken to predict: " + str(time.time() - start_time_predict))

    print(bow_cnn_predictions)

    return eval(bow_cnn_predictions,corp['note'])
    
    # loss_file_name = 'plots/' + 'cnn_class_big_loss_with_padding.csv'
    # loss_df = pd.read_csv(loss_file_name)
    # print(loss_df.columns)
    # plt_200_treduce_padding_3_epochs = loss_df[' loss'].plot()
    # fig = plt_200_treduce_padding_3_epochs.get_figure()
    # fig.savefig('plots/' + 'plt_200_treduce_padding_3_epochs.pdf')

In [73]:
# Read the stemmed dev reviews (same layout as the training file) and keep
# the first 10000 rows.
dev = pd.read_csv("dev_clean_stem.csv", index_col=0, sep=';')[:10000]
#dev, X_dev = train_test_split(dev, train_size=0.33, random_state=42)

In [74]:
# Number of dev rows kept after the subset.
dev.shape[0]

10000

In [None]:
NUM_CLASSES = 10
VOCAB_SIZE = len(w2vmodel.wv.vocab)

cnn_model = CnnTextClassifier(vocab_size=VOCAB_SIZE, num_classes=NUM_CLASSES)
cnn_model.to(device)
# CnnTextClassifier.forward already applies softmax and returns probabilities.
# nn.CrossEntropyLoss expects raw logits and would apply a second softmax, so
# the cross-entropy is computed as NLLLoss over log-probabilities instead.
loss_function = nn.NLLLoss()
# loss_function = nn.MSELoss()
optimizer = optim.Adam(cnn_model.parameters(), lr=0.001)
# optimizer = optim.SGD(cnn_model.parameters(), lr=0.001, momentum=0.9)
num_epochs = 30

# Open the file for writing loss
loss_file_name = 'plots/' + 'cnn_class_big_loss_with_padding.csv'
losses = []
best_dev_accuracy = 0
# The context manager closes the loss file even if training is interrupted.
with open(loss_file_name,'w') as f:
    f.write('iter, loss')
    f.write('\n')
    for epoch in range(num_epochs):
        print("Epoch" + str(epoch + 1))
        # pred() below evaluates the model; make sure every epoch starts in
        # training mode again.
        cnn_model.train()
        train_loss = 0
        start_time_epoch = time.time()
        start_time_iterow = time.time()
        for index, row in train.iterrows():

            if index % 1000 == 0:
                print("Time taken for iterow "+str(index)+" : " + str(time.time() - start_time_iterow))
                start_time_iterow = time.time()

            # Clearing the accumulated gradients
            cnn_model.zero_grad()

            # Encode the stemmed tokens as a padded row of vocabulary indices
            bow_vec = make_word2vec_vector_cnn(row['commentaire_stemmed'])

            # Forward pass to get output
            probs = cnn_model(bow_vec)

            # Get the target label: ratings in steps of 0.5 map to class
            # indices via rating * 2 - 1. Read it from the current row rather
            # than through a second label-based lookup on the frame.
            target = make_target(int((float(row['note'].replace(',','.'))*2)-1))

            # Calculate Loss: log of the softmax output --> negative log-likelihood.
            # The clamp keeps log() finite if a probability underflows to 0.
            loss = loss_function(torch.log(torch.clamp(probs, min=1e-12)), target)
            train_loss += loss.item()

            # Getting gradients w.r.t. parameters
            loss.backward()

            # Updating parameters
            optimizer.step()

        print("Time taken for epoch "+str(epoch)+" : " + str(time.time() - start_time_epoch))

        pred_dev, dratio = pred(dev,cnn_model)
        print('Prediction on dev',pred_dev)
        print(dratio)
        # Keep a separate checkpoint whenever the dev accuracy improves.
        if pred_dev > best_dev_accuracy:
            best_dev_accuracy = pred_dev
            torch.save(cnn_model, 'model/' + 'cnn_model_200_wp_'+str(epoch)+'.pth')

        # if index == 0:
        #     continue
        print("Epoch ran :"+ str(epoch+1))
        f.write(str((epoch+1)) + "," + str(train_loss / len(train)))
        f.write('\n')
        # Make the per-epoch loss visible on disk while training is still running.
        f.flush()
        train_loss = 0
        torch.save(cnn_model, 'model/' + 'cnn_model_200_wp.pth')


print("Input vector")
print(bow_vec.cpu().numpy())
print("Probs")
print(probs)
print(torch.argmax(probs, dim=1).cpu().numpy()[0])

200
Epoch1
Time taken for iterow 0 : 0.0009777545928955078
Time taken for iterow 1000 : 14.210352182388306
Time taken for iterow 2000 : 14.056207418441772
Time taken for iterow 3000 : 15.087676286697388
Time taken for iterow 4000 : 14.00814962387085
Time taken for iterow 5000 : 13.786942720413208
Time taken for iterow 6000 : 14.023356914520264
Time taken for iterow 7000 : 14.30992603302002
Time taken for iterow 8000 : 14.078436136245728


In [None]:
# #Save test prediction
# f = open("result/results_svc_word2vec.txt", "a")
# for i in range(len(test_predictions_word2vec)):
#     f.write(test.iloc[i]['review_id'] + " " + str(test_predictions_word2vec[i]) + "\n")
# f.close()

Pour 5000 données, avec size 200 :  
Équilibré, fenêtres 1,2, ?spe (50000 données) : 14%  
Non équilibré, fenêtres 1,2, 36spe : 19%  
Non équilibré (NE), fenêtres 3,5, spe : 14%  

size 200 : meilleur temps d'apprentissage ? (à confirmer)  
size 300, non équilibré, fenêtres 1,2, 40spe : 12%  
size 100, non équilibré, fenêtres 1,2, 60spe : 18%  
size 200, non équilibré, fenêtres 1,2,3, 120spe : 17%