On utilise une approche encore plus forte d'apprentissage profond basée sur l'utilisation de réseaux de neurones récurrents. On suppose disposer d'un réseau déjà entrainé.

In [1]:
import matplotlib
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
from torchtext.legacy.data import Field, TabularDataset, BucketIterator

import numpy as np
import pandas as pd
import os
import spacy
import string
import nltk
from nltk.corpus import stopwords
import re

In [2]:
def cleanup_text(texts):
    cleaned_text = []
    for text in texts:
      # remove multiple spaces
        
        # remove newline
        text = re.sub(r'\n', ' ', text)

        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
        text = re.sub(r'#', '', text)
        text = re.sub(' +', ' ', text)

        text = re.sub(r'\d+', '', text)
        text = "".join([char.lower() for char in text if char not in string.punctuation]) 
        
        text = re.sub('\s+', ' ', text).strip()
        
        if text != '' and not text in stopwords.words("english"):
          cleaned_text.append(text)
    return cleaned_text

TEXT = Field(sequential=True, lower=True, include_lengths=False,
             batch_first = True, tokenize = str.split, preprocessing = cleanup_text)

SELECTED_TEXT = Field(sequential=True, batch_first = True, use_vocab = True, lower=True, 
                      tokenize = str.split, preprocessing = cleanup_text)


ID = Field(sequential=False, use_vocab = False, batch_first = True)

SENTIMENT = Field(sequential=False, use_vocab = True)


In [3]:
train_dataset = TabularDataset(
    format="csv", path= os.getcwd() + r"/donnees/train.csv",
    fields=[(None, None), ('text', TEXT), ('selected_text', SELECTED_TEXT), ('sentiment', SENTIMENT)]
)

In [4]:
TEXT.build_vocab(train_dataset, min_freq = 1, vectors = "glove.6B.100d")
SELECTED_TEXT.vocab = TEXT.vocab
SENTIMENT.build_vocab(train_dataset, min_freq = 1, vectors = "glove.6B.100d")

In [5]:
class LSTMModele(nn.Module):
  def __init__(self, emb_dim = 100):
    super(LSTMModele, self).__init__()
    self.embeddings = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze = False)
    self.lstm = nn.LSTM(input_size = emb_dim, hidden_size = emb_dim, batch_first = True, num_layers=1)
    self.fc1 = nn.Linear(emb_dim, 1)
    #self.act = nn.ReLU()
    #self.fc2 = nn.Linear(50, 1)

  def forward(self, inputs):
    embeds = self.embeddings(inputs)
    x, (h_n, c_n) = self.lstm(embeds)
    x = self.fc1(x)
    #x = self.act(x)
    #x = self.fc2(x)
    #print(x.shape)
    return torch.squeeze(x)

In [27]:
device = 'cpu'

In [7]:
model = LSTMModele()
model.load_state_dict(torch.load("Model-emb100-fc100_1", map_location=torch.device('cpu')))
model.eval()

LSTMModele(
  (embeddings): Embedding(26685, 100)
  (lstm): LSTM(100, 100, batch_first=True)
  (fc1): Linear(in_features=100, out_features=1, bias=True)
)

In [103]:
def extract_word(sentence, net, device):
    conv = []
    sentence = str.split(sentence)
    sentence = cleanup_text(sentence)
    #print(sentence)
    for word in sentence:
        conv.append(TEXT.vocab.stoi[word])

    sentence_tensor = torch.tensor([conv]).to(device)
    res = net(sentence_tensor)
    sentence_tensor = torch.squeeze(sentence_tensor)
    compteur = 0
    for i in iter(res):
        if round(float(i)) == 1:
            #print("Extraction :  ", sentence[compteur])
            yield sentence[compteur]
        compteur = compteur+1
        
def extraction_list(sentence):
    if sentence == "":
        return []
    return [word for word in extract_word(sentence, model, 'cpu')

In [155]:
for k in extract_word("awesome", model, 'cpu'):
    print(k)

TypeError: iteration over a 0-d tensor

In [101]:
def jaccard(str1, str2): 
    try:
        a = set(str1.lower().split())
        b = set(str2.lower().split())
        c = a.intersection(b)
        return float(len(c)) / (len(a) + len(b) - len(c))
    except:
        print(str1, str2)

def merge(L,i=0,j=None):
    """Creates sentences from lists of words."""
    if L == []:
        return ""
    if j is None:
        j = len(L)
    acc = L[i]
    for k in range(i+1,j):
        acc = acc + " " + L[k]
    return acc

def extract_closest_substring(str1, str2, score_fonc = jaccard ):
    """Extracts from str1 closest substring to str2 with respect to Jaccard distance."""
    """ /!\ Not optimal /!\ """
    split1 = str1.split(" ")
    N = len(split1)
    maxa = 0
    stra = str1
    for i in range(N):
        for j in range(i, N+1):
            acc = merge(split1,i,j)
            jacc = score_fonc(acc, str2)
            if acc != "" and jacc >= maxa:
                maxa = jacc
                stra = acc
    return stra

str1 = "Dans le concours, la métrique à utiliser est le score de Jaccard"
str2 = "La métrique est l'étude des objets métriques"

print(jaccard(str1,str2))
extract_closest_substring(str2, str1)

0.2


'La métrique est'

In [106]:
def row_fuse_and_extr(row):
    try:
        return extract_closest_substring(row.text, merge(extraction_list(row.text)))
    except :
        return row.text

class SmartClassifier():
    def __init__(self):
        return
    
    def sign_run(self,test_frame,sign):
        """Does a branch run"""
        if sign == "neutral":
            #neutral tweets return tweet itself
            aux = test_frame[test_frame.sentiment == sign]
            aux["selected_text"] = aux.text
            return aux.loc[:,["textID", "text", "selected_text", "sentiment"]]
        
        aux = test_frame
        aux["selected_text"] = aux.apply(row_fuse_and_extr, axis = 1)
                
        return aux.loc[:,["textID","text","selected_text","sentiment"]]
    
    def __call__(self, test_frame):
        """Runs the actual calculation"""
        acc_pos = self.sign_run(test_frame, "positive")
        acc_neg = self.sign_run(test_frame, "negative")
        acc_neu = self.sign_run(test_frame, "neutral")
        
        return pd.concat([acc_pos, acc_neg, acc_neu])
    

In [None]:
#Examples
R = SmartClassifier()
KK = R(test_set)
KK = KK.sort_index()
KK

In [111]:
p = 100
KK[KK.sentiment == "positive"].iloc[p:p+50,:]

Unnamed: 0,textID,text,selected_text,sentiment
161,c7240790da,Aren`t you suppossed to support the local eco...,good,positive
161,c7240790da,Aren`t you suppossed to support the local eco...,good,positive
162,08ede23635,good to know thanks,good to know,positive
162,08ede23635,good to know thanks,good to know,positive
166,e34af4fd97,"E.L.O. wow, brings back so many happy memori...",brings back so many happy,positive
166,e34af4fd97,"E.L.O. wow, brings back so many happy memori...",brings back so many happy,positive
167,0e8aa10a4e,that`s a very cute picture ... but you don`t ...,cute,positive
167,0e8aa10a4e,that`s a very cute picture ... but you don`t ...,cute,positive
168,29148dc95c,oh i love sunday mornings like this - mum jus...,love,positive
168,29148dc95c,oh i love sunday mornings like this - mum jus...,love,positive


In [123]:
KK.loc[244,:].text.iloc[0]

'MY SiSTER iN LAW JUST LEFT FOR HER PROM!! SHE LOOKED SOO PRETTY.. TEARS ALL AROUND'

In [135]:
# Load the dataset
train_set = pd.read_csv(r"train.csv")
test_set = pd.read_csv(r"test.csv")

#print(train_set.shape)    => (27481, 4)  (textID, text, selected_text, sentiment )
#print(test_set.shape)     => (3534, 3)   (textID, text, sentiment)

train_data_text = train_set["text"]
train_data_sentiment = train_set["sentiment"]

N_train = train_data_text.shape[0]

train_data = [(train_data_text[k],train_data_sentiment[k]) for k in range(N_train)]
train_labels = train_set["selected_text"]


# check data makes sense

print("Exemple 1 :",train_data[1], "extrait choisi :", train_labels[1])
print("Exemple 2 :",train_data[59], "extrait choisi :", train_labels[59])
print("Exemple 3 :",train_data[750], "extrait choisi :", train_labels[750])

Exemple 1 : (' Sooo SAD I will miss you here in San Diego!!!', 'negative') extrait choisi : Sooo SAD
Exemple 2 : (' what fun are you speaking of?', 'neutral') extrait choisi : what fun are you speaking of?
Exemple 3 : (' Thanks! Welcome back!', 'positive') extrait choisi : Thanks! Welcome back!


In [150]:
aux2 = aux[aux.ouss == 0].loc[:, ["sentiment","text","ouss"]]

In [152]:
p = 0
aux2.iloc[p:p+50,:]

Unnamed: 0,sentiment,text,ouss
4,positive,http://twitpic.com/4w75p - I like it!!,0
16,negative,Miss you,0
17,negative,Cramps . . .,0
49,negative,Not happy,0
92,neutral,"Yes, I am",0
106,neutral,yeh me 2,0
130,positive,Welcome!,0
135,negative,Now I have a sunburn,0
170,neutral,is ONLiNE http://plurk.com/p/stjdg,0
194,positive,woot!,0
