In [3]:
import torch
from torch.autograd import Variable
import random as rd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

###### Dataset Trump Tweets

On utilise l'ensemble des tweets postés par Donald Trump.

In [4]:
data = pd.read_csv("trump.csv", dtype=str, delimiter=",", usecols=[2])
data.sample(5)

Unnamed: 0,Tweet_Text
3448,"Time to get out &amp; caucus! @IvankaTrump, @D..."
1426,"""@RoxaneTancredi: Democrats are coming to TRU..."
6326,"""@BOB_EWASHINGTON: @realDonaldTrump @club4grow..."
2342,"""@Ollie_621: @FoxNews @realDonaldTrump I think..."
566,"About to begin a rally here in Henderson, Neva..."


On utilise les fonctions ord et chr pour facilement convertir les caractères en valeurs numériques, et on crée les ensembles d'apprentissage et de test.

In [5]:
BATCH_SIZE = 1
#TWEET_SIZE = 280
VOCAB_SIZE = 128
#FINAL_CHAR = 160 # espace insécable
FINAL_CHAR = VOCAB_SIZE
 
BLANK_OHOT = torch.FloatTensor(BATCH_SIZE, VOCAB_SIZE+1).zero_()
 
def encode(row):
   
    # Conversion en int, et suppression des cractères spéciaux (emojis, bugs, etc)
    l = [ord(c) for c in row[0] if ord(c) < VOCAB_SIZE] #[:TWEET_SIZE-1] et limitation en taille (aberrations csv)
   
    # Normalisation longueur tweets
    '''
    for i in range(len(l), TWEET_SIZE-1):
        l.append(FINAL_CHAR)
    l.append(FINAL_CHAR)
    '''    
   
    # Ajout caractère de fin (espace insécable)
    l.append(FINAL_CHAR)
   
    return l


def decode(row):
    return ''.join([chr(int(c)) for c in row if int(c) != FINAL_CHAR])
 

def toohot(value):
    blank = torch.FloatTensor(len(value), VOCAB_SIZE+1).zero_()
    return blank.scatter_(1, value.view(-1, 1), 1)
 

def tofint(onehot):
    return onehot.max(-1)[1][0]


#data_e = data.apply(encode, axis=1)
#train = Variable(torch.Tensor(data_e[0:16000].as_matrix()))  # .view(-1, BATCH_SIZE, TWEET_SIZE)
#test  = Variable(torch.Tensor(data_e[16000:20800].as_matrix()))  # .view(-1, BATCH_SIZE, TWEET_SIZE)
 
dataset = [c for r in data.apply(encode, axis=1) for c in r] # Conversion et applatissement
#dataset = dataset[:-(len(dataset)%BATCH_SIZE)]               # Coupe taille divisible en batchs entiers
train   = torch.LongTensor(dataset[:700000])
test    = torch.LongTensor(dataset[700000:])

In [6]:
def sample(raw=train, length=50, index=None):
    if index is None:
        index = rd.randint(0, len(train))
    return raw[index:index+length]#, raw[index+1:index+length+1]

def tweet(raw=data):
    return data.sample().iloc[0,0]

In [7]:
decode(sample(train, 200))

'im or hate him, @realDonaldTrump is looking virtually unstoppable for Republican nomination.\nMore people attack him_RT @ABCPolitics: JUST IN: Donald Trump hits 41% in new Monmouth national poll, his '

In [8]:
data.sample().iloc[0,0]

'Congratulations to my brother Robert &amp; Ann Marie on the success of @MontesKitchen in Dutchess County, New York (Amenia.) Great food!'

In [9]:
class Recurent(nn.Module):
    def __init__(self, tailleZ, tailleVoc, actF = nn.Sigmoid()):
        '''X et Y vecteur onehot de taille tailleVoc
        Z vecteur de stockage de taille tailleZ'''
        super(Recurent, self).__init__()
        self.XToY = nn.Linear(tailleVoc, tailleVoc)
        self.XToZ = nn.Linear(tailleVoc, tailleZ)
        self.ZToY = nn.Linear(tailleZ, tailleVoc)
        self.ZToZ = nn.Linear(tailleZ, tailleZ)
        self.actF = actF

    def forward(self, x, z):
        '''return Y, Z'''
        #print("x", x, "z", z)
        return self.actF(self.XToY(x) + self.ZToY(z)), self.actF(self.XToZ(x) + self.ZToZ(z))

In [10]:
class RecurentGated(nn.Module):
    def __init__(self, tailleZ, tailleVoc, actF = nn.Sigmoid()):
        '''X et Y vecteur onehot de taille tailleVoc
        Z vecteur de stockage de taille tailleZ'''
        super(Recurent, self).__init__()
        self.XToY = nn.Linear(tailleVoc, tailleVoc)
        self.XToZ = nn.Linear(tailleVoc, tailleZ)
        self.ZToY = nn.Linear(tailleZ, tailleVoc)
        self.ZToZ = nn.Linear(tailleZ, tailleZ)
        self.gateZ = nn.Linear(tailleVoc, tailleZ)
        self.gateY = nn.Linear(tailleVoc, tailleVoc)
        self.actF = actF

    def forward(self, x, z):
        '''return Z, Y'''
        soft = torch.nn.Softmax(-1)
        return soft(self.XtoZ(x)) * self.actF(self.gateZ(x)) + self.actF(self.ZtoZ(z)) * (1-self.actF(self.gateZ(x))),\
    self.actF(self.XtoY(x)) * self.actF(self.gateY(x)) + self.actF(self.ZtoY(z)) + (1 - self.actF(self.gateY(x)))

In [48]:
class RNNMono(object):
    def __init__(self, recur, stop):
        super(RNNMono, self).__init__()
        self.recur = recur
        self.stop = stop
        self.losses = []
    
    def predict(self, boot, z):
        for i in boot:
            z, x = self.recur(i,z)
        r = []
        r.append(x)
        while (x.max(-1)[1].data != self.stop.max(-1)[1]).all() :
            z, x = self.recur(x,z)
            #print(voc.inverse[x.max(-1)[1].data[-1]])
            r.append(x)
        return r, z
    
    def train(self, inputs, z):
        '''
            :param inputs: Input sequence
        '''
        loss = nn.MSELoss()
        sgd  = optim.SGD(self.recur.parameters(), lr=1e-3)
        
        i = Variable(BLANK_OHOT)
        r, z = self.recur(i, z)
        
        for i in inputs:
            i = i.view(1, 129)
            loss = loss.forward(i, r)
            loss.backward()
            self.losses.append(loss)
            sgd.step()
            
            r, z = self.recur(i, z)
        
        return z

In [49]:
zLen = 1000
recur = Recurent(zLen, VOCAB_SIZE+1)
m = RNNMono(recur, toohot(torch.LongTensor([FINAL_CHAR])))
TRAIN_ITER = 500

lossHisto = []
scoreHisto = []

for i in range(TRAIN_ITER):
    
    if i%(TRAIN_ITER/10) == 0:
        print("Iteration", i)
    
    x = Variable(toohot(torch.LongTensor(encode(tweet()))))
    z = Variable(torch.zeros(zLen).type(torch.FloatTensor))
    
    f,_ = m.train(x, z)
    print(f, y)
    loss = ml.forward(f, y).backward()
    opt.step()
    
    lossHisto.append(loss.data.mean())
    ypred = torch.max(f, 1)[1]
    scoreHisto.append(torch.eq(ypred.data, yN).float().mean())
    
plt.plot(lossHisto)
plt.ylabel('loss')
plt.xlabel('nb batch traited')
plt.show()
plt.plot(scoreHisto)
plt.ylabel('score')
plt.xlabel('nb batch traited')
plt.show()

Iteration 0


AssertionError: nn criterions don't compute the gradient w.r.t. targets - please mark these variables as volatile or not requiring gradients

In [10]:
def sequencer(x, end):
    r = []
    for i in x :
        r.append(i)
        if i == end :
            yield r
            r = []

In [11]:
for i in sequencer(ten[:1000], 35):
    print(code2char(i, voc))

dave aneckstein4 simmons research4 an ezperian company they have not been charged or formally arrested5 iran isnt making an atomic bomb4 not at all4 chave.
 said monday5 the japanesemade tin robots have blocky heads and moveable arms and legs5 if they could no longer be the nominees4 then they would be pundits of the first order men with credibility on oval office matters by dint of once sitting in the chair themselves5 free challenge kits have a cd and brochure from dr5 ian4 menu and fitness advice and a pedometer to count steps5 the world motor sport council received statements from fernando alonso4 lewis hamilton and pedro de la rosa stating categorically no ferrari information had been used by mclaren4 and that no confidential data had been passed to the team5 the prime minister said the first citi.


In [None]:
torch.manual_seed(1)
zLen = 1000
vocLen = 36
recur = Recurent(zLen, vocLen)

endN = torch.LongTensor([vocLen-1])
end_onehot = torch.FloatTensor(*endN.size(), vocLen).zero_()
end_onehot.scatter_(1, torch.unsqueeze(endN, 1), 1.)

m = RNNMono(recur, end_onehot)
ml = nn.MSELoss()
ite = 500
opt = optim.SGD(m.parameters(), lr=1e-3)

lossHisto = []
scoreHisto = []

for i,seq in enumerate(sequencer(ten, 35)):
    if i > ite :
        break
    if i%(ite/10) == 0:
        print("Iteration", i)
    
    xN = torch.LongTensor(seq[:30])
    x_onehot = torch.FloatTensor(*xN.size(), vocLen).zero_()
    x_onehot.scatter_(1, torch.unsqueeze(xN, 1), 1.)
    x = autograd.Variable(x_onehot.type(torch.FloatTensor))
    
    yN = torch.LongTensor(seq[30:])
    y_onehot = torch.FloatTensor(*yN.size(), vocLen).zero_()
    y_onehot.scatter_(1, torch.unsqueeze(yN, 1), 1.)
    y = autograd.Variable(y_onehot.type(torch.FloatTensor))
    
    z = autograd.Variable(torch.zeros(zLen).type(torch.FloatTensor))
    
    f,_ = m.forward(x, z)
    print(f, y)
    loss = ml.forward(f, y)
    loss.backward()
    opt.step()
    
    lossHisto.append(loss.data.mean())
    ypred = torch.max(f, 1)[1]
    scoreHisto.append(torch.eq(ypred.data, yN).float().mean())
    
plt.plot(lossHisto)
plt.ylabel('loss')
plt.xlabel('nb batch traited')
plt.show()
plt.plot(scoreHisto)
plt.ylabel('score')
plt.xlabel('nb batch traited')
plt.show()