In [1]:
import json
import gensim.models.word2vec as word2vec

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import re
import unicodedata
import numpy as np

In [2]:
def process_clean(filename, inputs, min_length=5, min_count=3):
    """Clean raw tweets, drop rare words and write a shuffled, labelled corpus.

    Every file ``inputs[i] + '.json'`` is read line by line; each line must be
    a JSON object with a ``'text'`` field. The text is lower-cased, stripped of
    accents (NFD decomposition, then non-ASCII characters are dropped), of
    retweet headers (``rt @user``), of http/https URLs and of the punctuation
    characters double-quote, colon, bang, question mark, dot, comma, pipe and
    parentheses. A space is inserted after each apostrophe, and newlines and
    slashes are turned into spaces.

    Tweets with fewer than ``min_length`` tokens are discarded. Tokens that
    occur fewer than ``min_count`` times over the kept tweets are then removed
    from every tweet. The records are shuffled with ``np.random.shuffle``
    (seed numpy beforehand for reproducibility) and written to
    ``filename + '.clean'``, one JSON object per line:
    ``{"data": <space-joined tokens>, "label": <index of the input file>}``.

    Args:
        filename: output path prefix; ``'.clean'`` is appended.
        inputs: list of input path prefixes; ``'.json'`` is appended to each.
            The position of a prefix in the list is used as the label.
        min_length: minimum number of tokens a cleaned tweet must have.
        min_count: minimum number of occurrences for a token to be kept.

    Returns:
        None. The result is written to disk.
    """
    # Raw-string patterns: same regexes as before, without invalid escapes.
    noise_pattern = re.compile(r'(rt @[a-z0-9_]*|[":!?.,|()]|https?://[a-z0-9./]*)')
    separator_pattern = re.compile(r'\n|/')

    train = []
    counts = dict()
    nb_short = 0

    for label, path in enumerate(inputs):
        with open(path + '.json', 'r') as f:
            for line in f:
                text = json.loads(line)['text']

                res = text.lower()
                # encode() yields bytes; decode back to text so that the regex
                # substitutions below work on Python 3 as well as Python 2.
                res = unicodedata.normalize('NFD', res).encode('ascii', 'ignore').decode('ascii')

                res = noise_pattern.sub("", res)
                res = res.replace("'", "' ")
                res = separator_pattern.sub(' ', res)
                res = res.strip()

                tokens = res.split()
                if len(tokens) < min_length:
                    nb_short += 1
                    continue

                # Word frequencies are only counted over the tweets we keep.
                for w in tokens:
                    counts[w] = counts.get(w, 0) + 1

                train.append({'data': res, 'label': label})

    logging.log(logging.INFO, "%d tweets supprimés car trop courts (<%d)" % (nb_short, min_length))

    nb_rare = sum(1 for w in counts if counts[w] < min_count)
    logging.log(logging.INFO, "%d mots enlevés du vocabulaire car trop rares (occurences < %d)" % (nb_rare, min_count))

    nb_rare_total = 0
    for record in train:
        tokens = record['data'].split()
        kept = [w for w in tokens if counts[w] >= min_count]
        nb_rare_total += len(tokens) - len(kept)
        # Re-joining also collapses any run of whitespace left by the cleaning.
        record['data'] = " ".join(kept)
    logging.log(logging.INFO, "%d mots rares enlevés du corpus" % nb_rare_total)

    np.random.shuffle(train)

    # NOTE: tweets are not re-checked against min_length after the rare words
    # were removed, so a written record may be shorter than min_length (or
    # even empty).
    with open(filename + '.clean', 'w') as g:
        for record in train:
            g.write(json.dumps(record) + "\n")


def process_embedding(filename, dimension, iter):
    """Train a word2vec model on the cleaned corpus and dump its artefacts.

    Reads ``filename + '.clean'`` (one JSON object per line with a ``'data'``
    field) and produces four files next to it:

    * ``.text``      - one tweet per line, fed to ``word2vec.LineSentence``;
    * ``.model``     - the model saved with ``model.save``;
    * ``.embedding`` - one line per row of ``model.syn0``, components
      separated by single spaces;
    * ``.w2i``       - one word per line, in ``model.index2word`` order.

    Args:
        filename: path prefix shared by the input and all outputs.
        dimension: passed to ``Word2Vec`` as ``size``.
        iter: passed to ``Word2Vec`` as ``iter`` (the name shadows the
            builtin, but it is part of the public signature).

    Returns:
        The trained ``Word2Vec`` model.

    NOTE(review): ``size``, ``iter``, ``syn0`` and ``index2word`` appear to be
    legacy gensim names; confirm against the installed gensim version.
    """
    text_path = filename + ".text"

    # Context managers guarantee both handles are closed even if a line of
    # the .clean file is not valid JSON.
    with open(filename + ".clean", "r") as f, open(text_path, "w") as g:
        for line in f:
            g.write(json.loads(line)['data'] + '\n')

    dataset = word2vec.LineSentence(text_path)

    # min_count=0: rare words were already filtered by process_clean, so
    # every remaining token gets a vector.
    model = word2vec.Word2Vec(dataset, size=dimension, min_count=0, window=5, iter=iter)
    model.save(filename + ".model")

    with open(filename + ".embedding", "w") as f:
        for vector in model.syn0:
            f.write(" ".join([str(component) for component in vector]) + "\n")

    with open(filename + ".w2i", 'w') as f:
        for word in model.index2word:
            f.write(word + '\n')

    return model

def process_word2index(filename, model, seq, shift):
    """Convert cleaned tweets into fixed-size windows of vocabulary indexes.

    Reads ``filename + '.clean'`` (one JSON object per line with ``'data'``
    and ``'label'`` fields) and writes ``filename + '.data'``, one JSON object
    ``{"data": [indexes], "label": label}`` per line.

    Each word ``w`` is mapped to ``model.vocab[w].index``. For a tweet of
    ``nb`` indexes:

    * if ``nb < seq`` the whole (shorter) list is written once;
    * otherwise windows of ``seq`` indexes are taken every ``shift``
      positions, and when the last such window does not end on the last
      word, one extra window made of the final ``seq`` indexes is added.

    Args:
        filename: path prefix shared by the input and the output.
        model: object exposing ``vocab[word].index``.
        seq: window length.
        shift: step between the starts of two consecutive windows.

    Raises:
        KeyError: if a word of the corpus is missing from ``model.vocab``.
    """
    # Both files are managed by `with` so they are closed even when a word is
    # missing from the vocabulary or a line is not valid JSON.
    with open(filename + ".clean", "r") as f, open(filename + ".data", "w") as g:
        for line in f:
            record = json.loads(line)
            label = record['label']
            indexes = [model.vocab[w].index for w in record['data'].split()]
            nb = len(indexes)

            if nb < seq:
                windows = [indexes]
            else:
                starts = list(range(0, nb - seq + 1, shift))
                windows = [indexes[start:start + seq] for start in starts]
                # Make sure the tail of the tweet is always covered.
                if starts[-1] != nb - seq:
                    windows.append(indexes[-seq:])

            for data in windows:
                g.write(json.dumps({'data': data, 'label': label}) + '\n')

def process_config(filename, dimension, nb_words, sequencesize, label_dimension, label_name):
    """Write the dataset description to ``filename + '.config'`` as JSON.

    The JSON object has the keys ``input_dimension``, ``vocabsize``,
    ``sequencesize``, ``label_dimension`` and ``label_name``, filled from the
    arguments ``dimension``, ``nb_words``, ``sequencesize``,
    ``label_dimension`` and ``label_name`` respectively.
    """
    settings = {
        'input_dimension': dimension,
        'vocabsize': nb_words,
        'sequencesize': sequencesize,
        'label_dimension': label_dimension,
        'label_name': label_name,
    }

    config_path = filename + '.config'
    with open(config_path, 'w') as out:
        json.dump(settings, out)

In [3]:
def process(final_name, inputs, labelname, iter=150, dimension=20, seq=10, min_count=3, min_length=5, shift=4):
    """Run the full preprocessing pipeline and return the trained model.

    Steps, all sharing the ``final_name`` path prefix:

    1. ``process_clean``      - clean the tweets and write ``.clean``;
    2. ``process_embedding``  - train word2vec and write its artefacts;
    3. ``process_word2index`` - write index windows to ``.data``;
    4. ``process_config``     - write ``.config``.

    Args:
        final_name: output path prefix.
        inputs: list of input path prefixes (one per label).
        labelname: label names stored in the config file.
        iter: forwarded to ``process_embedding`` as its ``iter`` argument.
        dimension: embedding dimension.
        seq: window length used by ``process_word2index``.
        min_count: minimum word frequency kept by ``process_clean``.
        min_length: minimum tweet length kept by ``process_clean``.
        shift: step between two windows in ``process_word2index``.

    Returns:
        The model returned by ``process_embedding``.
    """
    # min_length is now forwarded; it used to be hard-coded to 5, which
    # silently ignored the caller's value.
    process_clean(final_name, inputs, min_length=min_length, min_count=min_count)
    model = process_embedding(final_name, dimension, iter)
    process_word2index(final_name, model, seq, shift)
    # The vocabulary size is the number of rows of the embedding matrix.
    process_config(final_name, dimension, len(model.syn0), seq, len(inputs), labelname)
    return model

In [71]:
np.random.seed(0)
x = ['fn_officiel', 'francoisfillon', 'macron', 'xsqueezie', 'sncf', 'footmercato']
#x = ['fn_officiel','francoisfillon','macron']
pathx = ['dataset/tweets/users/user_'+w for w in x]

# One entry per dataset to build: (output prefix, extra positional arguments
# passed to process after labelname: iter, dimension, seq, min_count, min_length).
runs = [
    ("dataset/tweets/data/total1", ()),
    ("dataset/tweets/data/total2", (150, 20, 15)),
    ("dataset/tweets/data/total3", (150, 32, 10)),
    ("dataset/tweets/data/total4", (1000, 20, 10)),
    ("dataset/tweets/data/total5", (500, 32, 12)),
    ("dataset/tweets/data/total6", (150, 16, 10)),
    ("dataset/tweets/data/total7", (500, 16, 10)),
    ("dataset/tweets/data/total8", (150, 20, 10, 1)),
    ("dataset/tweets/data/total9", (150, 20, 10, 1, 9)),
]

# m ends up bound to the model of the last run, as before.
for output_prefix, extra_args in runs:
    m = process(output_prefix, pathx, x, *extra_args)

2017-01-12 16:18:39,812 : INFO : 591 tweets supprimés car trop courts (<5)
2017-01-12 16:18:39,825 : INFO : 15293 mots enlevés du vocabulaire car trop rares (occurences < 3)
2017-01-12 16:18:39,943 : INFO : 18855 mots rares enlevés du corpus
2017-01-12 16:18:40,320 : INFO : collecting all words and their counts
2017-01-12 16:18:40,322 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-12 16:18:40,459 : INFO : PROGRESS: at sentence #10000, processed 140898 words, keeping 7996 word types
2017-01-12 16:18:40,560 : INFO : collected 8204 word types from a corpus of 250360 raw words and 17732 sentences
2017-01-12 16:18:40,631 : INFO : min_count=0 retains 8204 unique words (drops 0)
2017-01-12 16:18:40,633 : INFO : min_count leaves 250360 word corpus (100% of original 250360)
2017-01-12 16:18:40,673 : INFO : deleting the raw counts dictionary of 8204 items
2017-01-12 16:18:40,675 : INFO : sample=0 downsamples 0 most-common words
2017-01-12 16:18:40,676 : INFO :

In [72]:
# Build the "tset" dataset; keyword arguments make the tuned values explicit.
m = process(
    "dataset/tweets/data/tset",
    pathx,
    x,
    iter=150,
    dimension=32,
    seq=15,
    min_count=1,
)

2017-01-16 09:15:37,963 : INFO : 591 tweets supprimés car trop courts (<5)
2017-01-16 09:15:37,968 : INFO : 15293 mots enlevés du vocabulaire car trop rares (occurences < 3)
2017-01-16 09:15:38,057 : INFO : 18855 mots rares enlevés du corpus
2017-01-16 09:15:38,380 : INFO : collecting all words and their counts
2017-01-16 09:15:38,381 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-16 09:15:38,489 : INFO : PROGRESS: at sentence #10000, processed 141252 words, keeping 7988 word types
2017-01-16 09:15:38,570 : INFO : collected 8204 word types from a corpus of 250360 raw words and 17732 sentences
2017-01-16 09:15:38,594 : INFO : min_count=0 retains 8204 unique words (drops 0)
2017-01-16 09:15:38,596 : INFO : min_count leaves 250360 word corpus (100% of original 250360)
2017-01-16 09:15:38,632 : INFO : deleting the raw counts dictionary of 8204 items
2017-01-16 09:15:38,633 : INFO : sample=0 downsamples 0 most-common words
2017-01-16 09:15:38,634 : INFO :

In [4]:
np.random.seed(0)
x = [
    'fn_officiel', 'bfmtv', 'francoisfillon', 'bayrou', 'macron',
    'jlmelenchon', 'gilbertcollard', 'mlp_officiel', 'xsqueezie', 'sncf',
    'grouperatp', 'policenationale', 'armee_de_lair', 'paris', 'footmercato',
]
#x = ['fn_officiel','francoisfillon','macron']
pathx = ['dataset/tweets/users/user_'+w for w in x]

# Larger dataset covering all fifteen accounts listed above.
m = process(
    "dataset/tweets/data/allmorenew",
    pathx,
    x,
    iter=150,
    dimension=32,
    seq=15,
)

2017-01-20 08:44:12,704 : INFO : 1094 tweets supprimés car trop courts (<5)
2017-01-20 08:44:12,715 : INFO : 31919 mots enlevés du vocabulaire car trop rares (occurences < 3)
2017-01-20 08:44:12,960 : INFO : 39053 mots rares enlevés du corpus
2017-01-20 08:44:14,087 : INFO : collecting all words and their counts
2017-01-20 08:44:14,089 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-20 08:44:14,230 : INFO : PROGRESS: at sentence #10000, processed 146549 words, keeping 13674 word types
2017-01-20 08:44:14,360 : INFO : PROGRESS: at sentence #20000, processed 294342 words, keeping 16444 word types
2017-01-20 08:44:14,487 : INFO : PROGRESS: at sentence #30000, processed 440518 words, keeping 17348 word types
2017-01-20 08:44:14,614 : INFO : PROGRESS: at sentence #40000, processed 587000 words, keeping 17532 word types
2017-01-20 08:44:14,694 : INFO : collected 17543 word types from a corpus of 677969 raw words and 46247 sentences
2017-01-20 08:44:14,783 :

In [190]:
# Display the value stored in the model for the token 'a'
# (presumably its learned embedding vector - depends on which cell last set `m`).
m['a']

array([ 0.1250754 , -0.00285063,  0.40124935,  0.00141173,  0.33862978,
       -0.429602  ,  0.14835995,  0.23800924,  0.41249526,  0.08483654,
       -0.14151812,  0.12941374, -0.35606965,  0.44945771,  0.08972403,
        0.40614638,  0.27541858, -0.00593278, -0.1066035 ,  0.5179745 ], dtype=float32)

In [34]:
# Twenty nearest neighbours of each probe word, displayed as one tuple of lists.
tuple(m.most_similar(word, topn=20) for word in ('expulses', 'entreprise', 'last'))

([(u'asile', 0.8222172260284424),
  (u'libres', 0.7855370044708252),
  (u'#le79inter', 0.7750001549720764),
  (u'issus', 0.7504287958145142),
  (u'terrorisme', 0.742144763469696),
  (u'nationales', 0.739521861076355),
  (u'al', 0.7343869805335999),
  (u'qaida', 0.7282637357711792),
  (u'devraient', 0.723727285861969),
  (u'frontieres', 0.7098358869552612),
  (u'clandestins', 0.7088605761528015),
  (u'#bfmpolitique', 0.7081672549247742),
  (u'doivent', 0.7000168561935425),
  (u'payees', 0.6990599036216736),
  (u'maitriser', 0.6982598900794983),
  (u'39h', 0.6964713335037231),
  (u'#punchline', 0.6956908702850342),
  (u'nosra', 0.6943374872207642),
  (u'reels', 0.6842706203460693),
  (u'#rtlmatin', 0.6751416921615601)],
 [(u'formation', 0.8174219131469727),
  (u'investissement', 0.8099896907806396),
  (u'contribution', 0.7919543385505676),
  (u'competitivite', 0.7892634272575378),
  (u'europe', 0.7752705216407776),
  (u'economie', 0.7720529437065125),
  (u'innovation', 0.767096996307373)