In [1]:
import re
import unidecode
import nltk
import pandas as pd

In [2]:
%pylab inline
pylab.rcParams['figure.figsize'] = (17, 9)

Populating the interactive namespace from numpy and matplotlib


In [3]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('amp')
stopwords.append('like')
stopwords.append('via')
stopwords.append('could')
stopwords.append('must')
stopwords.append('would')
stopwords.append('think')

In [4]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

# Loading Data

In [5]:
tweets = list()
all_info = list()
with open('elections_tweets_en.csv') as f:
    lines = f.readlines()
    for row in lines:
        split = row.split(';')
        if split[14] == '0':
            tweets.append(split[15])
            all_info.append(split)


# Plot function 

In [139]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS

def plot_2D_clusters(data, labels, clusters_names, file, tf_idf):
    
    
    xs,ys = calculate_coordinates(data, tf_idf)
    
    
    
    df = pd.DataFrame(dict(x=xs, y=ys, label=labels)) 
    
    groups = df.groupby('label')


    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9)) # set size
    ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

    #iterate through groups to layer the plot
    #note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
                label=cluster_names[name], 
                mec='none')
        ax.set_aspect('auto')
        ax.tick_params(\
            axis= 'x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom='off',      # ticks along the bottom edge are off
            top='off',         # ticks along the top edge are off
            labelbottom='off')
        ax.tick_params(\
            axis= 'y',         # changes apply to the y-axis
            which='both',      # both major and minor ticks are affected
            left='off',      # ticks along the bottom edge are off
            top='off',         # ticks along the top edge are off
            labelleft='off')

    ax.legend(numpoints=1)  #show legend with only 1 point
    fig.savefig('plots/'+file+'.png', dpi=fig.dpi)
    plt.close()

    
    


In [141]:
def calculate_coordinates(data, tf_idf):
    print("Transoforming the data")
    tfidf_data = tf_idf.transform(data)
    print("Cosine similarity")
    dist = 1 - cosine_similarity(tfidf_data)
    print("Creating 2D data")
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist) 
    
    return pos[:,0], pos[:,1]

In [140]:
from __future__ import division
import math
from random import randint

def select_samples(points, numb):
    
    samples = list()
    rand_indx = set()
    limits = dict()
    samples = list()
    labels = list()
    frame = pd.DataFrame(points)
    x = frame[0].value_counts()
    total_samples = 0
    
    for key in x.keys():
        limit = math.ceil(numb*x[key]/len(tweets))
        if key not in limits:
            limits[key] = 0
        print("Distribution of the cluster " + str(key)+ " number of elements " + str(x[key]))
        limits[key] += limit
        total_samples += limit

    while(total_samples > 0):
        rand = randint(0, len(tweets))
        
        while(rand in rand_indx or limits[points[rand]] == 0):
            rand = randint(0, len(tweets))
        
        rand_indx.add(rand)
        samples.append(tweets[rand])
        labels.append(points[rand])
        total_samples -= 1
        
    return samples,labels
        

# Preprocessor

In [9]:
def twitter_preprocessor(text):
    text = unidecode.unidecode(text.lower())
    text = normalize_candidates_names(text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub('[^a-zA-Z0-9 ]', '', text)
    return text.rstrip().lstrip().strip()

In [10]:
dict_politics = {'Nathalie Arthaud': 'Arthaud', '@n_arthaud' : 'Arthaud', 'Francois Asselineau': 'Asselineau', '@UPR_Asselineau': 'Asselineau',
       '@JCheminade' : 'Cheminade', 'Jacques Cheminade' : 'Cheminade', '@dupontaignan' : 'DupontAignan', 'Nicolas Dupont-Aignan' : 'DupontAignan',
        'Dupont Aignan' : 'DupontAignan', 'Dupont-Aignan' : 'DupontAignan', 'Nicolas Dupont': 'DupontAignan', 'Nicolas Dupont Aignan': 'DupontAignan',
       '@FrancoisFillon' : 'Fillon', 'Francois Fillon' : 'Fillon', 'Benoit Hamon' : 'Hamon', '@benoithamon' : 'Hamon', '@EmmanuelMacron' : 'Macron', 
        'Emmanuel Macron' : 'Macron', 'Marine Le Pen': 'LePen', 'Le Pen' : 'LePen', '@MLP_officiel' : 'LePen', '@JLMelenchon' : 'Melenchon', 
       'Jean-Luc Melenchon' : 'Melenchon', '@PhilippePoutou' : 'Poutou', 'Philippe Poutou' : 'Poutou', '@jeanlassalle' : 'Lassalle',
       'Jean Lassalle' : 'Lassale'}
def normalize_candidates_names(text):
    for key in dict_politics.keys():
        text = re.sub(key.lower(), dict_politics[key].lower(),text)
    return text

In [104]:
def clustersTOwords(cluster_centers, terms, num_clusters):
    cluster_names = dict()
    order_centroids = cluster_centers.argsort()[:, ::-1] 
    for i in range(num_clusters):
        words = 'Cluster'+str(i)+' : '
        for ind in order_centroids[i, :10]:
            words += terms[ind] + ','
        cluster_names[i] = words
        print(words)
    return cluster_names


# Tokenizer

In [11]:
def tokenize_stemmer(text):
    stems_words = list()
    tokens = [word for word in text.split() if len(word)>3]
    for token in tokens:
        if token not in stopwords:
            if re.search('[^a-zA-Z]', token):
                token = stemmer.stem(token)
            stems_words.append(token)
    return stems_words

# TF-IDF 

In [164]:
len(tweets)

235021

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df = 0.9,min_df = 0.01, 
                                   stop_words=stopwords,max_features=1000,
                                   use_idf=True, tokenizer=tokenize_stemmer,
                                    preprocessor = twitter_preprocessor)

In [29]:
%time tfidf_matrix = tfidf_vectorizer.fit_transform(tweets) 

print(tfidf_matrix.shape)

Wall time: 37.3 s
(235021, 61)


In [30]:
terms = tfidf_vectorizer.get_feature_names()

In [15]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2).fit(tfidf_matrix.todense())
%time data2D = pca.transform(tfidf_matrix.todense())

Wall time: 2.13 s


# K-Means

In [163]:
from sklearn.cluster import KMeans
t = 'k-means'
cluster_list = dict()
for i in range(2,8):
    print("Calculating K-Means for num_clusters" +str(i))
    km = KMeans(n_clusters=i)
    
    %time km.fit(tfidf_matrix)
    samples,labels = select_samples(km.labels_.tolist(), 1000)
    
    file = t +str(i)
    
    cluster_list[i] = km.labels_.tolist()
    cluster_names = clustersTOwords(km.cluster_centers_, terms, i)
    
    %time plot_2D_clusters(samples, labels, cluster_names, file, tfidf_vectorizer)
    #center2D = pca.transform(km.cluster_centers_)
    #plot_2D_clusters(data2D,center2D,i, t)
    print("             ")
    print("             ")

Calculating K-Means for num_clusters2
Wall time: 1min 10s
Distribution of the cluster 1 number of elements 200665
Distribution of the cluster 0 number of elements 34356
Cluster0 : macron,lepen,france,french,vote,president,obama,election,people,candidate,
Cluster1 : lepen,french,macron,france,election,presidential,trump,vote,fillon,melenchon,
Transoforming the data
Cosine similarity
Creating 2D data
Wall time: 44.9 s
             
             
Calculating K-Means for num_clusters3
Wall time: 1min 31s
Distribution of the cluster 0 number of elements 168032
Distribution of the cluster 2 number of elements 36378
Distribution of the cluster 1 number of elements 30611
Cluster0 : lepen,french,macron,france,election,presidential,trump,vote,fillon,melenchon,
Cluster1 : lepen,france,macron,vote,president,french,trump,support,people,wins,
Cluster2 : macron,lepen,france,french,vote,president,obama,want,election,people,
Transoforming the data
Cosine similarity
Creating 2D data
Wall time: 1min
    

# K-Means Clustering with SVD

In [146]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

svd = TruncatedSVD(30)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
%time lsa_matrix = lsa.fit_transform(tfidf_matrix)

Wall time: 3.18 s


In [147]:
lsa_matrix.shape

(235021, 30)

In [149]:
from sklearn.cluster import KMeans
t = 'k-means_lsa'
cluster_list = dict()
for i in range(2,8):
    print("Calculating K-Means with LSA for num_clusters" +str(i))
    km = KMeans(n_clusters=i)
    
    %time km.fit(lsa_matrix)
    samples,labels = select_samples(km.labels_.tolist(), 1000)
    
    samples = tfidf_vectorizer.transform(samples)
    
    file = t +str(i)
    
    cluster_list[i] = km.labels_.tolist()
    cluster_names = clustersTOwords(km.cluster_centers_, terms, i)
    
    %time plot_2D_clusters(samples, labels, cluster_names, file, lsa)
    #center2D = pca.transform(km.cluster_centers_)
    #plot_2D_clusters(data2D,center2D,i, t)
    print("             ")
    print("             ")

Calculating K-Means with LSA for num_clusters2
Wall time: 4.96 s
Distribution of the cluster 1 number of elements 135344
Distribution of the cluster 0 number of elements 99677
Cluster0 : attack,back,brexit,c0nvey,hope,left,europe,elections,face,final,
Cluster1 : attack,c0nvey,candidate,debate,brexit,final,campaign,euro,europe,latest,
Transoforming the data
Cosine similarity
Creating 2D data
Wall time: 50.9 s
             
             
Calculating K-Means with LSA for num_clusters3
Wall time: 8.53 s
Distribution of the cluster 0 number of elements 148588
Distribution of the cluster 2 number of elements 43952
Distribution of the cluster 1 number of elements 42481
Cluster0 : attack,brexit,c0nvey,candidate,debate,europe,final,face,euro,first,
Cluster1 : attack,lepen,hope,french,latest,left,holocaust,elections,lead,euro,
Cluster2 : attack,back,final,lepen,campaign,fillon,latest,macron,debate,hamon,
Transoforming the data
Cosine similarity
Creating 2D data
Wall time: 58.1 s
             
  

# NMF

In [154]:
from sklearn.decomposition import NMF


In [161]:
from sklearn.cluster import KMeans

t = 'k-means_nmf'
cluster_list = dict()
for i in range(2,8):
    print("Calculating K-Means with NMF for num_clusters" +str(i))
    model = NMF(n_components=i, init='nndsvda', random_state=0, max_iter = 10000)
    %time nmf = model.fit_transform(tfidf_matrix)
    
    samples,labels = select_samples(nmf.argmax(axis=1), 1000)
    
    samples = tfidf_vectorizer.transform(samples)
    
    file = t +str(i)
    
    cluster_list[i] = nmf.argmax(axis=1)
    cluster_names = print_top_words(model, terms, 6)
    
    %time plot_2D_clusters(samples, labels, cluster_names, file, model)

    print("             ")
    print("             ")

Calculating K-Means with NMF for num_clusters2
Wall time: 1.81 s
Distribution of the cluster 0 number of elements 118644
Distribution of the cluster 1 number of elements 116377
Topic #0:
lepen france french election trump vote
Topic #1:
macron french france election presidential vote

Transoforming the data
Cosine similarity
Creating 2D data
Wall time: 13 s
             
             
Calculating K-Means with NMF for num_clusters3
Wall time: 1.82 s
Distribution of the cluster 0 number of elements 84759
Distribution of the cluster 1 number of elements 76444
Distribution of the cluster 2 number of elements 73818
Topic #0:
lepen france trump vote europe paris
Topic #1:
macron france vote obama campaign president
Topic #2:
french election presidential candidate fillon debate

Transoforming the data
Cosine similarity
Creating 2D data
Wall time: 12.9 s
             
             
Calculating K-Means with NMF for num_clusters4
Wall time: 2.15 s
Distribution of the cluster 2 number of elements

In [162]:
from sklearn.cluster import KMeans

t = 'k-means_nmf_nndsvd'
cluster_list = dict()
for i in range(2,8):
    print("Calculating K-Means with NMF for num_clusters" +str(i))
    model = NMF(n_components=i, init='nndsvd', random_state=0, max_iter = 10000)
    %time nmf = model.fit_transform(tfidf_matrix)
    
    samples,labels = select_samples(nmf.argmax(axis=1), 1000)
    
    samples = tfidf_vectorizer.transform(samples)
    
    file = t +str(i)
    
    cluster_list[i] = nmf.argmax(axis=1)
    cluster_names = print_top_words(model, terms, 6)
    
    %time plot_2D_clusters(samples, labels, cluster_names, file, model)

    print("             ")
    print("             ")

Calculating K-Means with NMF for num_clusters2
Wall time: 1.42 s
Distribution of the cluster 0 number of elements 120209
Distribution of the cluster 1 number of elements 114812
Topic #0:
lepen france french election trump vote
Topic #1:
macron french france election presidential vote

Transoforming the data
Cosine similarity
Creating 2D data
Wall time: 9.95 s
             
             
Calculating K-Means with NMF for num_clusters3
Wall time: 1.48 s
Distribution of the cluster 0 number of elements 87637
Distribution of the cluster 1 number of elements 74663
Distribution of the cluster 2 number of elements 72721
Topic #0:
lepen france trump vote europe paris
Topic #1:
macron france vote obama campaign president
Topic #2:
french election presidential candidate fillon debate

Transoforming the data
Cosine similarity
Creating 2D data
Wall time: 13.2 s
             
             
Calculating K-Means with NMF for num_clusters4
Wall time: 1.58 s
Distribution of the cluster 0 number of elemen