## Twitter search with word embeddings - ALLIE Arthur - EFREI - M1 BDIA APP

In [168]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.corpus import stopwords
stop=set(stopwords.words('english'))
from collections import defaultdict
import re
from nltk.tokenize import word_tokenize
import gensim
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding
from keras.initializers import Constant
from sklearn.model_selection import train_test_split

In [169]:
data = pd.read_csv('tweets.csv')

In [170]:
data.head(5)

Unnamed: 0,id,created_at,text
0,849636868052275200,2017-04-05 14:56:29,b'And so the robots spared humanity ... https:...
1,848988730585096192,2017-04-03 20:01:01,"b""@ForIn2020 @waltmossberg @mims @defcon_5 Exa..."
2,848943072423497728,2017-04-03 16:59:35,"b'@waltmossberg @mims @defcon_5 Et tu, Walt?'"
3,848935705057280001,2017-04-03 16:30:19,b'Stormy weather in Shortville ...'
4,848416049573658624,2017-04-02 06:05:23,"b""@DaveLeeBBC @verge Coal is dying due to nat ..."


### 1- Perform the necessary pre-processing on the tweets

- Removing urls

In [171]:
def removing_url(data):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', data)

In [172]:
example = "My data is : https://www.vivastreet.com/"
removing_url(example)

'My data is : '

In [173]:
data['text'] = data['text'].apply(lambda x : removing_url(x))

- General Data Cleaning

In [174]:
import dataclasses

def data_cleaning(data):
    symbols_cleaning = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return symbols_cleaning.sub(r'', data)

In [175]:
example = "Hello dude 😸😸😸"
data_cleaning(example)

'Hello dude '

In [176]:
data['text'] = data['text'].apply(lambda x: data_cleaning(x))

- Removing punctuations

In [177]:
def removing_punctuation(data):
    table = str.maketrans('','',string.punctuation)
    return data.translate(table)

In [178]:
example = "You are a ###### genius"
removing_punctuation(example)

'You are a  genius'

In [179]:
data['text'] = data['text'].apply(lambda x : removing_punctuation(x))

In [180]:
def removing_punctuation_bis(data):
    punct_tag=re.compile(r'[^\w\s]')
    return punct_tag.sub(r'',data)

In [181]:
example = "You are a ###### genius"
removing_punctuation_bis(example)

'You are a  genius'

In [182]:
data['text'] = data['text'].apply(lambda x : removing_punctuation_bis(x))

- Removing html

In [183]:
def removing_html(data):
    html_tag = re.compile(r'<.*?>')
    return html_tag.sub(r'', data)

In [184]:
example = "<p> What an economic crisis </p>"
removing_html(example)

' What an economic crisis '

In [185]:
data['text'] = data['text'].apply(lambda x : removing_html(x))

- Removing @ and rt

In [186]:
# removing @
def removing_at(data):
  return re.sub('@[\w]+', '', data)

In [187]:
example = "b""RT @OpenAI: We've created the world's first Spam-detecting AI trained entirely in simulation and deployed on a physical robot: https://t.co\xe2\x80\xa6"""
removing_at(example)

"bRT : We've created the world's first Spam-detecting AI trained entirely in simulation and deployed on a physical robot: https://t.coâ\x80¦"

In [188]:
data['text'] = data['text'].apply(lambda x : removing_at(x))

In [189]:
# removing rt
def removing_rt(data):
  return re.sub('RT', '', data)

In [190]:
example = "b""RT @OpenAI: We've created the world's first Spam-detecting AI trained entirely in simulation and deployed on a physical robot: https://t.co\xe2\x80\xa6"""
removing_rt(example)

"b @OpenAI: We've created the world's first Spam-detecting AI trained entirely in simulation and deployed on a physical robot: https://t.coâ\x80¦"

In [191]:
data['text'] = data['text'].apply(lambda x : removing_rt(x))

- Spelling Correction

In [192]:
# !pip install pyspellchecker

In [193]:
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(data):
    corrected_text = []
    misspelled_words = spell.unknown(data.split())
    for word in data.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [194]:
example = "pleese u re an idiote"
correct_spellings(example)

'please u re an idiot'

In [195]:
data.head(5)

Unnamed: 0,id,created_at,text
0,849636868052275200,2017-04-05 14:56:29,bAnd so the robots spared humanity
1,848988730585096192,2017-04-03 20:01:01,bForIn2020 waltmossberg mims defcon5 Exactly T...
2,848943072423497728,2017-04-03 16:59:35,bwaltmossberg mims defcon5 Et tu Walt
3,848935705057280001,2017-04-03 16:30:19,bStormy weather in Shortville
4,848416049573658624,2017-04-02 06:05:23,bDaveLeeBBC verge Coal is dying due to nat gas...


### 2- Apply word embedding to the pre-processed tweets, using the GloVe model 

In [196]:
# def create_corpus(data):
#     corpus = []
#     for tweet in tqdm(data['text']):
#         words=[word.lower() for word in word_tokenize(tweet) if((word.isalpha()==1) & (word not in stop))]
#         corpus.append(words)
#     return corpus

In [197]:
# corpus=create_corpus(data)

In [198]:
from gensim.utils import simple_preprocess

# This is a tokenization to get a new column calling 'tokenized_text' from the 'text' column
data['tokenized_text'] = data['text'].apply(lambda x : simple_preprocess(x.lower())) 
data.head(5)

Unnamed: 0,id,created_at,text,tokenized_text
0,849636868052275200,2017-04-05 14:56:29,bAnd so the robots spared humanity,"[band, so, the, robots, spared, humanity]"
1,848988730585096192,2017-04-03 20:01:01,bForIn2020 waltmossberg mims defcon5 Exactly T...,"[bforin, waltmossberg, mims, defcon, exactly, ..."
2,848943072423497728,2017-04-03 16:59:35,bwaltmossberg mims defcon5 Et tu Walt,"[bwaltmossberg, mims, defcon, et, tu, walt]"
3,848935705057280001,2017-04-03 16:30:19,bStormy weather in Shortville,"[bstormy, weather, in, shortville]"
4,848416049573658624,2017-04-02 06:05:23,bDaveLeeBBC verge Coal is dying due to nat gas...,"[bdaveleebbc, verge, coal, is, dying, due, to,..."


In [199]:
embedding_dict = {}
with open('./glove.6B.300d.txt','r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:],'float32')
        embedding_dict[word] = vectors
f.close()

In [200]:
def vectorization(tokenized_sentence):
    result = []
    for token in tokenized_sentence:
        if(token in embedding_dict.keys()):
            result.append(embedding_dict[token])
    return np.mean(result, axis=0)

In [201]:
data['vectorized_text'] = data['tokenized_text'].apply(vectorization)

  out=out, **kwargs)


In [202]:
data['vectorized_text'].head(5)

0    [-0.069005005, 0.1265175, -0.17781524, -0.2292...
1    [-0.04908701, 0.15421282, -0.019111756, 0.1046...
2    [-0.086360596, 0.021570003, 0.033818595, 0.172...
3    [-0.22167821, -0.28544, -0.3566, -0.272615, -0...
4    [0.13943791, 0.20099533, 0.083782665, -0.12666...
Name: vectorized_text, dtype: object

### 3- Apply word embeddings to the search query.

In [203]:
sentence = "Let's go Bryan"
vectorized_sentence = vectorization(sentence)
vectorized_sentence[:14]

array([-0.16498132, -0.0360894 , -0.22653618, -0.07282434, -0.6516655 ,
        0.0167934 , -0.16408329,  0.2117085 , -0.33325586, -1.0505891 ,
       -0.0386307 ,  0.04003889, -0.44517103,  0.21720381], dtype=float32)

### 4- Calculate the distance between the embeddings of the search query and that of all the tweets

In [204]:
tweet = data['vectorized_text'].iloc[0]
tweet_bis = data['vectorized_text'].iloc[10]

In [205]:
from scipy.spatial import distance

# This is used to calculate the the cosine similarity
cosine_sim = 1 - distance.cosine(tweet, tweet_bis)
print(cosine_sim)

0.7164156436920166


In [206]:
euclidean_sim = distance.euclidean(tweet, tweet_bis)
print(euclidean_sim)

2.3490452766418457


In [207]:
tweet = tweet.reshape(tweet.shape[0], 1)
tweet_bis = tweet.reshape(tweet_bis.shape[0], 1)

In [208]:
print(tweet.shape)
print(tweet_bis.shape)

(300, 1)
(300, 1)


In [209]:
from sklearn.metrics.pairwise import euclidean_distances

euclidean_distances(tweet, tweet_bis)

array([[0.        , 0.19552252, 0.10881024, ..., 0.17932352, 0.08122475,
        0.0042855 ],
       [0.19552252, 0.        , 0.30433273, ..., 0.01619899, 0.11429776,
        0.19123702],
       [0.10881024, 0.30433273, 0.        , ..., 0.28813377, 0.190035  ,
        0.11309574],
       ...,
       [0.17932352, 0.01619899, 0.28813377, ..., 0.        , 0.09809877,
        0.17503802],
       [0.08122475, 0.11429776, 0.190035  , ..., 0.09809877, 0.        ,
        0.07693925],
       [0.0042855 , 0.19123702, 0.11309574, ..., 0.17503802, 0.07693925,
        0.        ]], dtype=float32)

#### Top 5 tweets relating to this query for cosine similarity algorithm 

In [210]:
data['cosine_sim'] = data['vectorized_text'].apply(lambda x: 1 - distance.cosine(x, vectorized_sentence))
data.sort_values(by=['cosine_sim'], ascending=False).head(5)

Unnamed: 0,id,created_at,text,tokenized_text,vectorized_text,cosine_sim
1571,594756342641922048,2015-05-03 06:52:16,bWhile I like the initials Id take the socalle...,"[bwhile, like, the, initials, id, take, the, s...","[-0.20209053, 0.15966749, -0.12788168, -0.1864...",0.508909
2531,259527804339769344,2012-10-20 05:33:45,b TeslaMotors Tesla Supercharger network opens...,"[teslamotors, tesla, supercharger, network, op...","[-0.07228046, 0.08442138, -0.1184677, -0.10358...",0.508432
225,828297251026001924,2017-02-05 17:40:28,bmarcelsalathe Not perfect amp cd be better bu...,"[bmarcelsalathe, not, perfect, amp, cd, be, be...","[-0.07211473, 0.1277933, -0.07457692, -0.13254...",0.5028
2168,338115331891556352,2013-05-25 02:12:31,bTechnically I got rich from Zip2 amp PayPal w...,"[btechnically, got, rich, from, zip, amp, payp...","[-0.0480614, 0.02388083, -0.081196524, -0.0241...",0.502199
712,768266353572585472,2016-08-24 01:59:07,b TeslaMotors Introducing P100D with Ludicrous...,"[teslamotors, introducing, with, ludicrous, mo...","[-0.13674676, 0.28292993, 0.07578609, -0.08817...",0.501733


#### Top 5 tweets relating to this query for euclidean distance algorithm 

In [211]:
data['euclidean_sim'] = data['vectorized_text'].dropna().apply(lambda x: distance.euclidean(x, vectorized_sentence))
data.sort_values(by=['euclidean_sim'], ascending=True).head(5)

Unnamed: 0,id,created_at,text,tokenized_text,vectorized_text,cosine_sim,euclidean_sim
2531,259527804339769344,2012-10-20 05:33:45,b TeslaMotors Tesla Supercharger network opens...,"[teslamotors, tesla, supercharger, network, op...","[-0.07228046, 0.08442138, -0.1184677, -0.10358...",0.508432,3.800812
1571,594756342641922048,2015-05-03 06:52:16,bWhile I like the initials Id take the socalle...,"[bwhile, like, the, initials, id, take, the, s...","[-0.20209053, 0.15966749, -0.12788168, -0.1864...",0.508909,3.803121
2168,338115331891556352,2013-05-25 02:12:31,bTechnically I got rich from Zip2 amp PayPal w...,"[btechnically, got, rich, from, zip, amp, payp...","[-0.0480614, 0.02388083, -0.081196524, -0.0241...",0.502199,3.806347
2529,262365909757358080,2012-10-28 01:31:22,bDoing LA to SF road trip in Model S with all ...,"[bdoing, la, to, sf, road, trip, in, model, wi...","[0.006311951, 0.22930056, -0.059599612, -0.018...",0.496812,3.843287
2451,295162137297428480,2013-01-26 13:31:52,bDesire to help Boeing is real amp am correspo...,"[bdesire, to, help, boeing, is, real, amp, am,...","[-0.022214685, 0.086267844, -0.019535271, -0.1...",0.500602,3.852934


In [212]:
# data['euclidean_sim'] = data['vectorized_text'].dropna().apply(lambda x: euclidean_distances(x, vectorized_sentence))
# data.sort_values(by=['euclidean_sim'], ascending=True).head(5)