<img src="images/thro.png" align="right"> 
# A2I2 - Natural Language Processing (NLP)

## <span style="color:red">Lecture - Part 2: Similarity and Similarity Search</span>

## <span style="color:red">Exercise Solution</span>
---

#### Setup

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import webtext
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import twitter_samples
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Fetch the NLTK resources; packages that are already installed are simply
# reported as up-to-date.
for resource in ('webtext', 'punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
# Kept as the cell's final expression so its status value is still echoed.
nltk.download('twitter_samples')

[nltk_data] Downloading package webtext to /home/jovyan/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
# Load the preprocessed tweets (one token list per tweet) from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code -- only open files you trust.
with open('tweets_lem.data', 'rb') as pickled_tweets:
    tweets_lem = pickle.load(pickled_tweets)

# Original, unprocessed tweet texts from the NLTK twitter_samples corpus;
# presumably aligned one-to-one with tweets_lem -- TODO confirm.
tweets_raw = twitter_samples.strings('tweets.20150430-223406.json')

# TF-IDF

In [4]:
# Compute the word counts for each document. The tweets are already tokenised,
# so the identity analyzer passes every token list through unchanged.
cv=CountVectorizer(analyzer=lambda x:x)
word_count_vector=cv.fit_transform(tweets_lem)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back to the old method only on installations that predate it.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

print(word_count_vector.shape)

show = 9
# get count vector for one of the documents
show_doc_vector=word_count_vector[show]

# print the count (toarray() yields a plain ndarray instead of np.matrix)
df = pd.DataFrame(show_doc_vector.T.toarray(), index=feature_names, columns=["count"])
print(tweets_lem[show])
print(df.sort_values(by=["count"],ascending=False)[:10])


(20000, 19259)
['lolz', 'trickle', 'wealth', 'never', 'trickling', 'past', 'wallet', 'greed', 'always', 'win', '$', '$', '$', 'greedy', 'https://t.co/x7deopbs97']
           count
$              3
wallet         1
greed          1
win            1
trickle        1
trickling      1
wealth         1
always         1
lolz           1
greedy         1


In [5]:
# Fit the IDF weights on the document-term count matrix.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# print the lowest and highest idf values
# Reuse feature_names from the count-vectorizer cell rather than calling
# cv.get_feature_names(), which was removed in scikit-learn 1.2.
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=feature_names, columns=["idf"])
# Sort once and slice both ends instead of sorting twice.
df_idf_sorted = df_idf.sort_values(by=['idf'])
print(df_idf_sorted[:10])
print(df_idf_sorted[-10:])

               idf
tory      2.254790
miliband  2.332614
snp       2.642269
ed        2.998096
#bbcqt    3.035283
labour    3.093590
cameron   3.124071
farage    3.413008
david     3.489567
ukip      3.493796
                      idf
econonomically   10.21039
econs            10.21039
eden             10.21039
edgy             10.21039
edit             10.21039
editor-in-chief  10.21039
editorship       10.21039
edm's            10.21039
eejits           10.21039
󾌡                10.21039


In [6]:
# note that many of the very frequent words have low idf values, i.e. they appear in many
# tweets

In [7]:
# Convert the raw counts into TF-IDF weights with the fitted transformer.
tf_idf_vector = tfidf_transformer.transform(word_count_vector)

# Inspect the first document: its tokens, then its 20 highest TF-IDF weights.
show = 0
show_doc_vector = tf_idf_vector[show]

df = pd.DataFrame(
    show_doc_vector.T.todense(),
    index=feature_names,
    columns=["tfidf"],
)
print(tweets_lem[show])
print(df.sort_values(by="tfidf", ascending=False).head(20))

['@kirkkus', 'indirect', 'cost', 'uk', 'eu', 'estimated', 'costing', 'britain', '£', '170', 'billion', 'per', 'year', '#betteroffout', '#ukip']
                            tfidf
170                      0.340148
indirect                 0.340148
estimated                0.340148
costing                  0.326640
#betteroffout            0.317057
@kirkkus                 0.298414
per                      0.270874
billion                  0.252231
cost                     0.229740
britain                  0.188092
£                        0.187235
eu                       0.172456
year                     0.160769
uk                       0.155351
#ukip                    0.129221
https://t.co/dmhq9thfs1  0.000000
https://t.co/dkj7keqtvx  0.000000
https://t.co/dnob4nyi0o  0.000000
https://t.co/dk9nyyr3qi  0.000000
#                        0.000000


# Compute similar tweets

In [8]:
# Pairwise cosine similarity between all TF-IDF rows. The result is a dense
# (n_docs x n_docs) array, which is memory-hungry for a large corpus.
similarities = cosine_similarity(tf_idf_vector, dense_output=True)

In [9]:
# Rank every tweet by its TF-IDF cosine similarity to the tweet at `index`.
index = 10
df = pd.DataFrame({"similarity": similarities[index]}, index=tweets_raw)
# Keep each tweet's row position so it can still be looked up after sorting.
df["#"] = np.arange(len(df))
df.sort_values(by="similarity", ascending=False).head(20)

Unnamed: 0,similarity,#
SNP leader faces audience questions http://t.co/TYClKltSpW,1.0,10
SNP leader faces audience questions,0.696482,9217
SNP leader faces audience questions,0.696482,12016
SNP leader faces audience questions http://t.co/1pzgD69P7Q,0.485088,511
SNP leader faces audience questions http://t.co/HYEsnF4HOF,0.485088,15710
SNP leader faces audience questions http://t.co/f8rfiubQQ7,0.485088,9189
SNP leader faces audience questions http://t.co/KBoQ8qX0wV,0.485088,9223
SNP leader faces audience questions - http://t.co/jcNfuX5QHE,0.485088,3421
SNP leader faces audience questions http://t.co/5CBmT3j4DY,0.485088,5296
SNP leader faces audience questions http://t.co/2NmsmSMJLB,0.485088,16508


# Word2Vec

In [10]:
import gensim
import gensim.downloader as api

In [11]:
# Load a pretrained word embedding model. This one has 400,000 words with vectors of
# length 50 and was trained on Wikipedia (2014) plus the Gigaword 5 dataset.
# See https://github.com/RaRe-Technologies/gensim-data
# and https://catalog.ldc.upenn.edu/LDC2011T07
model = api.load("glove-wiki-gigaword-50")

In [12]:
# Remove all words not in the pre-trained vocabulary.
# gensim 4 replaced `KeyedVectors.vocab` with `key_to_index`; use whichever
# mapping this installation provides.
vocabulary = getattr(model, "key_to_index", None)
if vocabulary is None:
    vocabulary = model.vocab
tweets_vo = [[t for t in tweet if t in vocabulary] for tweet in tweets_lem]

In [13]:
# Count the tweets left without any token after the vocabulary filter.
sum(1 for tweet in tweets_vo if not tweet)

146

In [14]:
# Remove all these empty tweets (from both the token lists and the original data).
notempty = [len(tweet)>0 for tweet in tweets_vo]
# Filter the token lists in plain Python: np.array() on lists of differing
# lengths raises ValueError on NumPy >= 1.24, and would silently become a
# 2-D array if every list happened to have the same length.
tweets_fwc = [tweet for tweet, keep in zip(tweets_vo, notempty) if keep]
# The raw tweets are plain strings, so boolean-mask indexing is safe here; it
# also raises if the mask length does not match the number of raw tweets.
tweets_raw_fwc = np.array(tweets_raw)[notempty]
print(len(tweets_fwc))
print(len(tweets_raw_fwc))

19854
19854


In [15]:
# Compute the document vectors by averaging the word vectors.
# tweets_fwc only contains in-vocabulary tokens and no empty tweets, so no
# further membership test is needed here; the previous `model.vocab` lookup
# was redundant and that attribute no longer exists in gensim 4.
rr_wv = [np.mean([model[w] for w in r], axis=0) for r in tweets_fwc]

In [16]:
# Pairwise cosine similarity between all averaged document vectors.
sim_dv = cosine_similarity(np.asarray(rr_wv))

In [17]:
# Rank every tweet by embedding similarity to the tweet at position `index`.
# Note: `index` is a position in the filtered arrays (empty tweets removed).
index = 10
df = pd.DataFrame({"similarity": sim_dv[index]}, index=tweets_raw_fwc)
# Keep each tweet's row position so it can still be looked up after sorting.
df["#"] = np.arange(len(df))
df.sort_values(by="similarity", ascending=False).head(20)

Unnamed: 0,similarity,#
SNP leader faces audience questions http://t.co/MNrBe7qYhQ,1.0,16859
SNP leader faces audience questions,1.0,11945
SNP leader faces audience questions - http://t.co/jcNfuX5QHE,1.0,3402
SNP leader faces audience questions http://t.co/5CBmT3j4DY,1.0,5270
SNP leader faces audience questions http://t.co/f8rfiubQQ7,1.0,9144
SNP leader faces audience questions http://t.co/gk6yJ9zXNx,1.0,67
SNP leader faces audience questions #Scotland #BBC http://t.co/R8NmSOnnt2,1.0,15843
SNP leader faces audience questions http://t.co/TYClKltSpW,1.0,10
SNP leader faces audience questions http://t.co/HYEsnF4HOF,1.0,15613
SNP leader faces audience questions http://t.co/KBoQ8qX0wV,1.0,9178


In [18]:
# EOF