# Doc2vec + Cosine Similarity

In [1]:
import pandas as pd
import re
import spacy
import string
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Mounted at /content/drive/


In [2]:
# Load the collected tweets. PATH is not defined in the visible cells;
# presumably it is set in an earlier (omitted) setup cell -- TODO confirm.
data_Final = pd.read_csv(f"{PATH}final_Tweets.csv")
# Work on a copy: `data` gets its text preprocessed later on, while
# `data_Final` keeps the original tweets for displaying search results.
data = data_Final.copy()
# Preview the first rows.
data_Final.head()

Unnamed: 0,created_at,entities.hashtags,favorite_count,full_text,id,retweet_count,user.id,user.name
0,2020-11-11,[],4,International friendly roundup: Finland stun F...,1326667371730378753,1,16042794,Guardian US
1,2020-11-11,[],11,When Joe Biden formally takes over the preside...,1326666012142526466,5,16042794,Guardian US
2,2020-11-11,[],4,New Yorker fires Jeffrey Toobin after he repor...,1326663505454510081,1,16042794,Guardian US
3,2020-11-11,[],8,One week on: how has Donald Trump handled losi...,1326661105498796032,1,16042794,Guardian US
4,2020-11-11,[],13,France pays tribute to six-year-old resistance...,1326659924278046728,6,16042794,Guardian US


## Text Preprocessing

* Lowercase the text
* Expand Contractions
* Clean the text
* Remove Stopwords
* Lemmatize words

For this method we decided to apply a more thorough text preprocessing, so that as many of the relations and semantics between words as possible are retained.

In [5]:
def expand_contractions(text, contractions_dict, contractions_re):
    """
    Replace every contraction found in the text with its expanded form.

    Arguments:
        text -- text to process - str
        contractions_dict -- maps each contraction to its expansion - dict
        contractions_re -- compiled pattern matching the contractions - re.Pattern
    Returns:
        text with every pattern match substituted by the dictionary value
        stored under the matched string - str
    """
    return contractions_re.sub(
        lambda found: contractions_dict[found.group(0)],
        text,
    )

def clean_text(text):
    """
    Strip URLs, digit-bearing words and non-letter characters from a text.

    The text is expected to be lower-cased already: every character outside
    ``a-z`` (upper-case letters included) is replaced with a space.

    Steps, in order:
    * Remove URLS
    * Remove words with digits
    * Replace newline characters with space
    * Replace non english chars with space

    Arguments:
        text -- text to clean - str
    Returns:
        cleaned text; runs of spaces are NOT collapsed here - str
    """
    # Remove links first: stripping digit-bearing words beforehand could eat
    # the "http" prefix (e.g. "1http://...") and leave URL debris behind.
    text = re.sub(r"http\S+", "", text)

    # Remove whole words that contain at least one digit
    text = re.sub(r"\w*\d\w*", "", text)

    # Replace new line chars with a space
    text = re.sub("\n", " ", text)

    # Replace everything that is not a lower-case english letter
    text = re.sub(r"[^a-z]", " ", text)

    return text

def preprocessing(text):
    """
    Given a text (a tweet or a query) apply preprocessing techniques
        * Lowercase the text
        * Expand Contractions
        * Clean the text
        * Remove Stopwords
        * Lemmatize words

    Relies on the module-level `contractions_dict` and the spaCy pipeline
    `nlp`, both of which must be defined before this function is called.

    Arguments:
        text -- raw text - str
    Returns:
        space-joined lemmas of the non stop-word tokens - str
    """
    # Lower case
    # (contractions are matched after this step, so the dictionary keys are
    # presumably lower-case as well -- verify against the pickled dict)
    text = text.lower()

    # An empty dictionary would build a pattern that matches the empty string
    # and make the lookup fail, so expansion is skipped in that case.
    if contractions_dict:
        # Regular expression for finding contractions. Keys are escaped so
        # they are matched literally, and tried longest first so that e.g.
        # "can't've" is not cut short by the shorter key "can't".
        longest_first = sorted(contractions_dict, key=len, reverse=True)
        contractions_re = re.compile("(%s)" % "|".join(map(re.escape, longest_first)))

        # Expand contractions
        text = expand_contractions(text, contractions_dict, contractions_re)

    text = clean_text(text)

    # Remove added spaces
    text = re.sub(" +", " ", text).strip()

    # Stop words and Lemmatizing
    return " ".join(token.lemma_ for token in nlp(text) if not token.is_stop)

In [4]:
# Init NLP
# preprocessing() only reads token lemmas and stop-word flags, so the NER and
# parser components are switched off.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
# Raise the maximum text length the pipeline accepts.
nlp.max_length = 5000000

# Import contractions dict
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
# The context manager closes the file handle once the dict is loaded.
with open(f"{PATH}utils/contractions_dict.p", "rb") as contractions_file:
    contractions_dict = pickle.load(contractions_file)

# Preprocess tweets (overwrites the text column of the working copy)
data["full_text"] = data["full_text"].apply(preprocessing)

In [15]:
# Write the preprocessed tweets to CSV, leaving out the dataframe index.
data.to_csv(f"{PATH}d2v_processed.csv", index = False)

# Doc2Vec

In [6]:
# List of tokenized documents, each tagged with its position in `data`
documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(data["full_text"])]

# tag_id per tweet: maps the document tag (row position) to the tweet id
tag_id = dict(enumerate(data["id"]))
# The context manager guarantees the pickle is flushed and the file closed.
with open(f"{PATH}utils/tag_id.p", "wb") as tag_id_file:
    pickle.dump(tag_id, tag_id_file)

# Create the model
# NOTE(review): per the gensim documentation dbow_words is only relevant for
# dm=0 -- confirm whether dm=1 together with dbow_words=1 is intended.
doc2vec_model = Doc2Vec(dm=1, vector_size=100, window=3, hs=0, min_count=1, dbow_words=1)

# Build Vocabulary
doc2vec_model.build_vocab(documents)

# Train
# NOTE(review): start_alpha == end_alpha keeps the learning rate constant over
# all epochs instead of letting it decay -- confirm this is deliberate.
doc2vec_model.train(
    documents,
    total_examples=doc2vec_model.corpus_count,
    epochs=100,
    start_alpha=0.01,
    end_alpha=0.01,
)

In [13]:
# Save model
# Writes the trained Doc2Vec model under the utils folder of PATH.
doc2vec_model.save(f"{PATH}utils/doc2vec_model")

In [14]:
# Dict with document id : vector representation
# Pairs the i-th tweet id with the i-th document vector; this assumes the
# model stores its vectors in the same order as the rows of `data`
# (the documents were tagged with their row position) -- TODO confirm.
id_doc2vec = dict(zip(data["id"], doc2vec_model.docvecs))

# Save
# The context manager guarantees the pickle is flushed and the file closed.
with open(f"{PATH}utils/id_doc2vec.p", "wb") as id_doc2vec_file:
    pickle.dump(id_doc2vec, id_doc2vec_file)

In [9]:
def rank(query, tag_id, topn=20):
    """
    Given a query, preprocess it, embed it and return the ids of the most
    similar tweets.

    Relies on the module-level `doc2vec_model`.

    Arguments:
        query -- raw query text - str
        tag_id -- dic containing document tag:tweet id pair - dic
        topn -- default: 20 - number of most similar documents to fetch - int.
    Returns:
        ids -- tweet ids in the order returned by the model's similarity
               search (most similar first per the gensim docs) - list
    """
    # Pre-process query the same way the tweets were preprocessed
    query = preprocessing(query)

    # Query vector
    q_vector = doc2vec_model.infer_vector(query.split())

    # Doc query similarity: (tag, similarity score) pairs
    tag_sim = doc2vec_model.docvecs.most_similar([q_vector], topn=topn)

    # Translate the document tags back to tweet ids; the scores are dropped
    return [tag_id[match[0]] for match in tag_sim]

In [10]:
def parser_tweet_results(doc):
    """
    Given a Pandas dataframe row, format the information for display.

    Each field is read straight from the first row instead of stringifying
    the whole column array, so apostrophes, quotes and brackets that belong
    to the tweet text are preserved and no array/repr decoration leaks into
    the output. An empty dataframe yields empty strings.

    Arguments:
        doc -- pandas dataframe with unique row with tweet info.
    Returns (in this order):
        tweet -- text tweet - str
        date -- of publication - str
        author -- user name of tweet - str
        retweets -- count of retweets - str
        favorites -- count of favourites - str
        url -- link to the tweet built from its id - str
        hashtags -- content of the hashtags column - str
    """
    def first_value(column):
        # Value of `column` in the first row as text, "" when there is no row.
        values = doc[column].values
        return str(values[0]) if len(values) else ""

    tweet = first_value("full_text")
    author = first_value("user.name")
    date = first_value("created_at")
    retweets = first_value("retweet_count")
    favorites = first_value("favorite_count")
    hashtags = first_value("entities.hashtags")

    # URL
    tweet_id = first_value("id")
    url = f"https://twitter.com/twitter/statuses/{tweet_id}"

    return tweet, date, author, retweets, favorites, url, hashtags

In [11]:
def search(tag_id, id_doc2vector, topn=20):
    """
    Prompt for a query on stdin and print the best matching tweets.

    Relies on the module-level `data_Final` dataframe for the tweet details.

    Arguments:
        tag_id -- dic passed on to rank() to resolve tweet ids - dic
        id_doc2vector -- dic containing id:vec2doc pair (not read here) - dic
        topn -- default: 20 - Top N result to display - int.
    """
    banner = "######################################################"
    rule = "______________________________________________________"

    print(banner)
    print("Insert query:")
    query = input()
    print(banner + "\n")

    # Ranked tweet ids, cut down to the requested amount
    ranked_ids = rank(query, tag_id)

    for position, tweet_id in enumerate(ranked_ids[:topn]):
        row = data_Final[data_Final["id"] == tweet_id]
        tweet, date, author, retweets, favorites, url, hashtags = parser_tweet_results(row)

        # One print per result; the trailing "\n" leaves a blank line after it
        report = [
            rule,
            f"Tweet {position}",
            f"\t·Author: {author}",
            f"\t·Date: {date}",
            f"\t·Tweet: {tweet}",
            f"\t·Retweets: {retweets}",
            f"\t·Favorites: {favorites}",
            f"\t·Hashtags: {hashtags}",
            f"\t·URL: {url}",
            rule + "\n",
        ]
        print("\n".join(report))

In [12]:
# Interactive search: waits for a query typed on stdin, then prints the matches.
search(tag_id, id_doc2vec)

######################################################
Insert query:


KeyboardInterrupt: ignored