## Proyecto B: Calculadora de ReTweets

Inicialmente, debemos construir un dataset con la mayor cantidad posible de tweets publicados en Colombia durante la última semana. Para ello, seleccionamos un total de 100604 tweets de la API de Twitter publicados del 15 al 21 de febrero.

In [1]:
# Libraries
from gensim.models.word2vec import Word2Vec
import pandas as pd
import numpy as np
import gradio as gr
import re
import nltk
from unidecode import unidecode
from nltk import stem
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from scipy.sparse.linalg import svds as SparseSVD
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Load the Spanish stop-word list as whitespace-separated tokens.
# The context manager closes the handle even if read() raises.
with open('data/spanish_stop_words.txt', encoding='UTF-8') as file:
    SPANISH_STOP_WORDS = file.read().split()

### Análisis de Tópicos

In [3]:
def preprocessing(d):
    """Normalize a raw tweet into a space-separated string of cleaned tokens.

    Links are stripped first. Each remaining whitespace-delimited token is then
    lowercased, dropped if it appears in SPANISH_STOP_WORDS, transliterated to
    ASCII with unidecode, stripped of every character outside [A-Za-z0-9],
    discarded if that left it empty, and finally lemmatized with NLTK's
    WordNetLemmatizer.

    Args:
        d: The tweet text (document).

    Returns:
        The cleaned tokens joined by single spaces, or the placeholder 'i'
        when no token survives the cleaning.
    """
    without_links = re.sub(r'http\S+', '', d)
    lemmatizer = stem.WordNetLemmatizer()

    cleaned = []
    for raw_token in without_links.split():
        token = raw_token.lower()
        # Stop words are matched on the lowercased token, before accents and
        # punctuation are removed.
        if token in SPANISH_STOP_WORDS:
            continue
        token = re.sub('[^A-Za-z0-9]+', '', unidecode(token))
        if token == '':
            continue
        cleaned.append(lemmatizer.lemmatize(token))

    processed_string = ' '.join(cleaned)
    # Return a low-information placeholder instead of an empty document.
    return processed_string if processed_string != "" else 'i'

## Testing Hyperparameters

In [4]:
def vectorized_tweet(model, tweet):
    """Return the element-wise average of the word vectors of a tokenized tweet.

    Args:
        model: Object exposing ``wv.get_vector(word)``.
        tweet: Iterable of tokens; each one is looked up through ``model.wv``.

    Returns:
        ``np.average`` over axis 0 of the per-token vectors.
    """
    word_vectors = [model.wv.get_vector(token) for token in tweet]
    return np.average(word_vectors, axis=0)

def BuildAndTrainModel(corpus, y, vector_size=500, min_count = 1, workers = 20, sg=1, window = 30, sample = 1e-6, random_seed = 42):
    '''Train a Word2Vec embedding on the corpus and fit a Random Forest on top of it.

    Preprocessing is included: every document goes through ``preprocessing``
    and is split into tokens before the Word2Vec model is built. Each document
    is then represented by the average of its word vectors, and a
    RandomForestRegressor (max_depth=3) is fitted on those features.

    Args:
        corpus: Iterable of raw documents; in this context, a list of tweets.
        y: Regression target for each document; in this context, the number
            of retweets.
        vector_size, min_count, workers, sg, window, sample: Forwarded
            unchanged to ``Word2Vec``.
        random_seed: Used as the Word2Vec ``seed`` and as the regressor's
            ``random_state``.

    Returns:
        Tuple ``(model, regr)``: the trained Word2Vec model and the fitted
        random forest regressor.

    Notes:
        - Per the gensim documentation, ``seed`` alone does not give a fully
          reproducible run; the model must also be limited to ``workers=1``.
        - NOTE(review): with ``min_count`` > 1, rare words would presumably be
          left out of the vocabulary and the per-word lookup would fail;
          confirm before changing the default.
    '''
    sentences = [preprocessing(d).split() for d in corpus]

    model = Word2Vec(sentences=sentences, vector_size=vector_size, window=window, min_count=min_count, workers=workers, sg=sg, sample=sample,
            seed=random_seed)

    # np.vstack replaces np.row_stack, a deprecated alias of the same function.
    X = np.vstack([vectorized_tweet(model, x) for x in sentences])
    regr = RandomForestRegressor(max_depth = 3, random_state = random_seed, n_jobs=-1)
    regr.fit(X,y)
    return model, regr

In [5]:
def getNewTweetVector(model, tweet): # Intended for a single tweet
    """Vectorize one raw tweet, teaching the model any word it has not seen.

    The tweet is preprocessed and split on single spaces. Tokens absent from
    ``model.wv.key_to_index`` are gathered into one sentence that is added to
    the vocabulary (``build_vocab(update=True)``) and trained on, so calling
    this function can modify ``model``.

    Args:
        model: Trained Word2Vec model.
        tweet: Raw tweet text.

    Returns:
        The averaged word vector of the tweet, as computed by
        ``vectorized_tweet``.
    """
    tokens = preprocessing(tweet).split(' ')
    unseen = [token for token in tokens if token not in model.wv.key_to_index]
    if unseen:
        # Add the unseen words to the model as a single extra sentence.
        extra_corpus = [unseen]
        model.build_vocab(extra_corpus, update=True)
        model.train(extra_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return vectorized_tweet(model, tokens)

def getNewTweetVectorsMultiple(model, tweets): # Intended for a list of tweets, for more efficiency
    """Vectorize several raw tweets, updating the model once for all unseen words.

    Every tweet is preprocessed and split on single spaces. Sentences that
    contain at least one token absent from ``model.wv.key_to_index`` are
    collected and used, in a single batch, to extend the vocabulary and
    continue training. Calling this function can therefore modify ``model``.

    Args:
        model: Trained Word2Vec model.
        tweets: Iterable of raw tweet texts.

    Returns:
        2-D array with one row per tweet, each row being the averaged word
        vector produced by ``vectorized_tweet``.
    """
    list_of_words = [preprocessing(tw).split(' ') for tw in tweets]

    # Only sentences with at least one out-of-vocabulary word are new to the model.
    new_sentences = [
        sentence for sentence in list_of_words
        if any(word not in model.wv.key_to_index for word in sentence)
    ]

    if len(new_sentences) > 0:
        print(f"Adding {len(new_sentences)} new sentences to model")
        # build_vocab(update=True) registers the new words before training on them.
        model.build_vocab(new_sentences, update=True)
        model.train(new_sentences, total_examples=model.corpus_count, epochs=model.epochs)

    # Vectorize with the (possibly) updated model.
    # np.vstack replaces np.row_stack, a deprecated alias of the same function.
    return np.vstack([vectorized_tweet(model, x) for x in list_of_words])


In [6]:
def TestModelRMSE(model, regr, test_tweets, y_test):
    """Return the root mean squared error of the regressor on held-out tweets.

    The tweets are vectorized with ``getNewTweetVectorsMultiple``, which may
    update ``model`` with words it has not seen.

    Args:
        model: Trained Word2Vec model.
        regr: Fitted regressor exposing ``predict``.
        test_tweets: Iterable of raw tweet texts.
        y_test: True targets for ``test_tweets``.

    Returns:
        The RMSE as a float.
    """
    X_test = getNewTweetVectorsMultiple(model, test_tweets)
    y_predicted = regr.predict(X_test)
    # The ``squared=False`` keyword was deprecated and later removed from
    # scikit-learn, so the root is taken explicitly. For a single target
    # column this equals the value ``squared=False`` used to return.
    return float(np.sqrt(mean_squared_error(y_test, y_predicted)))

In [7]:
# Load the tweets and hold out 20% of them for evaluation.
feb21 = pd.read_csv('data/period6.csv')
X_train, X_test, y_train, y_test = train_test_split(
    feb21["tweet"],
    feb21["retweet_count"],
    test_size=0.2,
    random_state=42,
)

# Fit the embedding and the regressor on the training split only.
model, regr = BuildAndTrainModel(
    X_train,
    y_train,
    vector_size=500,
    min_count=1,
    workers=1,
    sg=1,
    window=30,
    sample=1e-6,
    random_seed=42,
)

# Evaluate on the held-out split.
print("Root Mean Squared Error (RMSE) = ", TestModelRMSE(model, regr, X_test, y_test))

Adding 1479 new sentences to model
Root Mean Squared Error (RMSE) =  35.54985896748519


## Find most important words

In [8]:
# Score every vocabulary word by passing its own embedding to the regressor.
words2retweet = pd.DataFrame({
    "words": model.wv.index_to_key,
    "retweet_value": regr.predict(model.wv.vectors),
})

In [9]:
# Lift pandas' row-display limit so the whole ranking is rendered.
pd.options.display.max_rows = None
# Words ordered from highest to lowest predicted retweet value.
words2retweet.sort_values(by="retweet_value", ascending=False)

Unnamed: 0,words,retweet_value
14667,minecraft,1203.029311
1421,entidades,1126.668504
14007,enviarian,1088.852079
6915,exportar,785.650292
19608,perseguida,759.98264
22850,slideplayer,755.083079
22242,chivo,752.830783
4897,agredio,751.279234
3817,eduardobittar,749.765697
5115,viejita,745.618927
