In [65]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly
from collections import Counter
import re
import string 

import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.porter import * 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [66]:
test= pd.read_csv("./Outputs/FilteredDataSet.csv")

In [67]:
list(test.columns)

['date',
 'user_name',
 'user_loc',
 'message',
 'full_name',
 'country',
 'country_code',
 'geo_code',
 'longitud_mensaje',
 'palabras',
 'refinado']

In [68]:
test.head()

Unnamed: 0,date,user_name,user_loc,message,full_name,country,country_code,geo_code,longitud_mensaje,palabras,refinado
0,2020-10-10 11:41:36,susanha77835097,"Florida, USA",@blossomingabyss @JoeBiden He filled his pocke...,"Sunny Isles Beach, FL",United States,US,[-80.125071 25.92906 ],140,18,"['filled', 'pocketskids', 'familyharris', 'unq..."
1,2020-10-10 11:19:05,tomborelli,"Greenwich, CT",Supporting @JoeBiden means packing the Supreme...,"Greenwich, CT",United States,US,[-73.63941 41.050217],144,17,"['supporting', 'means', 'packing', 'supreme', ..."
2,2020-10-10 09:39:10,John_Di_Lemme,(561) 847-3467,@ABC13News @realDonaldTrump @JoeBiden Nancy Pe...,"West Palm Beach, FL",United States,US,[-80.12262 26.721896],133,15,"['nancy', 'pelosi', 'nuts', 'joe', 'biden', 'k..."
3,2020-10-10 08:39:14,Parnell_100,United Kingdom,@JoeBiden There is NO STOPPING this AMERICAN T...,"Belfast, Northern Ireland",United Kingdom,GB,[-5.928413 54.595869],140,23,"['stopping', 'american', 'trinity', 'train', '..."
4,2020-10-10 07:18:33,sqlblues,"Weston-super-Mare, England",@FenrirWolf26 @Stanhope2011VJ Britons have als...,"Weston-super-Mare, England",United Kingdom,GB,[-2.94513 51.346796],139,17,"['britons', 'also', 'pumped', 'relentless', 'd..."


In [69]:
df_tweet= test.copy()

In [70]:
tweets = df_tweet['message']

In [71]:
nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/danielgarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [77]:


def processTweet(tweet):
    """
    Takes in a string of text, then performs the following:
    1. Removes links, special characters and other bulk cleaning
    2. Returns a list of the tidy text
    """
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','',tweet)
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Remove whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Remove single space remaining at the front of the tweet.
    tweet = tweet.lstrip(' ') 
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    tweet = ''.join(c for c in tweet if c <= '\uFFFF') 
    return tweet
# tokenize helper function
def text_process(tweet):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in list(tweet) if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.lower().split() if word.lower() not in stopwords.words('english')]

# Lexicon normalisation with Stemming 
def stemming(tokens):
  """
  Takes in a string of text, then performs the following:
  1. Replace words for its root based on orter Stemmer rule.
  2. Returns normalised text
   """
  stemmer = PorterStemmer()
  x = [stemmer.stem(w) for w in tokens]
   
  return ' '.join(x)

In [112]:
# Preprocessing
df_tweet['mensajeprocesado'] = df_tweet['message'].apply(processTweet)
print(df_tweet.mensajeprocesado.head())
# Double check
df_tweet['mensajeprocesado'] = df_tweet['mensajeprocesado'].str.replace("[^a-zA-Z#]", " ") 
print(df_tweet.mensajeprocesado.head())
# tokenize tidy_tweet column and create a column for tokens
test['tokens'] = df_tweet['refinado'].copy() # tokenize

# Normalisation
stemmer = PorterStemmer() 
normalized_tweet = df_tweet['mensajeprocesado'].apply(lambda x: [stemmer.stem(i) for i in x]) # stemming

for i in range(len(normalized_tweet)):
    normalized_tweet[i] = ''.join(normalized_tweet[i])    
df_tweet['mensajeprocesado'] = normalized_tweet

df_tweet.drop(df_tweet.filter(regex="Unname"),axis=1, inplace=True)

0    filled his pocketskids familyharris unqualifie...
1    supporting means packing the supreme court lib...
2    nancy pelosi nuts! joe biden and kamala harris...
3    there stopping this american trinity train - b...
4    britons have also been pumped relentless diet ...
Name: mensajeprocesado, dtype: object
0    filled his pocketskids familyharris unqualifie...
1    supporting means packing the supreme court lib...
2    nancy pelosi nuts  joe biden and kamala harris...
3    there stopping this american trinity train   b...
4    britons have also been pumped relentless diet ...
Name: mensajeprocesado, dtype: object


In [113]:
tweet = df_tweet['message'].apply(processTweet)
nb_words = 10000  
tk = Tokenizer(num_words=nb_words) #tokenize
tk.fit_on_texts(tweet) #tokenize

# format your input for the neural net
tweets_seq = tk.texts_to_sequences(tweet) # integer encode
tweet_array = pad_sequences(tweets_seq, # good to use length it was trained on
                            maxlen=39) # Convert to 2-D Numpy array


In [114]:
from keras.models import load_model

In [115]:
LSTM_model = load_model('./models/LSTM_model.h5')
LSTM_model.summary()

Model: "modeloKerasreg"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 28, 128)           1280000   
_________________________________________________________________
dropout_6 (Dropout)          (None, 28, 128)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 200)               263200    
_________________________________________________________________
dropout_7 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 603       
_________________________________________________________________
dropout_8 (Dropout)          (None, 3)                 0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 3)              

In [116]:
batch_size = len(tweet_array)

# for human-friendly printing
labels = ['negative', 'positive']

# Predict and get output from the model
pred= LSTM_model.predict_classes(tweet_array, batch_size)

# append predictions to dataframe
df_tweet['predictions'] = pred
df_tweet.shape




(2504, 13)

In [119]:
df_tweet[df_tweet['predictions'] == 2].head(5) # negative

Unnamed: 0,date,user_name,user_loc,message,full_name,country,country_code,geo_code,longitud_mensaje,palabras,refinado,predictions,mensajeprocesado
0,2020-10-10 11:41:36,susanha77835097,"Florida, USA",@blossomingabyss @JoeBiden He filled his pocke...,"Sunny Isles Beach, FL",United States,US,[-80.125071 25.92906 ],140,18,"['filled', 'pocketskids', 'familyharris', 'unq...",2,filled his pocketskids familyharris unqualifie...
3,2020-10-10 08:39:14,Parnell_100,United Kingdom,@JoeBiden There is NO STOPPING this AMERICAN T...,"Belfast, Northern Ireland",United Kingdom,GB,[-5.928413 54.595869],140,23,"['stopping', 'american', 'trinity', 'train', '...",2,there stopping this american trinity train b...
9,2020-10-10 05:19:12,cyn507,"Philadelphia, PA",Looks like @realDonaldTrump is running scared....,"Philadelphia, PA",United States,US,[-75.117998 40.004866],140,20,"['looks', 'like', 'running', 'scared', 'bag', ...",2,looks like running scared his bag tricks does...
40,2020-10-10 01:22:40,KLehneiswxguy,United States,@NBSaphierMD @JoeBiden Because Joe Biden is ch...,"Beavercreek, OH",United States,US,[-84.047553 39.727936],139,20,"['joe', 'biden', 'chickening', 'wanted', 'othe...",2,because joe biden chickening out wanted other...
46,2020-10-10 00:29:13,Sp8d,"Houston, TX",@psirus2020 @zeroemissionnow @DEJH69619837 @Jo...,"Houston, TX",United States,US,[-95.446486 29.838495],125,17,"['saying', 'biden', 'known', 'racist', 'defend...",2,are you saying biden known racist while you d...


In [123]:
positives = df_tweet['predictions'][df_tweet.predictions == 2]
neutral = df_tweet['predictions'][df_tweet.predictions == 1]
negatives = df_tweet['predictions'][df_tweet.predictions == 0]

print('number of positve tagged sentences is:  {}'.format(len(positives)))
print('number of neutral tagged sentences is: {}'.format(len(neutral)))
print('number of negative tagged sentences is: {}'.format(len(negatives)))
print('total length of the data is:            {}'.format(df_tweet.shape[0]))

number of positve tagged sentences is:  498
number of neutral tagged sentences is: 1714
number of negative tagged sentences is: 292
total length of the data is:            2504
