In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly
from collections import Counter
import re
import string 

import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.porter import * 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
test= pd.read_csv("./output/final_tweets.csv")

In [3]:
list(test.columns)

['target',
 'ids',
 'date',
 'flag',
 'user',
 'text',
 'procesado',
 'Polaridad',
 'Subjetividad',
 'palabras']

In [4]:
df_tweet= test.copy()

In [5]:
nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fernandodelgadoteran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def processTweet(tweet):
    """
    Takes in a string of text, then performs the following:
    1. Removes links, special characters and other bulk cleaning
    2. Returns a list of the tidy text
    """
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','',tweet)
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Remove whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Remove single space remaining at the front of the tweet.
    tweet = tweet.lstrip(' ') 
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    tweet = ''.join(c for c in tweet if c <= '\uFFFF') 
    return tweet
# tokenize helper function
def text_process(tweet):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in list(tweet) if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.lower().split() if word.lower() not in stopwords.words('english')]

# Lexicon normalisation with Stemming 
def stemming(tokens):
  """
  Takes in a string of text, then performs the following:
  1. Replace words for its root based on orter Stemmer rule.
  2. Returns normalised text
   """
  stemmer = PorterStemmer()
  x = [stemmer.stem(w) for w in tokens]
   
  return ' '.join(x)

In [7]:
# Preprocessing
df_tweet['mensajeprocesado'] = df_tweet['text'].apply(processTweet)
print(df_tweet.mensajeprocesado.head())
# Double check
df_tweet['mensajeprocesado'] = df_tweet['text'].str.replace("[^a-zA-Z#]", " ") 
print(df_tweet.mensajeprocesado.head())
# tokenize tidy_tweet column and create a column for tokens
test['tokens'] = df_tweet['procesado'].copy() # tokenize

# Normalisation
stemmer = PorterStemmer() 
normalized_tweet = df_tweet['mensajeprocesado'].apply(lambda x: [stemmer.stem(i) for i in x]) # stemming

for i in range(len(normalized_tweet)):
    normalized_tweet[i] = ''.join(normalized_tweet[i])    
df_tweet['mensajeprocesado'] = normalized_tweet

df_tweet.drop(df_tweet.filter(regex="Unname"),axis=1, inplace=True)

0    fell asleep around last night woke around . ca...
1    going town later get prom shoes and other thin...
2                                           happy day 
3                            tummy hurts stupid stress
4                 are you the event? haven' seen you. 
Name: mensajeprocesado, dtype: object
0     MissScion Fell asleep around  p last night wo...
1     katie andhearts going in town later to get my...
2                                           happy day 
3                           Tummy Hurts  Stupid Stress
4     jodywallace  Are you at the LF event   I have...
Name: mensajeprocesado, dtype: object


In [8]:
tweet = df_tweet['text'].apply(processTweet)
nb_words = 10000  
tk = Tokenizer(num_words=nb_words) #tokenize
tk.fit_on_texts(tweet) #tokenize

# format your input for the neural net
tweets_seq = tk.texts_to_sequences(tweet) # integer encode
tweet_array = pad_sequences(tweets_seq, # good to use length it was trained on
                            maxlen=39) # Convert to 2-D Numpy array

In [9]:
from keras.models import load_model

In [10]:
LSTM_model = load_model('./modelos/LSTM_model.h5')
LSTM_model.summary()

Model: "modeloKerasreg"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 28, 128)           1280000   
_________________________________________________________________
dropout (Dropout)            (None, 28, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 200)               263200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 3)                 603       
_________________________________________________________________
dropout_2 (Dropout)          (None, 3)                 0         
Total params: 1,543,803
Trainable params: 1,543,803
Non-trainable params: 0
__________________________________________

In [11]:
batch_size = len(tweet_array)

# for human-friendly printing
labels = ['negative', 'positive']

# Predict and get output from the model
pred= LSTM_model.predict_classes(tweet_array, batch_size)

# append predictions to dataframe
df_tweet['predictions'] = pred
df_tweet.shape

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


(10000, 12)

In [22]:
df_tweet[df_tweet['predictions'] == 2].head(5) # negative

Unnamed: 0,target,ids,date,flag,user,text,procesado,Polaridad,Subjetividad,palabras,mensajeprocesado,predictions
0,Negativa,2014943573,Wed Jun 03 03:11:56 PDT 2009,NO_QUERY,bexodus,@MissScion Fell asleep around 9p last night wo...,"['fell', 'asleep', 'around', 'last', 'night', ...",-0.016667,0.488889,26,MissScion Fell asleep around p last night wo...,2
6,Neutral,2001267352,Tue Jun 02 00:20:21 PDT 2009,NO_QUERY,primetheus,sigh ... what i would give to be on a beach so...,"['sigh', 'would', 'give', 'beach', 'somewhere']",0.0,0.0,12,sigh what i would give to be on a beach so...,2
7,Positiva,2178458794,Mon Jun 15 07:43:55 PDT 2009,NO_QUERY,emfeha,Check my new twitter background. Portfolio v5 ...,"['check', 'new', 'twitter', 'background', 'por...",0.136364,0.454545,12,Check my new twitter background Portfolio v ...,2
8,Positiva,1792192715,Wed May 13 22:24:34 PDT 2009,NO_QUERY,Alex_Manchester,@collabguy Very interesting. I can't touch ty...,"['interesting', 'touch', 'type', 'properly', '...",0.233333,0.4,23,collabguy Very interesting I can t touch ty...,2
9,Neutral,2302731266,Tue Jun 23 17:13:02 PDT 2009,NO_QUERY,YarnThing,@iambrianna did you leave it on my cell? I can...,"['leave', 'cell', 'find']",0.0,0.0,15,iambrianna did you leave it on my cell I can...,2


In [13]:
df_tweet[df_tweet['predictions'] == 1].head(5) # neutral

Unnamed: 0,target,ids,date,flag,user,text,procesado,Polaridad,Subjetividad,palabras,mensajeprocesado,predictions
1,Positiva,1971796339,Sat May 30 07:55:06 PDT 2009,NO_QUERY,sazmataz_x,@katie_andhearts going in town later to get my...,"['going', 'town', 'late', 'get', 'prom', 'shoe...",0.25,0.65,14,katie andhearts going in town later to get my...,1
2,Positiva,1978132662,Sat May 30 22:31:29 PDT 2009,NO_QUERY,Deasoy,happy day,"['happy', 'day']",0.8,1.0,2,happy day,1
3,Negativa,1679901725,Sat May 02 10:26:43 PDT 2009,NO_QUERY,SashaPanda,Tummy Hurts Stupid Stress,"['tummy', 'hurts', 'stupid', 'stress']",-0.8,1.0,4,Tummy Hurts Stupid Stress,1
4,Neutral,2055154954,Sat Jun 06 08:48:06 PDT 2009,NO_QUERY,LianeGentrySkye,@jodywallace Are you at the LF event? I have...,"['event', 'seen']",0.0,0.0,11,jodywallace Are you at the LF event I have...,1
5,Positiva,2066500448,Sun Jun 07 10:38:18 PDT 2009,NO_QUERY,darkrumblog,"@Fleshworks Excellent, very good indeed! Am p...","['excellent', 'good', 'indeed', 'posting']",0.85,0.8,9,Fleshworks Excellent very good indeed Am p...,1


In [21]:
df_tweet[df_tweet['predictions'] == 0].head(10) # positivo

Unnamed: 0,target,ids,date,flag,user,text,procesado,Polaridad,Subjetividad,palabras,mensajeprocesado,predictions
10,Positiva,2047799782,Fri Jun 05 14:11:57 PDT 2009,NO_QUERY,Chicagogirl1996,The most wonderful thing happend today are car...,"['wonderful', 'thing', 'happend', 'today', 'ca...",1.0,1.0,10,The most wonderful thing happend today are car...,0
19,Positiva,1881440898,Fri May 22 04:40:20 PDT 2009,NO_QUERY,KathrynBriggs,Starting to panic about impending first semest...,"['starting', 'panic', 'impending', 'first', 's...",0.15,0.211111,22,Starting to panic about impending first semest...,0
20,Positiva,2072090964,Sun Jun 07 20:23:09 PDT 2009,NO_QUERY,janicepcheng,@aronsolomon I love how you always get my name...,"['love', 'always', 'get', 'names', 'wrong', 'j...",0.36,0.8,21,aronsolomon I love how you always get my name...,0
24,Positiva,2044347056,Fri Jun 05 09:13:30 PDT 2009,NO_QUERY,RaqC,Up and running early! Headed 2 court 2 get a d...,"['running', 'early', 'headed', 'court', 'get',...",0.45,0.525,27,Up and running early Headed court get a d...,0
29,Negativa,2204027282,Wed Jun 17 01:08:03 PDT 2009,NO_QUERY,williams1993,Feels like crap! Im such an eejit sometimes!,"['feels', 'like', 'crap', 'eejit', 'sometimes']",-0.8,0.8,8,Feels like crap Im such an eejit sometimes,0
33,Negativa,2014795190,Wed Jun 03 02:42:46 PDT 2009,NO_QUERY,aussiegirl_1996,Got the WORST migraine at skool 2day.A bit dra...,"['got', 'bad', 'migraine', 'skool', 'bit', 'dr...",-0.344444,0.522222,18,Got the WORST migraine at skool day A bit dra...,0
38,Positiva,2048072672,Fri Jun 05 14:37:13 PDT 2009,NO_QUERY,krystyl,"For a low low price of $35,000.00 you too can ...","['low', 'low', 'price', 'hammock', 'cool', 'bu...",0.116667,0.416667,26,For a low low price of you too can ...,0
46,Positiva,1983596849,Sun May 31 13:27:46 PDT 2009,NO_QUERY,allora,@Gastro1 just checked out their site..the clos...,"['checked', 'site', 'close', 'shop', 'berlin',...",0.1,1.0,14,Gastro just checked out their site the clos...,0
51,Negativa,1684849864,Sat May 02 22:14:18 PDT 2009,NO_QUERY,KBernice,PACQUIAO! Hatton got owned! Second round too!,"['pacquiao', 'hatton', 'got', 'owned', 'second...",-0.1,0.2,7,PACQUIAO Hatton got owned Second round too,0
54,Negativa,2066820884,Sun Jun 07 11:13:55 PDT 2009,NO_QUERY,Alica87,"@bubblything Dear, I'm so sorry, I forgot that...","['dear', 'sorry', 'forgot', 'understand', 'ger...",-0.25,0.5,17,bubblything Dear I m so sorry I forgot that...,0


In [18]:
positives = df_tweet['predictions'][df_tweet.predictions == 2]
neutral = df_tweet['predictions'][df_tweet.predictions == 1]
negatives = df_tweet['predictions'][df_tweet.predictions == 0]

print('number of positve tagged sentences is:  {}'.format(len(positives)))
print('number of neutral tagged sentences is: {}'.format(len(neutral)))
print('number of negative tagged sentences is: {}'.format(len(negatives)))
print('total length of the data is:            {}'.format(df_tweet.shape[0]))

number of positve tagged sentences is:  3625
number of neutral tagged sentences is: 4663
number of negative tagged sentences is: 1712
total length of the data is:            10000


In [19]:
df_tweet.to_csv('./output/df_tweets_predictions.csv', index=False)

1    4663
2    3625
0    1712
Name: predictions, dtype: int64