### Important Note

Before selecting the model described bellow, different models and architectures were implemented.

The different models trained, along with their performance scores are available on the Readme file.

For space purpose, in this notebook, only the prefered model is presented

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import itertools
import seaborn as sn


# Confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
#Preparation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split

#Layers
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import BatchNormalization
from keras.models import load_model
from keras.layers import Bidirectional
from keras.layers import Dropout

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
#Removing bad canditates with tweets with length less than 3 and over 50
def check_length(tweet):
    length = len([word for word in tweet.split()])
    if (length > 3) & (length < 50) :
        return(True)
    else:
        return(False)

**Preparing the Dataset for Training**

* In order to reduce the size of the Happy and sad class, tweets that are less than 3 or over 50 words are removed.
* In addition, to keep the classes balanced, a sample of 15000 tweets is obtained from both classes

In [3]:
#Will add all preprocessing here!
#Scrapping the last set of tweets!
df = pd.read_csv('final1.csv')
del df['Unnamed: 0']

#for happy:
happy = df[df['Class'] == 'Happy']
happy['Check'] = happy.apply(lambda row : check_length(row['Tweet']) , axis = 1)
happy = happy[happy['Check'] == True].sample(15000)
#For sadness
sad = df[df['Class'] == 'Sadness']
sad['Check'] = sad.apply(lambda row : check_length(row['Tweet']) , axis = 1)
sad = sad[sad['Check'] == True].sample(15000) #Increasing the sample here

merged = happy.append(sad)
del merged['Check']


final = df.loc[(df['Tweet'].isin(merged['Tweet']))|(df['Class']=='Anger')|(df['Class']=='Fear')].drop_duplicates(['Tweet'])
classes = final.groupby('Class').count()
print(classes)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


         Tweet
Class         
Anger    11181
Fear      9533
Happy    14288
Sadness  13801


* Tokenizing
* Padding
* Encoding the labels
* Splitting into train-test 
* Initializing the vocabulary

In [4]:
#Data Preparation

#Lengths of max(for padding)
max_length = np.max([len(tweet.split()) for tweet in final['Tweet']])

#Tokenizing
tk = Tokenizer(lower=True , split=" ", filters='$%^*)(][_~}{:;\/') #filtering out some panchuations more
tk.fit_on_texts(final['Tweet'])
total_words = len(tk.word_index) + 1 #total words
#Converting
tweet_seq = tk.texts_to_sequences(final['Tweet'])

#padding
tweet_length = max_length
tweets_seq_pad = pad_sequences(tweet_seq , maxlen = tweet_length)

#Encoding the labels
le = LabelEncoder()
labels_encoded = le.fit_transform(final['Class'])
labels_cat = to_categorical(labels_encoded)

#Spliting train-test
xtrain , xtest , ytrain , ytest = train_test_split(tweets_seq_pad , labels_cat , test_size = 0.1 , random_state = 37)
vocabulary = np.zeros((total_words , 200)) # <-creating the voc matrix, its of shape (total_words,200) where 200 is the dim of glove




### Embedding Layer

As the Embedding Layer is extremely important for LSTM models , we will use the pre-trained GloVe layer, specialized for twitter's vocabulary.

The 200-dimensions Layer was selected.

In [5]:
#Preparing the embedding layer

print('Loading Vectors')
print('...............')

embeddings = dict()
f = open('D:/Big Data/project/embeddings/glove.twitter.27B.200d.txt', 'r', encoding = "utf-8")
for line in tqdm(f):
    values = line.split()
    word = values[0] #first element is the word
    #print("\n",word)
    coefs = np.asarray(values[1:],dtype='float32') #all the other are the 200 coefs
    embeddings[word] = coefs
f.close()
print('Loaded ', len(embeddings)," words vectors'")
for word,i in tk.word_index.items(): #from this dictionary i am getting the words
    word_vect = embeddings.get(word) # <- for this word i am getting each for the 200 values
    if word_vect is not None: # <- if the word exists in the dictionary and its not None
        vocabulary[i] = word_vect #<- i am adding it to the vocabulary,
#Creating the Embeddings layer:
embedding_layer = Embedding(input_dim = total_words, #My total words
                            output_dim = 200 ,         #The output is the number of dimensions for each word
                            weights = [vocabulary],    #The weights i have already defined
                           input_length = tweet_length,  #The padded size
                           trainable = False)  #freazing the weights 

Loading Vectors
...............


1193514it [01:30, 13238.25it/s]


Loaded  1193514  words vectors'


### Model selection

This particular model was selected as it produce the best F1 and accuracy score.

In [6]:
#Preparing the model
model = Sequential()
model.add(embedding_layer)
model.add(Bidirectional(LSTM(256,
               dropout = 0.5, 
               recurrent_dropout = 0.5,
               return_sequences = True)))
#Second Layer
model.add(Bidirectional(LSTM(128,
               dropout = 0.5,
               recurrent_dropout = 0.5 , return_sequences = False)))

model.add(Dense(4,activation = 'softmax'))
model.compile(optimizer = 'adam', 
               loss = 'categorical_crossentropy', 
               metrics = ['acc'])
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 82, 200)           9171400   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 82, 512)           935936    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               656384    
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 1028      
Total params: 10,764,748
Trainable params: 1,593,348
Non-trainable params: 9,171,400
_________________________________________________________________


In [7]:
history = model.fit(xtrain, ytrain,
                    validation_split = 0.15,
                    epochs = 42,
                        batch_size = 256,
                        verbose = 2)

Train on 37333 samples, validate on 6589 samples
Epoch 1/42
 - 1255s - loss: 1.0844 - acc: 0.5243 - val_loss: 0.9349 - val_acc: 0.6040
Epoch 2/42
 - 1432s - loss: 0.9883 - acc: 0.5790 - val_loss: 0.9115 - val_acc: 0.6134
Epoch 3/42
 - 1333s - loss: 0.9530 - acc: 0.5986 - val_loss: 0.8774 - val_acc: 0.6348
Epoch 4/42
 - 1463s - loss: 0.9217 - acc: 0.6179 - val_loss: 0.8619 - val_acc: 0.6441
Epoch 5/42
 - 1294s - loss: 0.8973 - acc: 0.6296 - val_loss: 0.8381 - val_acc: 0.6534
Epoch 6/42
 - 1265s - loss: 0.8821 - acc: 0.6389 - val_loss: 0.8378 - val_acc: 0.6555
Epoch 7/42
 - 1260s - loss: 0.8629 - acc: 0.6487 - val_loss: 0.8178 - val_acc: 0.6613
Epoch 8/42
 - 1257s - loss: 0.8480 - acc: 0.6527 - val_loss: 0.8043 - val_acc: 0.6707
Epoch 9/42
 - 1272s - loss: 0.8339 - acc: 0.6642 - val_loss: 0.8003 - val_acc: 0.6699
Epoch 10/42
 - 1248s - loss: 0.8259 - acc: 0.6629 - val_loss: 0.7920 - val_acc: 0.6746
Epoch 11/42
 - 1279s - loss: 0.8130 - acc: 0.6688 - val_loss: 0.7956 - val_acc: 0.6775
Epo

In [35]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 82, 200)           9171400   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 82, 512)           935936    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               656384    
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 1028      
Total params: 10,764,748
Trainable params: 1,593,348
Non-trainable params: 9,171,400
_________________________________________________________________


# Setting up  Streaming


In [8]:
import json
import tweepy
from textblob import TextBlob
from elasticsearch import Elasticsearch
import emoji
import itertools 
from datetime import datetime

import pickle
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.dicts.noslang import slangdict
from geopy import geocoders  
from urllib3.exceptions import ProtocolError


In [10]:
#Tweepy Set up
#setting up the consumer's keys
consumer_key = "Twitter is very strict and doesnt let me share this"
consumer_secret = "Twitter is very strict and doesnt let me share this"
access_token = "Twitter is very strict and doesnt let me share this"
access_token_secret = "Twitter is very strict and doesnt let me share this"

#setting up my authontication
auth = tweepy.OAuthHandler(consumer_key , consumer_secret)
auth.set_access_token(access_token , access_token_secret)
api = tweepy.API(auth , wait_on_rate_limit = True)

#Opening geopy connection to get the geolocation
gn = geocoders.GeoNames(username = 'filtheo')

#running bin/elasticsearch to have it ready
#opening localhost:9200
#Setting up elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])


### Pre-processing pipeline

Preparing the same pipeline used to clean the tweets used for training.

Before feeding a newly arrived tweet into the trained model, cleaning it the same way the training set was cleaned is vital

In [103]:
#Setting up pre-processing functions
pkl_file = open('extra1.p', 'rb') # => https://github.com/charlesmalafosse/FastText-sentiment-analysis-for-tweets/blob/master/betsentiment_sentiment_analysis_fasttext.py
extra = pickle.load(pkl_file)
pkl_file.close()

pkl_file = open('slang.p', 'rb') # => http://pydoc.net/ekphrasis/0.4.7/ekphrasis.dicts.noslang.slangdict/
slang = pickle.load(pkl_file)
pkl_file.close()

#Ekphprasis pipeline!

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize = ['url', 'email', 'percent', 'money', 'phone', 'user','time', 'url', 'date', 'number'],
    
     # terms that will be annotated =>flagged
    #I can add "hashtag" here too => I removed it because i dont know what to do
    annotate = {"allcaps", "elongated", "repeated",'emphasis', 'censored','hashtags'},
    fix_html = True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used for word segmentation 
    segmenter = "twitter", 
    
    # corpus from which the word statistics are going to be usedfor spell correction
    corrector = "twitter", 
    
    unpack_hashtags = True,  # perform word segmentation on hashtags <-removes the hashtag symbol an treats it as a word
    unpack_contractions = True,  # Unpack contractions (can't -> can not)
    spell_correct_elong = False,  # spell correction for elongated words
    
    
    
    #Tokenizes and then rejoins while getting rid of some terms
    
    #Set hashtags to true to keep hashtags 
    #I can set it to keep other stuff too:
    #See documentation: https://github.com/cbaziotis/ekphrasis/blob/master/ekphrasis/classes/tokenizer.py
    #On kwargs
    tokenizer = SocialTokenizer(lowercase = True , hashtags = True , emojis = True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. =>slang is a dic created and saved as pickle
    #documentation for dictionaries : http://pydoc.net/ekphrasis/0.4.7/ekphrasis.dicts.emoticons/
    dicts = [emoticons]
)

#replace repeating <user> and <url>
def repeated(tweet):
    if ('<user>' not in tweet) & ('<url>' not in tweet):
        return(tweet)
    else:
        cleaned_words = [word for word , zzzz in itertools.groupby(tweet.split())]
        return(" ".join(cleaned_words))

#Get geolocation
def get_geopoint(city):
      try:
        latitude = gn.geocode(city).latitude
        longitude = gn.geocode(city).longitude
        location = str(latitude)+","+str(longitude)
        return(location)
      except:
        return(None)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [104]:
#Preprocessing function
def pre_process(tweet):
  #Replacing panchuations and words
    tweet = tweet.encode('latin1','ignore').decode('utf-8','ignore')
    tweet = tweet.replace("’","'")#for some words
    tweet = tweet.replace("‘","'") #for some words
    tweet = tweet.replace('"','')
    tweet = tweet.replace("'","")
    tweet = tweet.replace('\n','..') #for newlines
    tweet = tweet.replace('&','and') #reducing pancuations
    tweet = tweet.replace(',','') #Gettind rid of commas

    #Demojiing
    tweet = emoji.demojize(tweet, delimiters=(" ", " "))

    #Stemming words from the dictionaries and replacing twitter Slang
    reformed = [extra[word] if word in extra else word for word in tweet.split()]
    tweet = " ".join(reformed)
    #For slang:
    reformed = [slang[word] if word in slang else word for word in tweet.split()]
    tweet = " ".join(reformed)
    #Ekprhasis pipeline
    tweet = " ".join(text_processor.pre_process_doc(tweet))
    #replacing consecutive <urls>
    #Adding it into an array so i can prepare it for my model
    tweet = [repeated(tweet)]
    #Tokenizing
    tweet_in_seq = tk.texts_to_sequences(tweet)
    #Padding
    tweet_seq_padded = pad_sequences(tweet_in_seq , maxlen = tweet_length)
    return(tweet_seq_padded)

#Return the emotion
def emotion_decoder(prediction):
    value = np.argmax(prediction[0])
    return(value)

### Initializing an ElasticSearch Index

In [113]:
#Creating the mapping for the elastic search index
settings = {                   
    "settings": {                        #<---Basic settings,slide 25
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {                       #<--maping is equal to schema! slide 28
        "properties": {                 #<- starting defying schema
                "Tweet": {             #<--text
                    "type": "text"
                },
                "Location": {               #<--for location
                    "type": "geo_point"     #<- defying geo_point so i can add it to the map
                },
                "Date": {                  #<-- for date
                    "type": "date"        #<- type date for line plot
                },
                "Emotion": {             #<-- predicted emotion
                    "type": "float"     #<- had a problem with int so i use float
                },
            }
        }
    }

#Creating a new index here
es.indices.create(index = 'final_final_test' , body = settings)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'final_final_test'}

### Streaming Pipeline

* Reading a new tweet
* Applying the pre-processing function to get the cleaned tweet
* Feeding the cleaned tweet into the model and getting the prediction
* Getting the geo-location of the author
* Saving the cleaned tweet, the emotion , the geolocation and the date to the initialized index

**In case an error occurs, the pipeline jumps to the next tweet**

In [None]:
#Setting up the Streaming
class TweetStreamListener(tweepy.StreamListener):

    # on success
    def on_data(self, data):
        try:

            # Getting The json
            dict_data = json.loads(data)
            #Getting the text
            text = dict_data["text"]
            #Pre-process tweet
            tweet = pre_process(text) #returns a sequance ready for the model
            #pass tweet into my model
            predicted = model.predict(tweet)
            #Decode the predicted label
            emotion = emotion_decoder(predicted).astype(float)
            #Get the Date
            date_created = str(dict_data['created_at'])
            formatted_date = datetime.strptime(date_created, '%a %b %d %H:%M:%S %z %Y')
            #Get geolocation
            geo = get_geopoint(dict_data['user']['location'])
            #Add data to elasticsearch index
            es.index(index = 'final_final_test',
                  body = {"Tweet": text,
                        "Location": geo,
                       "Date": formatted_date, #that must be date type
                       "Emotion": emotion, #that must be number type
                       })
            print('Tweet Approved,Writting Index')
            return True
        
        
        except:  #I got weird errors for specific tweets with different encoding
            print('Error,nothing Written!')

    # on failure
    def on_error(self, status):
        print (status)
        return(True)

if __name__ == '__main__':

    # create instance of the tweepy tweet stream listener
    listener = TweetStreamListener()

   # create instance of the tweepy stream
    stream = tweepy.Stream(auth, listener)
    while True: 
        try:
            stream.filter(track=['#coronavirus'],stall_warnings=True)   # search twitter for "#coronavirus"

        except (ProtocolError, AttributeError): #for some connection issues
            continue

  


Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Approved,Writting Index
Tweet Appr

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt

