## Twitter Classification
Applying Natural Language Processing to classify if a tweet is about a real disaster or not

Kindly find the required Glove Embedding text file (glove.6B.100d.txt) from:
https://nlp.stanford.edu/projects/glove/


Created by: Brandon Spiteri

### Library Import

In [1]:
# Import necessary packages
import re
from os.path import join
import numpy as np
from sklearn.model_selection import GridSearchCV 
from sklearn.naive_bayes import MultinomialNB
from sklearn import naive_bayes
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os
import pickle

from keras.layers import Dense, Dropout, Embedding, LSTM

from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical

from keras.preprocessing.sequence import pad_sequences

from keras.preprocessing.text import Tokenizer

import numpy as np

import gensim

from sklearn.linear_model import LogisticRegression

#seed keras model 
from numpy.random import seed
seed(1)
import tensorflow
tensorflow.random.set_seed(2)

from keras.models import Sequential
from keras.models import load_model

from nltk.corpus import stopwords
import csv 

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
  
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from keras.callbacks import EarlyStopping


from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score



### Tweet preprocessing

In [2]:

def preprocess(filename,output = False):
    '''
    filename - csv file to process, clean and tokenise
    Preprocess function reads the csv file, extracts the tweet id, keyword, sentiment and message. 
    Regex functions are applied to clean the tweets.
    The content is tokenised.
    '''
    clean_content_tokenized = []
    tweet_sentiment =[]
    tweet_id_list =[]
    documents = []
    tweet_keyword_list=[]

    
    with open(filename,'r', encoding="utf8") as f:
        next(f)

        tweet = csv.reader(f)
        for row in tweet: 

            # store 1st col of tweet as tweet_id
            tweet_id = int(row [0])
            tweet_id_list.append(tweet_id)

            #store 2nd col as keyword
            tweet_keyword = row[1]
            tweet_keyword_list.append(tweet_keyword) 

            if output ==True:
                # second col as sentiment (pos/neg/neutral)
                sentiment = int(row[4])
                tweet_sentiment.append(sentiment)


            #4th column as output
            tweet_message = row[3]

            # @user
            content = re.sub(r"@[A-Za-z0-9_]+", " USERNAME ", tweet_message.lower())
            # URL link
            content = re.sub(r"http\S+", " URLLINK ", content)
            #Remove repeater letter in a word e.g. heeeello to hello
            content = re.sub(r"([A-Za-z])\1{2,}", r"\1", content)
            #Replace all whitespace characters 
            content = re.sub(r"\s", " ", content)
            # Replace EOS with END
            content = re.sub(r"(\.|!|\?) ", " END ", content)
            # Remove non-alphanumeric characters except spaces
            content = re.sub(r"[^A-Za-z0-9 ]", "", content) 
            #Remove Pure Digits
            content = re.sub(r"\bd+\b", "", content)
            #Remove Single Letter words eg "a"
            content = re.sub(r"\b[a-z0-9]\b", "", content)
            #tokenize content
            document =  nltk.word_tokenize(content)
            clean_content_tokenized.append(document)
            
    return tweet_id_list,tweet_keyword_list, clean_content_tokenized, tweet_sentiment 


In [3]:
stopwords = stopwords.words('english')

def treebank_pos(word_tag):
    #Return TREEBANK TAG Part-of-speech tag
    if word_tag.startswith('V'): #verb
        return 'v'
    elif word_tag.startswith('N'): #noun 
        return 'n'
    elif  word_tag.startswith('J'): #adjective
        return 'a'
    elif word_tag.startswith('R'): #adverb
        return 'r'
    else:
        #set to noun if none is satistfied 
        return 'n'
    
def lemmatize_content (content):
    '''
    Transform words into their root from by using lemmisation with Treebank POS tagging
    '''
    content_lemmatized = []
    content_pos=[]
    content_clean=[]
    
    #lemmatize using POS tag
    lemmatizer = nltk.stem.WordNetLemmatizer()

    #POS tagging
    for tweet in content:
        #assign POS to each word 
        temp_pos = nltk.pos_tag(tweet)
        content_pos.append(temp_pos)  
        
    for tweet in content_pos:
        #Perform lemmatization on each POS word and return the lemmatized word as list
        temp_lem = [ lemmatizer.lemmatize(word[0], pos=treebank_pos(word[1])) for word in tweet]
        #Remove stop words
        temp_lem = [word for word in temp_lem if word not in stopwords] 
        content_lemmatized.append(temp_lem) 

    for tweet in content_lemmatized:
        #Concatenate content 
        temp_concat_content = " ".join(tweet)
        content_clean.append(temp_concat_content)
        
    return content_clean, content_lemmatized

In [4]:
def preprocess_keyword(keyword):
    '''
    Clean twitter keyword
    Use porterstemmer to reduce keywords to their root form
    '''
    keyword_clean=[]
    for key in keyword:
        keyword_clean.append(re.sub(r"%20", " ", key.lower()))
    
    ps = PorterStemmer()
    
    stemmed_key =[]
    for w in keyword_clean:
        stemmed_key.append(ps.stem(w))
    return stemmed_key

### Preprocess Tweets
Preprocess tweets by extracting tweet id, keyword, twitter content and sentiment from csv file
Clean tweets and tokenise content. 

In [5]:
preprocessed_tweets = {}
preprocessed_tokenized_tweets = {}
tweetgts = {} 
tweetids = {}
tweetkey = {}

tweetids['train'], tweetkey['train'], clean_content_tokenized, tweetgts['train']  = preprocess('train.csv', True)

preprocessed_tweets['train'], preprocessed_tokenized_tweets['train'] = lemmatize_content(clean_content_tokenized)
                        
        

In [6]:
#stemm keyword so that we try and reduced keywords to their rootform
tweetkey['train'] = preprocess_keyword(tweetkey['train'])

In [7]:
#output classes are almost balanced
np.unique(tweetgts['train'], return_counts = True)



(array([0, 1]), array([4342, 3271], dtype=int64))

###  Feature Function Declaration

In [8]:
#One Hot Encode Keyword
enc = OneHotEncoder(handle_unknown='ignore')
keyword_pd = pd.DataFrame(data = tweetkey['train'])
encoded_key = enc.fit_transform(keyword_pd)


In [9]:
#Feature processing using TFIDF
# word level tf-idf
def tfidf_word_level(content_train):
    #Vectorize content by word with 2000 max features capping
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\S+', max_features= 2000)
    tfidf_vect.fit(content_train)
    xtrain_tfidf =  tfidf_vect.transform(content_train)

    xtrain_tfidf_np = xtrain_tfidf.todense()
    xtrain_tfidf_np = np.array(xtrain_tfidf_np)

    #return train set and vectorizer
    return (xtrain_tfidf_np, tfidf_vect)

# ngram level tf-idf
def tfidf_ngram_level(content_train):
    #Perform Bigram and Trigram Vectorization with max features of 5000
    tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\S+', ngram_range=(2,3), max_features=5000) #non whitespace chars
    tfidf_vect_ngram.fit(content_train)
    xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(content_train)
    
    xtrain_tfidf_ngram_np = xtrain_tfidf_ngram.todense()
    xtrain_tfidf_ngram_np= np.array(xtrain_tfidf_ngram_np)
    
    #return train set and vectorizer
    return (xtrain_tfidf_ngram_np, tfidf_vect_ngram)


#LSTM modelling
def glove_LSTM_model(content_train, Y_train, no_tokens , embedding_dim):

    # tokenize input content
    tk = Tokenizer(num_words=no_tokens)
    tk.fit_on_texts(content_train)
    #convert text to sequence
    content_seq = tk.texts_to_sequences(content_train)
    
    #Build Glove Dictionary
    glove_file = 'glove.6B.100d.txt'
    embedding_dict = {}
    with open(glove_file,'r', encoding="utf8") as glove:
        for line in glove:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embedding_dict[word] = vector

    #zero padding numpy array 
    embedding_matrix = np.zeros((no_tokens, embedding_dim))
    
    #retrieve 100 dimensional vector for each word
    for word, index in tk.word_index.items():
        if index < no_tokens:
            vect = embedding_dict.get(word)
            if vect is not None:
                embedding_matrix[index] = vect
        else:
            break
    
    #get max train length
    max_length = np.max([len(text.split()) for text in content_train])
    #pad sequence
    content_seq_trunc = pad_sequences(content_seq, maxlen=max_length)
    
    #encoding output
    encoder = LabelEncoder()
    y_train_encoded = encoder.fit_transform(Y_train)
    y_train_categorical = to_categorical(y_train_encoded)
    
    earlystopping=EarlyStopping(monitor="val_loss", patience=25, verbose=2, mode='auto', restore_best_weights=True)

    #train model
    model = Sequential()
    model.add(Embedding(no_tokens, embedding_dim, weights = [embedding_matrix], trainable=False, input_length=max_length ))
    model.add((LSTM(80, return_sequences=True)))
    model.add(Dropout(0.2))
    model.add((LSTM(32)))
    model.add(Dropout(0.2))
    model.add(Dense(units=2, activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='rmsprop',metrics = ['accuracy'])
    History = model.fit(content_seq_trunc,y_train_categorical,epochs = 100,batch_size=20,validation_split =0.2, callbacks=[earlystopping])
    
    return tk, max_length, model

### Dataset Preperation

In [10]:
#extract features: word and ngram level tf-idf
(xtrain_tfidf_np, tfidf_word_transformer) = tfidf_word_level(preprocessed_tweets['train'])        
(xtrain_tfidf_ngram_np, tfidf_ngram_transformer) = tfidf_ngram_level(preprocessed_tweets['train'])

In [11]:
#concatenate features
xtrain_concat_features = np.concatenate((xtrain_tfidf_np, xtrain_tfidf_ngram_np,encoded_key.todense()), axis=1)

### Train Test Split

In [12]:
X_train, X_valid, y_train, y_valid = train_test_split(xtrain_concat_features, tweetgts['train'], test_size=0.2,shuffle=True)


In [13]:
indices = np.arange(len(tweetgts['train']))

In [14]:
id_train, id_valid, y_train, y_valid = train_test_split(indices, tweetgts['train'], test_size=0.2,shuffle=True)
X_train_first_classifiers = xtrain_concat_features[id_train ,  :]
X_valid_first_classifiers = xtrain_concat_features[id_valid ,  :]


preprocessed_tweets_np = np.array(preprocessed_tweets['train'])
X_train_lstm_classifier = preprocessed_tweets_np[id_train]
X_valid_lstm_classifier = preprocessed_tweets_np[id_valid]

### Model Training

### MultinomialNB Classifier

In [15]:
# Train sentiment Classifier 1 
params = {'alpha': [0.001,0.01,0.1,1,10,50], 'fit_prior':[True,False]}
model_nb_grid = GridSearchCV(naive_bayes.MultinomialNB(), params, n_jobs=-1, verbose=2, cv=3)
model_nb_grid.fit(X_train_first_classifiers,y_train)
Y_predicted_model_nb = model_nb_grid.predict(X_valid_first_classifiers)
accuracy_score(Y_predicted_model_nb, y_valid)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


0.7977675640183848

### LogisticRegression Classifier

In [16]:
# Train sentiment Classifier 2
params = {'C': [0.001,0.01,0.1,1,10,50]}
model_me_grid = GridSearchCV(LogisticRegression(), params, n_jobs=-1, verbose=-1, cv=3)
model_me_grid.fit(X_train_first_classifiers,y_train)
Y_predicted_model_me = model_me_grid.predict(X_valid_first_classifiers)
accuracy_score(Y_predicted_model_me, y_valid)

0.7931713722915299

### LSTM Classifier

In [17]:
# Train sentiment Classifier 3
# Training an LSTM model with early stopping and Glove word embedding

no_tokens = 5000  # Max no of tokens
embedding_dim = 100  # Embedding dimensionality

#fit model
tokenizer_lstm, max_length_lstm, lstm_model = glove_LSTM_model(X_train_lstm_classifier.tolist(),y_train, no_tokens , embedding_dim)

#tokenize and pad test tweets
xvalid_seq = tokenizer_lstm.texts_to_sequences(X_valid_lstm_classifier.tolist())
xvalid_seq_trunc = pad_sequences(xvalid_seq, maxlen=max_length_lstm)

predict_lstm_categorical = lstm_model.predict(xvalid_seq_trunc)
Y_predicted_lstm = np.argmax(predict_lstm_categorical, axis=1)
accuracy_score(Y_predicted_lstm, y_valid)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 00027: early stopping


0.7859487852921865

### Model Evaluation
From the following results, it can be concluded that with a 20% validation split on the labelled training set, all models perform almost equally well. Logistic Regression turned out to be the top perfomer.

The Multinomial Naive Bayes and the Logistic Regression models have the tweet encoded keyword, the word level and the ngram TF-IDF as input features. 
The LSTM model is based on the Glove 100D Embedding as input feature. 


In [18]:
#create dataframe for results storage
results = pd.DataFrame(columns=['Accuracy','F1-Micro', 'F1-Macro'])

In [19]:
def compute_metrics(act, pred):    
    f1_micro= f1_score(act, pred,average='micro')
    f1_macro = f1_score(act, pred,average='macro')
    accuracy= accuracy_score(act, pred)
    return accuracy, f1_micro, f1_macro

In [20]:
#MultinomialNB Results
accuracy, f1_micro, f1_macro = compute_metrics(Y_predicted_model_nb, y_valid)
row = pd.Series({'Accuracy': accuracy, 'F1-Micro': f1_micro, 'F1-Macro': f1_macro},name='MultinomialNB')
results = results.append(row)

#LogisticRegression Results
accuracy, f1_micro, f1_macro = compute_metrics(Y_predicted_model_me, y_valid)
row = pd.Series({'Accuracy': accuracy, 'F1-Micro': f1_micro, 'F1-Macro': f1_macro},name='LogisticRegression')
results = results.append(row)

#LSTM Results
accuracy, f1_micro, f1_macro = compute_metrics(Y_predicted_lstm, y_valid)
row = pd.Series({'Accuracy': accuracy, 'F1-Micro': f1_micro, 'F1-Macro': f1_macro},name='LSTM')
results = results.append(row)


In [21]:
results

Unnamed: 0,Accuracy,F1-Micro,F1-Macro
MultinomialNB,0.797768,0.797768,0.787868
LogisticRegression,0.793171,0.793171,0.784938
LSTM,0.785949,0.785949,0.768959


### Test Prediction

In [22]:
tweetids['test'], tweetkey['test'], clean_content_tokenized_test, tweetgts['test'],  = preprocess('test.csv', False)

preprocessed_tweets['test'], preprocessed_tokenized_tweets['test'] = lemmatize_content(clean_content_tokenized_test)
    

In [23]:
#stemm keyword so that we try and reduced keywords to their rootform
tweetkey['test'] = preprocess_keyword(tweetkey['test'])

In [24]:
#One Hot Encode Keyword
keyword_pd_test = pd.DataFrame(data = tweetkey['test'])
encoded_key_test = enc.fit_transform(keyword_pd_test)


In [25]:
#extract features: word and ngram level tf-idf
(xtest_tfidf_np, tfidf_word_transformer) = tfidf_word_level(preprocessed_tweets['test'])        
(xtest_tfidf_ngram_np, tfidf_ngram_transformer) = tfidf_ngram_level(preprocessed_tweets['test'])
#concatenate features
xtest_concat_features = np.concatenate((xtest_tfidf_np, xtest_tfidf_ngram_np,encoded_key_test.todense()), axis=1)

In [26]:
#Perform Predictions

#MultinomialNB
Y_predicted_MultinomialNB_test = model_nb_grid.predict(xtest_concat_features)

#LogisticRegression
Y_predicted_LogisticRegression_test = model_me_grid.predict(xtest_concat_features)

#LSTM
preprocessed_tweets_np_test = np.array(preprocessed_tweets['test'])
#tokenize and pad test tweets
xvalid_seq = tokenizer_lstm.texts_to_sequences(preprocessed_tweets_np_test.tolist())
xvalid_seq_trunc = pad_sequences(xvalid_seq, maxlen=max_length_lstm)

predict_lstm_categorical = lstm_model.predict(xvalid_seq_trunc)
Y_predicted_lstm_test = np.argmax(predict_lstm_categorical, axis=1)

In [28]:
#Export To Files
np.savetxt('MultiNomianlNB_test.csv', Y_predicted_MultinomialNB_test, delimiter=',', fmt='%d')
np.savetxt('LogisticRegression_test.csv', Y_predicted_LogisticRegression_test, delimiter=',', fmt='%d')
np.savetxt('LSTM_test.csv', Y_predicted_lstm_test, delimiter=',', fmt='%d')