# Tweet classification using word embeddings and an LSTM

In [25]:
# Import libraries

import re
import os

import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Embedding, concatenate, Dropout, concatenate,Input
from tensorflow.keras.layers import Bidirectional

import nltk
import string
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yassine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yassine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Load the labelled training tweets ('train.csv' is resolved relative to the
# working directory) and preview the first rows.
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,TweetId,Label,TweetText
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...
1,304834304222064640,Politics,'@rraina1481 I fear so'
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...


## Preprocessing tweets, building the embedding matrix, and creating and training the LSTM model

In [87]:
class Classifier:
    """
    End-to-end pipeline for binary tweet classification using pretrained
    word embeddings and a bidirectional LSTM.

    Intended call order: ``preprocess()`` -> ``train()`` -> ``evaluate()`` /
    ``predict(...)``. Each step stores its results on the instance
    (``tokenizer``, ``embedding_matrix``, ``max_len``, ``X_train`` ...,
    ``model``, ``yhat``, ``acc``) for the following steps to use.
    """

    # Sigmoid output strictly above this value is mapped to class 1.
    THRESHOLD = 0.5

    # Translation table that deletes every ASCII punctuation character.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    # Lazily filled, class-wide cache of the NLTK English stop words so the
    # corpus is loaded once instead of once per word.
    _stop_words_cache = None

    def __init__(self, X: list, Y: list, embed_path: str, embed_dim: int, epochs=10, batch_size=256):
        """
        Args:
            X: raw tweet texts.
            Y: labels aligned with ``X`` (the model ends in a single sigmoid
                unit, so these are expected to be 0/1).
            embed_path: path to a text embedding file with one
                ``word v1 v2 ... vN`` entry per line, space separated.
            embed_dim: number of components of each embedding vector.
            epochs: number of training epochs passed to ``model.fit``.
            batch_size: batch size passed to ``model.fit``.
        """
        self.X = X
        self.Y = Y
        self.embed_path = embed_path
        self.embed_dim = embed_dim
        self.epochs = epochs
        self.batch_size = batch_size

    def preprocess(self):
        """
        Split the data 70/30, clean the texts, fit the tokenizer on the
        training texts, build the embedding matrix and pad the sequences.

        Sets ``tokenizer``, ``embedding_matrix``, ``max_len``, ``X_train``,
        ``X_test``, ``Y_train`` and ``Y_test`` on the instance.
        """
        # Split
        X_train, X_test, Y_train, Y_test = train_test_split(
            self.X, self.Y, test_size=0.3, random_state=42)

        # Preprocessing the text
        X_train = [self.clean_text(text) for text in X_train]
        X_test = [self.clean_text(text) for text in X_test]
        Y_train = np.asarray(Y_train)
        Y_test = np.asarray(Y_test)

        # Tokenizing the text (vocabulary comes from the training split only)
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(X_train)
        self.tokenizer = tokenizer

        # Creating the embedding matrix with this class's own helper; the
        # previous code instantiated an `Embeddings` class that is not defined
        # in this notebook and therefore failed on a fresh kernel.
        self.embedding_matrix = self.create_embedding_matrix(
            tokenizer, len(tokenizer.word_counts))

        # Longest cleaned training text, in tokens. Clamped to at least 1 so
        # the model input never has a zero-length time axis.
        longest = max((len(text.split()) for text in X_train), default=0)
        self.max_len = max(1, int(longest))

        # Creating the padded input for the deep learning model
        X_train = self.string_to_tensor(X_train, self.tokenizer, self.max_len)
        X_test = self.string_to_tensor(X_test, self.tokenizer, self.max_len)
        self.X_train, self.X_test, self.Y_train, self.Y_test = X_train, X_test, Y_train, Y_test

    # Train rnn model
    def train(self):
        """
        Build the network and fit it on the training split.

        Requires ``preprocess()`` to have been called first.

        Returns:
            The fitted model (also stored as ``self.model``).
        """
        model = self.Rnn_model(
            embedding_matrix=self.embedding_matrix,
            embedding_dim=self.embed_dim,
            max_len=self.max_len
        )
        model.fit(
            self.X_train,
            self.Y_train,
            batch_size=self.batch_size,
            epochs=self.epochs
        )
        self.model = model
        return self.model

    # Model architecture
    def Rnn_model(self, embedding_matrix, embedding_dim, max_len):
        """
        Build and compile the recurrent network.

        Architecture: embedding layer initialised from ``embedding_matrix``,
        two stacked bidirectional LSTMs (256 and 150 units), two dense ReLU
        layers with dropout in between, and a single sigmoid output unit.

        Args:
            embedding_matrix: array of shape (vocabulary rows, embedding_dim)
                holding the pretrained vectors.
            embedding_dim: size of each embedding vector.
            max_len: length of the padded input sequences.

        Returns:
            The compiled model (binary cross-entropy loss, Adam optimizer).
        """
        inp1 = Input(shape=(max_len,))
        # NOTE(review): the `weights=` constructor argument is not accepted by
        # every Keras release — confirm against the installed version.
        x = Embedding(embedding_matrix.shape[0], embedding_dim, weights=[embedding_matrix])(inp1)
        x = Bidirectional(LSTM(256, return_sequences=True))(x)
        x = Bidirectional(LSTM(150))(x)
        x = Dense(128, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(64, activation="relu")(x)
        x = Dense(1, activation="sigmoid")(x)
        model = Model(inputs=inp1, outputs=x)

        model.compile(loss = 'binary_crossentropy', optimizer = 'adam')
        return model

    def predict(self, text: list):
        """
        Predict class ids for raw texts.

        Args:
            text: list of raw strings. A single string is accepted as well and
                treated as a one-element list.

        Returns:
            A list with one 0/1 prediction per input text (empty for empty
            input).
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(text, str):
            text = [text]

        cleaned = [self.clean_text(t) for t in text]
        if not cleaned:
            return []

        tensor = self.string_to_tensor(cleaned, self.tokenizer, self.max_len)
        yhat = [row[0] for row in self.model.predict(tensor).tolist()]

        return [1 if p > self.THRESHOLD else 0 for p in yhat]

    def evaluate(self):
        """
        Score the trained model on the hold-out split created by
        ``preprocess()``.

        Stores the raw sigmoid outputs in ``self.yhat`` and the accuracy in
        ``self.acc``.

        Returns:
            The accuracy, or ``None`` when the test inputs or test labels are
            empty.
        """
        # X_test is already cleaned and padded by `preprocess`.
        if len(self.X_test) == 0:
            return None

        yhat = [row[0] for row in self.model.predict(self.X_test).tolist()]
        self.yhat = yhat

        # Accuracy is only defined when true labels are available.
        if len(self.Y_test) == 0:
            return None

        self.acc = accuracy_score(self.Y_test, [1 if p > self.THRESHOLD else 0 for p in yhat])
        return self.acc

    # Embedding
    def get_coefs(self, word, *arr):
        """Return ``(word, vector)`` with the components parsed as float32."""
        return word, np.asarray(arr, dtype='float32')

    def get_embedding_index(self):
        """
        Read the embedding file at ``self.embed_path``.

        Returns:
            dict mapping each word to its float32 vector. Lines that carry no
            vector components (e.g. blank lines) are skipped.
        """
        embeddings_index = {}
        # Explicit encoding so the result does not depend on the platform
        # default; the context manager guarantees the file is closed.
        with open(self.embed_path, encoding='utf8', errors='ignore') as embed_file:
            for line in embed_file:
                fields = line.rstrip().split(" ")
                if len(fields) < 2:
                    continue
                word, coefs = self.get_coefs(*fields)
                embeddings_index[word] = coefs
        return embeddings_index

    def create_embedding_matrix(self, tokenizer, max_features):
        """
        Build the embedding matrix for the tokenizer's vocabulary.

        Args:
            tokenizer: object exposing ``word_index`` (word -> integer index).
            max_features: largest word index to include.

        Returns:
            Array of shape ``(max_features + 1, self.embed_dim)``. Row ``i``
            holds the pretrained vector of the word with index ``i``; rows for
            words without a usable vector stay all-zero.
        """
        model_embed = self.get_embedding_index()

        embedding_matrix = np.zeros((max_features + 1, self.embed_dim))
        expected_shape = (self.embed_dim,)
        for word, index in tokenizer.word_index.items():
            # Skip (rather than stop at) out-of-range indices so the result
            # does not depend on the iteration order of `word_index`.
            if index > max_features:
                continue
            vector = model_embed.get(word)
            # Reject vectors of the wrong length: a shorter one (e.g. from a
            # header line) would otherwise broadcast silently into the row.
            if vector is not None and vector.shape == expected_shape:
                embedding_matrix[index] = vector
        return embedding_matrix

    def string_to_tensor(self, string_list: list, tokenizer, max_len) -> np.ndarray:
        """
        Convert a list of strings into a padded integer array for the model.

        Args:
            string_list: cleaned texts.
            tokenizer: fitted tokenizer providing ``texts_to_sequences``.
            max_len: length every sequence is padded/truncated to.

        Returns:
            The array produced by ``pad_sequences``.
        """
        sequences = tokenizer.texts_to_sequences(string_list)
        return pad_sequences(sequences, maxlen=max_len)

    def clean_text(self, t: str) -> str:
        """
        Clean a tweet: remove links, HTML tags, punctuation and English stop
        words, lower-case it and collapse whitespace.

        Args:
            t: raw text.

        Returns:
            The cleaned text with words separated by single spaces.
        """
        # Load the stop words once and keep them as a set for O(1) lookups.
        if Classifier._stop_words_cache is None:
            Classifier._stop_words_cache = frozenset(stopwords.words('english'))
        stop_words = Classifier._stop_words_cache

        # Cleaning the urls
        t = re.sub(r'https?://\S+|www\.\S+', '', t)

        # Cleaning the html elements
        t = re.sub(r'<.*?>', '', t)

        # Removing the punctuation, then converting the text to lower case
        t = t.translate(Classifier._PUNCT_TABLE).lower()

        # Removing stop words; split/join also normalises all whitespace
        return ' '.join(word for word in t.split() if word not in stop_words)

In [80]:
# Class id -> human-readable label (Politics = 0, Sports = 1).
class_dict = dict(enumerate(("Politics", "Sports")))

In [96]:
# Unprocessed tweets
X = data['TweetText'].tolist()

# Labels: Politics = 0 and Sports = 1
class_dict = {0: "Politics",
              1: "Sports"}

# Invert the id -> name mapping once instead of rebuilding the key/value lists
# and scanning them for every row. A label missing from `class_dict` raises
# KeyError here.
label_to_id = {name: class_id for class_id, name in class_dict.items()}
Y = [label_to_id[label] for label in data['Label'].tolist()]

# Pretrained GloVe vectors; `embed_dim` must match the vector size in the file
embed_path="glove.6B.300d.txt"
embed_dim=300

classifier = Classifier(X, Y, embed_path, embed_dim, epochs=6, batch_size=256)

In [97]:
# Prepare training data: train/test split, text cleaning, tokenizer fitting,
# embedding matrix construction and sequence padding
classifier.preprocess()

In [98]:
# Start training; `train` builds the network, fits it on the training split
# and returns the fitted model (also kept on `classifier.model`)
model = classifier.train()

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [104]:
# Evaluate the model: accuracy on the 30% hold-out split created by `preprocess`
acc = classifier.evaluate()

In [105]:
# Display the hold-out accuracy returned by `classifier.evaluate()`
acc

0.9376915219611849

In [101]:
# Label the tweets from test.csv with the trained classifier and show the result
test_df = pd.read_csv("test.csv")
test_text = list(test_df["TweetText"])
pred_list = list(classifier.predict(test_text))
# Translate the predicted class ids back to their label names
test_df["predicted_Lables"] = list(map(class_dict.__getitem__, pred_list))
test_df

Unnamed: 0,TweetId,TweetText,predicted_Lables
0,306486520121012224,'28. The home side threaten again through Maso...,Sports
1,286353402605228032,'@mrbrown @aulia Thx for asking. See http://t....,Sports
2,289531046037438464,'@Sochi2014 construction along the shores of t...,Politics
3,306451661403062273,'#SecKerry\u2019s remarks after meeting with F...,Politics
4,297941800658812928,'The #IPLauction has begun. Ricky Ponting is t...,Sports
...,...,...,...
2605,282023761044189184,'Qualifier 1 and Eliminator games will be play...,Sports
2606,303879735006601216,"@reesedward Hi Edward, it's not a #peacekeepin...",Politics
2607,297956846046703616,'Perera was @SunRisersIPL first #IPL purchase ...,Sports
2608,304265049537658880,"'#SecKerry: Thanks to Senator @TimKaine, @RepR...",Politics


In [110]:
# A few hand-written sentences to sanity-check the classifier
text = [
    "I'm pretty sure that EN-Nesiry will play in a better team in Europe.",
    "Trump lost the election.",
    "Obama was better than trump",
    "Iran killed another country's president",
]

pred_list = list(classifier.predict(text))

df = pd.DataFrame(np.array(text), columns=["Text"])
# Translate the predicted class ids back to their label names
df["predicted_Lables"] = list(map(class_dict.__getitem__, pred_list))
df

Unnamed: 0,Text,predicted_Lables
0,I'm pretty sure that EN-Nesiry will play in a ...,Sports
1,Trump lost the election.,Politics
2,Obama was better than trump,Politics
3,Iran killed another country's president,Politics
