# Importing libraries

In [None]:
import numpy as np
import math
import re
import time
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import re
import nltk
from contraction import CONTRACTION_MAP     # Its a py file contain expanded word of all short words like I'm
from bs4 import BeautifulSoup          

In [None]:
# Loading the dataset

# Download Sentiment 140 dataset from https://www.kaggle.com/kazanova/sentiment140
data = pd.read_csv("data_real.csv")
data.head()
data = data.replace(4,1)

# Data cleaning

In [None]:
# Data cleaning funstions

def remove_htmltags(text):                    # Remove HTML tags
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

def remove_accented_chars(text):             # Normalizing accented charaters like ü
    import unicodedata
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP): # Expanding short words iike I've --> I have
    from contraction import CONTRACTION_MAP
    import contraction
    import re
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def remove_special_characters(text, remove_digits=False):              # Remove special characters
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

def simple_stemmer(text):                                             # Stemming the words
    import nltk
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

def simple_lemmatize(text):                                          # lammetizing the words
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer() 
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

def remove_stopwords(text, is_lower_case=False):                     # Remove stopwords
    from nltk.corpus import stopwords
    from nltk.tokenize import WordPunctTokenizer
    tokenizer = WordPunctTokenizer()
    stopword_list =stopwords.words('english')
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

def remove_hash_attherate(text):                                         # Remove @ and # tags
    text = re.sub("#\w*", "",text)
    text = re.sub("@\w*", "",text)
    text = re.sub("\s+", " ", text)
    return text

# Compiling all text cleaning function

def noramalize_text(document,htmltags = True, accented_chars = True, contractions_exp = True,
                   text_lower_case = True,special_characters = True, stemmer_text = True, 
                   lemmatize_text = True, stopwords_remove = False, remove_hash = True):
    
    normalized_doc = []
    
    for text in document:
        if htmltags:
            text = remove_htmltags(text)
        
        if accented_chars:
            text = remove_accented_chars(text)
        
        if contractions_exp:
            text = expand_contractions(text)
        
        if text_lower_case:
            text = text.lower()
        # remove extra line
        text = re.sub(r'[\r|\n|\r\n]+', ' ',text)
        
        if remove_hash:
            text = remove_hash_attherate(text)
            
        if special_characters:
            text = remove_special_characters(text)
            
        if stemmer_text:
            text = simple_stemmer(text)
        
        if lemmatize_text:
            text = simple_lemmatize(text)
        
        # remove extra whitespace
        text = re.sub(' +', ' ', text)   
        
        if stopwords_remove:
            text = remove_stopwords(text)
            
        normalized_doc.append(text)    
        
    return normalized_doc

In [None]:
# Creating a column with cleaned tweets 

data["cleaned"] = noramalize_text(data['tweets'],stemmer_text = False)
data.head()

In [None]:
# Saving it to csv for fast load

data.to_csv("cleaned.csv", index = None)

In [None]:
# Loding cleaned csv

data = pd.read_csv("cleaned.csv")

In [None]:
data.head()

In [None]:
#Typecasting as string

text = data["cleaned"].astype(str)

# Tokenizing the text using keras preprocessing library

In [13]:
MAX_LEN =50

def Tokenizing(data, tokenize_label = "", label = ""):
    # Spilitting in test and test data
    from sklearn.model_selection import train_test_split
    train, test = train_test_split(data, test_size=0.2)
    
    # Tokenizing using keras preprocessing library
    from keras.preprocessing.text import Tokenizer
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    sequences_train = tokenizer.texts_to_sequences(train["tweets"])
    
    sequences_test = tokenizer.texts_to_sequences(test["tweets"])
    
    vocab_size = len(tokenizer.word_index) + 1
    print(vocab_size)
    Y_train = train["labels"]
    Y_test = test["labels"]
    X_train = tf.keras.preprocessing.sequence.pad_sequences(sequences_train,
                                                           value = 0,
                                                           padding = 'post',
                                                           maxlen = MAX_LEN)
    X_test = tf.keras.preprocessing.sequence.pad_sequences(sequences_test,
                                                           value = 0,
                                                           padding = 'post',
                                                           maxlen = MAX_LEN)
    
    return X_test, X_train, Y_train, Y_test, tokenizer, vocab_size

In [14]:
X_test, X_train, Y_train, Y_test,tokenizer, vocab_size = Tokenizing(data, "tweets", "labels")

469332


# Saving the tokenizer into pickle for future use

In [15]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
# loading
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Creating the CNN 1D Model

In [17]:
embedding_dims = 100
nb_filters = 256
FFN_units = 512
nb_classes = 2
dropout_rate = 0.1
Batch_size = 64
kernel_size_first  = 2 # First one
# Creating the CNN 

def Model_Conv():
    from keras.models import Sequential
    from keras.layers import Conv1D, GlobalMaxPool1D, Dropout, Embedding, Dense,Input,MaxPooling1D, GlobalMaxPooling1D,Concatenate
    from keras.models import Model
    
    model = Sequential()

    text_input_layer = Input(shape=(MAX_LEN,))
    embedding = Embedding(vocab_size, embedding_dims)(text_input_layer)
    conv1 = Conv1D(filters = nb_filters,kernel_size = kernel_size_first,padding = "valid",
                    activation = "relu")(embedding) 
    pool1 = GlobalMaxPooling1D()(conv1)
    conv2 = Conv1D(filters = nb_filters,kernel_size = kernel_size_first+1,padding = "valid",
                    activation = "relu")(embedding)
    pool2  = GlobalMaxPooling1D()(conv2)
    conv3 = Conv1D(nb_filters,kernel_size = kernel_size_first+2,padding = "valid",
                activation = "relu")((embedding))
    pool3 = GlobalMaxPooling1D()(conv3)
    cat_conv = Concatenate(axis=1)([pool1,pool2,pool3])
    dense1 = Dense(units = FFN_units, activation = "relu" )(cat_conv)
    drop = Dropout(rate = dropout_rate )(dense1)
    last = Dense(units = 1, activation = "sigmoid")(drop)
    mod  = Model(text_input_layer, last)
    
    return mod

In [18]:
mod  = Model_Conv()

In [19]:
mod.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 100)      46933200    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 49, 256)      51456       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 48, 256)      77056       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

# Compiling and training the model

In [20]:
mod.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

In [21]:
mod.fit(X_train, Y_train, batch_size =Batch_size, epochs = 2 , verbose = 1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x26a12604400>

# Saving and evaluating the model

In [22]:
mod.save("model.h5")

In [23]:
mod.evaluate(X_test ,Y_test, batch_size =Batch_size )



[0.4028938247174025, 0.81696875]

# Loading the tokenizer pickle and test it on any sentence

In [26]:
# loading
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [42]:
text = ["This movie is not so good", "This suppose to be bad but it is good"]

In [43]:
sequences_test = tokenizer.texts_to_sequences(text)
test = tf.keras.preprocessing.sequence.pad_sequences(sequences_test,value = 0,padding = 'post', maxlen = MAX_LEN)

In [44]:
pred = mod.predict(test)

In [45]:
sentiment_list = []

for i in range(0,pred.shape[0]):
    if pred[i]>0.5:
        value = "Positive"
    if pred[i]<=0.5:
        value = "Negative"
    sentiment_list.append(value)    

In [46]:
sentiment_list

['Negative', 'Positive']