# Twitter Sentiment Analysis

In [4]:
import pandas as pd
import re
import numpy as np
import pickle

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense,Embedding, Dropout, Flatten, Conv1D, MaxPooling1D, LSTM
from tensorflow import keras
from tensorflow.keras import utils
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import gensim

# Dimensionality of the word vectors: passed to Word2Vec as `vector_size` and
# reused as the width of the embedding matrix / Keras Embedding layer.
VECTOR_SIZE = 300

In [2]:
# Module-level WordNet lemmatizer shared by every call to lemmatize_sentence().
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK part-of-speech tag into a WordNet POS constant.

    The tag is matched by prefix: ``J`` -> ``wordnet.ADJ``, ``V`` ->
    ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.

    Args:
        nltk_tag: POS tag string as produced by ``nltk.pos_tag``.

    Returns:
        The matching ``wordnet`` constant, or ``None`` when the tag starts
        with none of the four prefixes.
    """
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            # Looked up lazily so `wordnet` is only touched on a match.
            return getattr(wordnet, attribute)
    return None

def lemmatize_sentence(sentence):
    """Lemmatize each token of ``sentence`` and join the results with spaces.

    The sentence is tokenized with ``nltk.word_tokenize`` and POS-tagged with
    ``nltk.pos_tag``. Tokens whose tag maps to a WordNet POS (see
    ``nltk_tag_to_wordnet_tag``) are lemmatized with that POS; all other
    tokens are kept unchanged.

    Args:
        sentence: Text to lemmatize.

    Returns:
        A single string of the processed tokens separated by one space.
    """
    lemmas = []
    for token, pos_tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        wn_tag = nltk_tag_to_wordnet_tag(pos_tag)
        # No WordNet POS available -> keep the surface form untouched.
        lemmas.append(token if wn_tag is None else lemmatizer.lemmatize(token, wn_tag))
    return " ".join(lemmas)

def clean_tweet_text(text):
    """Normalise a raw tweet into lower-case text without handles, links or punctuation.

    Steps, in order:
      1. ``str(text).lower()`` - any input is coerced to a lower-case string.
      2. ``@mentions`` and ``http:``/``https:`` links are each replaced by a
         single space.
      3. Every remaining run of characters that is not a word character, an
         apostrophe or whitespace is replaced by a single space.

    Whitespace is not collapsed or stripped, so the result may contain
    consecutive, leading or trailing spaces.

    Args:
        text: The tweet; non-string values are converted with ``str()``.

    Returns:
        The cleaned string.
    """
    lowered = str(text).lower()
    # Both patterns are raw strings: in a plain literal "\s" / "\S" are invalid
    # escape sequences, which newer Python versions warn about at compile time.
    # NOTE(review): the last alternative `http?:\S` consumes only one character
    # after the colon and looks like a typo; the pattern text is deliberately
    # left unchanged so the cleaning result stays exactly the same.
    without_handles_and_links = re.sub(r"@[^\s]+|https?:\S+|http?:\S", ' ', lowered)
    return re.sub(r"[^\w\d'\s]+", ' ', without_handles_and_links)

## Training

In [None]:
# Read the header-less training CSV.
# NOTE(review): presumably the Sentiment140 dump; the leading "." in the file
# name is kept as-is - confirm it matches the file on disk.
t = pd.read_csv(
    ".training.1600000.processed.noemoticon.csv",
    encoding="ISO-8859-1",
    header=None,
)

# Only columns 0 and 5 are used; give them readable names.
t = t[[0, 5]]
t.columns = ['sentiment', 'text']

# Scale the raw label by 1/4 and truncate to int
# (assumes raw labels are 0 and 4, giving 0/1 - TODO confirm).
t.sentiment = (t.sentiment / 4).astype(int)

# Normalised tweet text consumed by the following cells.
t['clean_text'] = t.text.apply(clean_tweet_text)

In [23]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
df_train, df_test = train_test_split(t, test_size=0.2, random_state=10)

# .assign() returns new DataFrames instead of writing a column into the frames
# handed back by train_test_split, which avoids pandas' SettingWithCopyWarning
# and leaves no doubt about which object is being modified.
df_train = df_train.assign(lemma_text=df_train.clean_text.apply(lemmatize_sentence))
df_test = df_test.assign(lemma_text=df_test.clean_text.apply(lemmatize_sentence))

# Keep only rows that have a lemmatized text.
df_train = df_train[df_train['lemma_text'].notna()]
df_test = df_test[df_test['lemma_text'].notna()]

# Whitespace-tokenised training tweets: one list of tokens per tweet.
documents = [str(lemma_text).split() for lemma_text in df_train['lemma_text']]

In [12]:
# Train Word2Vec vectors of width VECTOR_SIZE on the tokenised training tweets.
w2v_model = gensim.models.word2vec.Word2Vec(
    vector_size=VECTOR_SIZE,
    window=7,
    min_count=10,
    workers=8,
)
w2v_model.build_vocab(documents)
# Left as the cell's last expression so the value returned by train() is shown.
w2v_model.train(
    documents,
    total_examples=len(documents),
    epochs=16,
)

(196430169, 270212416)

In [18]:
# Fit the Keras tokenizer on the training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.lemma_text)

# One more than the number of indexed words; this is the row count used for
# the embedding matrix built from tokenizer.word_index.
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)

# Encode both splits as integer sequences padded/truncated to 140 entries.
x_train, x_test = (
    pad_sequences(tokenizer.texts_to_sequences(frame.lemma_text), maxlen=140)
    for frame in (df_train, df_test)
)
y_train, y_test = df_train.sentiment, df_test.sentiment

Total words 243267


In [19]:
# Row `row` holds the Word2Vec vector of the tokenizer word with that index;
# rows for words absent from the Word2Vec model stay all-zero.
embedding_matrix = np.zeros((vocab_size, VECTOR_SIZE))
for token, row in tokenizer.word_index.items():
    if token not in w2v_model.wv:
        continue
    embedding_matrix[row] = w2v_model.wv[token]
print(embedding_matrix.shape)

# Embedding layer initialised from the matrix above and frozen during training.
embedding_layer = Embedding(
    vocab_size,
    VECTOR_SIZE,
    weights=[embedding_matrix],
    input_length=140,
    trainable=False,
)

(243267, 300)


In [20]:
# Architecture: frozen pre-trained embeddings -> dropout -> one LSTM layer ->
# single sigmoid unit producing a score in [0, 1].
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.5))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.summary()
model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

# With metrics=['accuracy'], TF 2.x Keras logs the validation metric under the
# key 'val_accuracy'. Monitoring 'val_acc' there only produces a "metric not
# available" warning and early stopping never runs, so the monitored key is
# aligned with the compiled metric name.
# NOTE(review): both callbacks use patience=5 while only 3 epochs are run, so
# neither can trigger unless `epochs` is raised.
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
    EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5),
]

# 10% of the training data is held back by Keras for validation.
history = model.fit(x_train, y_train,
                    batch_size=1024,
                    epochs=3,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 140, 300)          72980100  
_________________________________________________________________
dropout (Dropout)            (None, 140, 300)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               160400    
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 73,140,601
Trainable params: 160,501
Non-trainable params: 72,980,100
_________________________________________________________________
Train on 1149769 samples, validate on 127753 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [40]:
# Persist the trained network; the "Use" section reloads it from this path.
model.save('nlp.model')

# Persist the fitted tokenizer next to the model. The context manager makes
# sure the file is flushed and closed; previously the handle returned by
# open() was never closed.
with open('word_tokenizer.pkl', "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=0)

In [21]:
# Score the model on the held-out split. The result is indexed as
# [loss, accuracy], matching the loss and single metric given to compile().
score = model.evaluate(x_test, y_test, batch_size=1024)
test_loss, test_accuracy = score
print()
print("ACCURACY:", test_accuracy)
print("LOSS:", test_loss)


ACCURACY: 0.8156277105758483
LOSS: 0.40787553447653907


## Use

In [9]:
import pandas as pd
import re
import pickle

from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

# nltk
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Module-level WordNet lemmatizer shared by every call to lemmatize_sentence().
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK part-of-speech tag into a WordNet POS constant.

    The tag is matched by prefix: ``J`` -> ``wordnet.ADJ``, ``V`` ->
    ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.

    Args:
        nltk_tag: POS tag string as produced by ``nltk.pos_tag``.

    Returns:
        The matching ``wordnet`` constant, or ``None`` when the tag starts
        with none of the four prefixes.
    """
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            # Looked up lazily so `wordnet` is only touched on a match.
            return getattr(wordnet, attribute)
    return None

def lemmatize_sentence(sentence):
    """Lemmatize each token of ``sentence`` and join the results with spaces.

    The sentence is tokenized with ``nltk.word_tokenize`` and POS-tagged with
    ``nltk.pos_tag``. Tokens whose tag maps to a WordNet POS (see
    ``nltk_tag_to_wordnet_tag``) are lemmatized with that POS; all other
    tokens are kept unchanged.

    Args:
        sentence: Text to lemmatize.

    Returns:
        A single string of the processed tokens separated by one space.
    """
    lemmas = []
    for token, pos_tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        wn_tag = nltk_tag_to_wordnet_tag(pos_tag)
        # No WordNet POS available -> keep the surface form untouched.
        lemmas.append(token if wn_tag is None else lemmatizer.lemmatize(token, wn_tag))
    return " ".join(lemmas)

def clean_tweet_text(text):
    """Normalise a raw tweet into lower-case text without handles, links or punctuation.

    Steps, in order:
      1. ``str(text).lower()`` - any input is coerced to a lower-case string.
      2. ``@mentions`` and ``http:``/``https:`` links are each replaced by a
         single space.
      3. Every remaining run of characters that is not a word character, an
         apostrophe or whitespace is replaced by a single space.

    Whitespace is not collapsed or stripped, so the result may contain
    consecutive, leading or trailing spaces.

    Args:
        text: The tweet; non-string values are converted with ``str()``.

    Returns:
        The cleaned string.
    """
    lowered = str(text).lower()
    # Both patterns are raw strings: in a plain literal "\s" / "\S" are invalid
    # escape sequences, which newer Python versions warn about at compile time.
    # NOTE(review): the last alternative `http?:\S` consumes only one character
    # after the colon and looks like a typo; the pattern text is deliberately
    # left unchanged so the cleaning result stays exactly the same.
    without_handles_and_links = re.sub(r"@[^\s]+|https?:\S+|http?:\S", ' ', lowered)
    return re.sub(r"[^\w\d'\s]+", ' ', without_handles_and_links)

In [6]:
# Reload the artefacts written by the training section.
model = keras.models.load_model('nlp.model')

# NOTE(review): pickle.load can execute arbitrary code; only load a tokenizer
# file that this notebook (or another trusted source) produced.
with open('word_tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
    
def predict_sentiment(text):
    """Score a single piece of text with the loaded sentiment model.

    The text goes through the same steps as the training data: it is cleaned
    with ``clean_tweet_text``, lemmatized with ``lemmatize_sentence``, encoded
    with the loaded ``tokenizer`` and padded to 140 entries before being passed
    to ``model.predict``.

    Args:
        text: Raw input text.

    Returns:
        A ``(score, label)`` tuple, where ``score`` is the model output for
        the text and ``label`` is ``'Positive'`` when ``score >= 0.5`` and
        ``'Negative'`` otherwise.
    """
    lemma = lemmatize_sentence(clean_tweet_text(text))
    # Batch of one padded sequence, passed as a plain array rather than
    # wrapped in an extra list.
    batch = pad_sequences(tokenizer.texts_to_sequences([lemma]), maxlen=140)
    score = model.predict(batch)[0][0]
    # Label spelling fixed (was 'Postive').
    senti = 'Positive' if score >= 0.5 else 'Negative'
    return (score, senti)

In [8]:
# Example input; `text` is also read by the library-comparison cell further down.
text = 'I am happy'

# Last expression of the cell, so the returned (score, label) tuple is displayed.
predict_sentiment(text)

(0.9897983, 'Postive')

## Prepackaged Libraries

In [25]:
# Score the current value of `text` with two off-the-shelf analysers.
# NOTE(review): the recorded outputs of this cell are negative, which is
# unexpected for 'I am happy' - presumably `text` held a different value when
# the cell was last executed; re-run on a fresh kernel to confirm.

# VADER: print the 'compound' entry of the polarity scores.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
vader_scores = sid.polarity_scores(text)
print(vader_scores['compound'])

# TextBlob: print the polarity attribute.
from textblob import TextBlob
blob = TextBlob(text)
print(blob.polarity)

-0.4585
-0.4
