In [None]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, Flatten, SimpleRNN, LSTM, GRU, Bidirectional, GlobalMaxPool1D, Conv1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

import spacy
# Load the small English spaCy pipeline once; pre_process calls `nlp` on every tweet.
nlp = spacy.load('en_core_web_sm')

import nltk
from nltk.corpus import stopwords
# The stop-word corpus must be downloaded before it is read on the next line.
nltk.download('stopwords')
# English stop words; pre_process drops any token whose text appears here.
stop_words = stopwords.words('english')


# Data Cleaning

In [None]:
# The CSV has no header row, so columns are addressed by position:
# column 0 becomes `labels`, column 5 becomes `tweets`, and columns 1-4 are discarded.
df = (
    pd.read_csv(
        'training.1600000.processed.noemoticon.csv',
        encoding='latin',
        header=None,
        on_bad_lines='skip',
    )
    .rename(columns={0: 'labels', 5: 'tweets'})
    .drop(columns=[1, 2, 3, 4])
)

In [None]:
# Remap label value 4 to 1 so later cells can treat `labels` as a 0/1 target.
# The replacement is scoped to the `labels` column: a frame-wide replace would
# also scan the tweet text and could alter any other column that holds a 4.
df['labels'] = df['labels'].replace({4: 1})

In [None]:
# Shuffle all rows with a fixed seed, keep the first 10,000, then peek at one raw tweet.
df = df.sample(frac=1, random_state=42).head(10000)
df['tweets'].iloc[1]

In [None]:
def char_count(tweets):
  """Return the number of characters in the string `tweets`."""
  n_chars = len(tweets)
  return n_chars

def word_count(tweets):
  """Return the number of whitespace-separated words in the string `tweets`.

  Splitting on any run of whitespace (rather than on a single ' ') means an
  empty or blank string counts as 0 words instead of 1, and repeated spaces
  do not add phantom empty "words" to the total.
  """
  return len(tweets.split())

# Patterns are compiled once instead of being re-specified on every call.
# URLs: anything starting with http://, https:// or www. up to the next whitespace.
_URL_RE = re.compile(r'(?:https?://|www\.)\S+')
# @mentions.
_MENTION_RE = re.compile(r'@\w+')
# Hashtags anywhere in the tweet (not only at the very start of the string).
_HASHTAG_RE = re.compile(r'#\S+')
_WHITESPACE_RE = re.compile(r'\s+')

def pre_process(tweets):
  """Normalise one tweet into a space-joined string of lemmas.

  Steps: lower-case the text, strip URLs, @mentions and hashtags, collapse the
  whitespace left behind, run the spaCy pipeline `nlp`, then keep the lemma of
  every token whose text is not in `stop_words` and is longer than one character.

  Args:
    tweets: raw text of a single tweet.

  Returns:
    The cleaned tweet as a single string; empty if no token survives.
  """
  text = tweets.lower()
  for pattern in (_URL_RE, _MENTION_RE, _HASHTAG_RE):
    text = pattern.sub('', text)
  # Removals leave runs of spaces behind; collapse them so the tokenizer does
  # not emit multi-character whitespace tokens that would pass the length filter.
  text = _WHITESPACE_RE.sub(' ', text).strip()
  tokens = [token.lemma_ for token in nlp(text) if token.text not in stop_words and len(token) > 1]
  return ' '.join(tokens)

# Clean every tweet first; the character and word counts are then measured on the cleaned text.
df['clean_tweets'] = list(map(pre_process, tqdm(df['tweets'], position=0, leave=True)))
df['char_count'] = list(map(char_count, tqdm(df['clean_tweets'], position=0, leave=True)))
df['word_count'] = list(map(word_count, tqdm(df['clean_tweets'], position=0, leave=True)))

100%|██████████| 1600000/1600000 [3:03:57<00:00, 144.96it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 2731933.92it/s]
100%|██████████| 1600000/1600000 [00:01<00:00, 1421491.38it/s]


In [None]:
# Preview the first 10 rows, including the derived clean_tweets, char_count and word_count columns.
df.head(10)

Unnamed: 0,labels,tweets,clean_tweets,char_count,word_count
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww be bummer shoulda get david carr third day,47,9
1,0,is upset that he can't update his Facebook by ...,upset can not update facebook texte ... might ...,79,14
2,0,@Kenichan I dived many times for the ball. Man...,dive many time ball manage save 50 rest go bound,48,10
3,0,my whole body feels itchy and like its on fire,whole body feel itchy like fire,31,6
4,0,"@nationwideclass no, it's not behaving at all....",be behave be mad can not see,28,7
5,0,@Kwesidei not the whole crew,whole crew,10,2
6,0,Need a hug,need hug,8,2
7,0,@LOLTrish hey long time no see! Yes.. Rains a...,hey long time see yes .. rain bit bit lol be f...,58,14
8,0,@Tatiana_K nope they didn't have it,nope not,8,2
9,0,@twittera que me muera ?,que muera,9,2


# Visualizations

In [None]:
# Count rows per label once; label 0 is plotted as Negative and label 1 as Positive.
label_counts = df['labels'].value_counts()
negative, positive = label_counts[0], label_counts[1]

sentiment_names = ['Positive', 'Negative']
fig = px.pie(
    values=[positive, negative],
    title='Sentiment Distribution',
    names=sentiment_names,
    hover_name=sentiment_names,
    template='simple_white',
)
fig.show()

In [None]:
# KDE curves of word_count and char_count, one curve per sentiment class,
# shown side by side in a single figure.
sentiment_groups = [('Positive', 1), ('Negative', 0)]
group_labels = [name for name, label in sentiment_groups]

fig_word_count = ff.create_distplot(
    [df.loc[df['labels'] == label, 'word_count'] for name, label in sentiment_groups],
    group_labels, bin_size=20, show_rug=False, show_hist=False)
fig_char_count = ff.create_distplot(
    [df.loc[df['labels'] == label, 'char_count'] for name, label in sentiment_groups],
    group_labels, bin_size=20, show_rug=False, show_hist=False)

fig = make_subplots(rows=1, cols=2, subplot_titles=('Word Count Distribution', 'Char Count Distribution'))
panels = ((fig_word_count, 'Word Count'), (fig_char_count, 'Char Count'))
for col, (source_fig, axis_title) in enumerate(panels, start=1):
  for trace in source_fig.data:
    # Both panels carry a Positive and a Negative curve; show the legend
    # entries for the first panel only so they are not listed twice.
    trace.showlegend = (col == 1)
    fig.add_trace(trace, row=1, col=col)
  # Label the axes so each panel can be read on its own.
  fig.update_xaxes(title_text=axis_title, row=1, col=col)
  fig.update_yaxes(title_text='Density', row=1, col=col)

fig.update_layout(title_text='Word Count and Char Count Distribution')
fig.show()

# Neural Network | One Hot Encoding

In [None]:
# Mini-batch size and number of epochs, read by the model.fit calls that reference these names.
batch_size, epoch = 32, 2

In [None]:
# Binary bag-of-words: each feature is 1 if the term occurs in the tweet, else 0.
# NOTE(review): the vectorizer is fitted on every row before the split, so the
# vocabulary also includes terms that only occur in validation/test tweets.
vec1 = CountVectorizer(binary=True)
vec_tweets1 = vec1.fit_transform(df['clean_tweets']).toarray()

# 80% train; the remaining 20% is halved into test and validation.
# Both splits are seeded so test/validation membership is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(vec_tweets1, df['labels'], test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

# Three ReLU layers, each followed by 50% dropout, then a sigmoid output for the 0/1 label.
i = Input(shape=(vec_tweets1.shape[1],))
x = i
for units in (256, 128, 32):
  x = Dense(units, activation='relu')(x)
  x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history1 = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=batch_size, epochs=epoch)

In [None]:
# Score the most recently fitted `model` on the held-out test split (loss and accuracy, per its compile settings).
model.evaluate(x_test, y_test)

# Neural Network | Count Vectorization

In [None]:
# Bag-of-words with raw term counts (binary=False) instead of 0/1 indicators.
# NOTE(review): the vectorizer is fitted on every row before the split, so the
# vocabulary also includes terms that only occur in validation/test tweets.
vec2 = CountVectorizer(binary=False)
vec_tweets2 = vec2.fit_transform(df['clean_tweets']).toarray()

# 80% train; the remaining 20% is halved into test and validation.
# Both splits are seeded so test/validation membership is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(vec_tweets2, df['labels'], test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

# Three ReLU layers, each followed by 50% dropout, then a sigmoid output for the 0/1 label.
i = Input(shape=(vec_tweets2.shape[1],))
x = i
for units in (256, 128, 32):
  x = Dense(units, activation='relu')(x)
  x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history2 = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=batch_size, epochs=epoch)

In [None]:
# Score the most recently fitted `model` on the held-out test split (loss and accuracy, per its compile settings).
model.evaluate(x_test, y_test)

# Neural Network | TF-IDF Vectorization

In [None]:
# TF-IDF features: TfidfVectorizer replaces the CountVectorizer(binary=False)
# that was previously used here, which made this experiment a duplicate of the
# raw-count model.
# NOTE(review): the vectorizer is fitted on every row before the split, so the
# IDF weights are influenced by validation/test tweets.
vec3 = TfidfVectorizer()
vec_tweets3 = vec3.fit_transform(df['clean_tweets']).toarray()

# 80% train; the remaining 20% is halved into test and validation.
# Both splits are seeded so test/validation membership is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(vec_tweets3, df['labels'], test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

# Three ReLU layers, each followed by 50% dropout, then a sigmoid output for the 0/1 label.
i = Input(shape=(vec_tweets3.shape[1],))
x = i
for units in (256, 128, 32):
  x = Dense(units, activation='relu')(x)
  x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history3 = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=batch_size, epochs=epoch)

In [None]:
# Score the most recently fitted `model` on the held-out test split (loss and accuracy, per its compile settings).
model.evaluate(x_test, y_test)

# Neural Network | Index Based Encoding

## Main

In [None]:
# Encode each cleaned tweet as a sequence of integer word indices, padded at the
# end to the longest word_count in the frame.
seq_len = df['word_count'].max()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_tweets'])
word_index = tokenizer.word_index

enc_tweets = tokenizer.texts_to_sequences(df['clean_tweets'])
enc_tweets = pad_sequences(enc_tweets, padding='post', maxlen=seq_len)

# 80% train; the remaining 20% is halved into test and validation.
# Both splits are seeded so test/validation membership is the same on every run;
# the embedding-based cells further down reuse these x_*/y_* arrays.
x_train, x_test, y_train, y_test = train_test_split(enc_tweets, df['labels'], test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

# Baseline that feeds the raw index sequence straight into dense layers:
# three ReLU layers, each followed by 50% dropout, then a sigmoid output.
i = Input(shape=(enc_tweets.shape[1],))
x = i
for units in (256, 128, 32):
  x = Dense(units, activation='relu')(x)
  x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history4 = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=batch_size, epochs=epoch)

In [None]:
# Score the most recently fitted `model` on the held-out test split (loss and accuracy, per its compile settings).
model.evaluate(x_test,y_test)

## Neural Network | Word Embedding with Text Sequence (Index Based Encoding)

In [None]:
# Learn a 50-dimensional embedding per word index, flatten the sequence of
# embeddings into one vector, and classify it with a dense stack.
# The vocabulary size is len(word_index) + 1 because index 0 is not in word_index.
i = Input(shape=(enc_tweets.shape[1],))
x = i
for layer in (
    Embedding(len(word_index)+1, 50),
    Flatten(),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation = 'sigmoid'),
):
  x = layer(x)

model = Model(i, x)
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
history5 = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=512,
    epochs=10,
)

In [None]:
# Score the most recently fitted `model` on the held-out test split (loss and accuracy, per its compile settings).
model.evaluate(x_test, y_test)

In [None]:
# Number of distinct words the Tokenizer indexed; the Embedding layers use this value + 1 as their input dimension.
len(word_index)

## Neural Network | Embedding + SimpleRNN

In [None]:
# Two stacked SimpleRNN layers over 100-dimensional word embeddings.
# The sequences are post-padded with index 0, so mask_zero=True makes the
# recurrent layers skip padded timesteps; without it the final state is
# computed after running over the trailing padding of every short tweet.
i = Input(shape=(enc_tweets.shape[1],))
x = Embedding(len(word_index)+1, 100, mask_zero=True)(i)
x = SimpleRNN(100, dropout=0.2, return_sequences=True)(x)  # full sequence feeds the next RNN
x = SimpleRNN(50)(x)                                       # last state only
x = Dense(32, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)
model.compile(optimizer=Adam(learning_rate=0.0007), loss='binary_crossentropy', metrics=['accuracy'])
history6 = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=batch_size, epochs=10)

In [None]:
# Score the most recently fitted `model` on the held-out test split (loss and accuracy, per its compile settings).
model.evaluate(x_test, y_test)

## Neural Network | Embedding + LSTM

In [None]:
# Two stacked LSTM layers over 100-dimensional word embeddings.
# The sequences are post-padded with index 0, so mask_zero=True makes the
# recurrent layers skip padded timesteps; without it the final state is
# computed after running over the trailing padding of every short tweet.
i = Input(shape=(enc_tweets.shape[1],))
x = Embedding(len(word_index)+1, 100, mask_zero=True)(i)
x = LSTM(100, dropout=0.2, return_sequences=True)(x)  # full sequence feeds the next LSTM
x = LSTM(50)(x)                                       # last state only
x = Dense(32, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
history7 = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=32, epochs=10)

In [None]:
# Score the most recently fitted `model` on the held-out test split (loss and accuracy, per its compile settings).
model.evaluate(x_test, y_test)

## Neural Network | Embedding + GRU

In [None]:
# Two stacked GRU layers over 100-dimensional word embeddings.
# The sequences are post-padded with index 0, so mask_zero=True makes the
# recurrent layers skip padded timesteps; without it the final state is
# computed after running over the trailing padding of every short tweet.
i = Input(shape=(enc_tweets.shape[1],))
x = Embedding(len(word_index)+1, 100, mask_zero=True)(i)
x = GRU(100, dropout=0.2, return_sequences=True)(x)  # full sequence feeds the next GRU
x = GRU(50)(x)                                       # last state only
x = Dense(32, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
# Stored under its own name: this cell used to assign to history7 and thereby
# overwrote the training history of the LSTM model.
history8 = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=32, epochs=10)

In [None]:
# Score the most recently fitted `model` on the held-out test split (loss and accuracy, per its compile settings).
model.evaluate(x_test, y_test)

## Neural Network | Embedding + Bidirectional (Updated Complexity)

### Bidirectional + LSTM

In [None]:
# Three stages of Bidirectional LSTM -> Conv1D (kernel size 5) -> Dense -> Dropout
# over 300-dimensional embeddings, followed by a global max-pool over time and a
# sigmoid output. Only the first LSTM applies input dropout.
i = Input(shape=(enc_tweets.shape[1],))
x = Embedding(len(word_index)+1, 300)(i)

# (LSTM units, Conv1D filters / Dense units, extra LSTM keyword arguments) per stage.
stages = (
    (300, 256, {'dropout': 0.2}),
    (200, 128, {}),
    (100, 64, {}),
)
for lstm_units, width, lstm_kwargs in stages:
  x = Bidirectional(LSTM(lstm_units, return_sequences=True, **lstm_kwargs))(x)
  x = Conv1D(width, 5, activation='relu')(x)
  x = Dense(width, activation='relu')(x)
  x = Dropout(0.2)(x)

x = GlobalMaxPool1D()(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
# Stored under its own name: this cell used to assign to history7 and thereby
# overwrote the training history of an earlier model.
history9 = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=512, epochs=30)

In [None]:
# Score the most recently fitted `model` on the held-out test split (loss and accuracy, per its compile settings).
model.evaluate(x_test, y_test)

### Bidirectional + GRU

In [None]:
# Two stacked Bidirectional GRU layers over 100-dimensional word embeddings.
# The sequences are post-padded with index 0, so mask_zero=True makes the
# recurrent layers skip padded timesteps instead of processing the padding
# as if it were part of the tweet.
i = Input(shape=(enc_tweets.shape[1],))
x = Embedding(len(word_index)+1, 100, mask_zero=True)(i)
x = Bidirectional(GRU(100, dropout=0.2, return_sequences=True))(x)  # full sequence feeds the next layer
x = Bidirectional(GRU(50))(x)                                       # final states only
x = Dense(32, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
# Stored under its own name: this cell used to assign to history7 and thereby
# overwrote the training history of an earlier model.
history10 = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=32, epochs=10)

In [None]:
# Score the most recently fitted `model` on the held-out test split (loss and accuracy, per its compile settings).
model.evaluate(x_test, y_test)