**Remarks**

In this notebook, we will learn about in-depth sentiment analysis of Twitter data. Yay! Read through the notebook to learn more about what I have written. I used three different methods: a deep learning model (1D CNN), a simple RNN, and an LSTM. Let's check it out!

In [None]:
# Download the large English spaCy model; it is imported below as `en_core_web_lg`.
!python -m spacy download en_core_web_lg

In [3]:
# import library
# 1. dataframe environment
import numpy as np 
import pandas as pd

# 2. NLP tools
import spacy
import en_core_web_lg
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
import re
from bs4 import BeautifulSoup
import unicodedata
from textblob import TextBlob

# 3. Modelling tools
from sklearn.model_selection import train_test_split

# 4. Visualization tools
import matplotlib.pyplot as plt
import seaborn as sns

# 5. Keras Environment
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D
from keras.layers import MaxPooling1D, GlobalMaxPooling1D
from keras.layers import LSTM, SimpleRNN
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.optimizers import Adam

import warnings
warnings.filterwarnings('ignore')

In [4]:
# NLP Modelling and function
# Load the spaCy pipeline once at module level; `make_base` calls it to lemmatise text.
nlp = en_core_web_lg.load()
def get_wordcounts(x):
    """Return the number of whitespace-separated tokens in ``x``.

    ``x`` is converted with ``str()`` first, so non-string values are
    counted by their string form.
    """
    tokens = str(x).split()
    return len(tokens)

def get_charcounts(x):
    """Return how many characters of the string ``x`` are not whitespace."""
    # Splitting on whitespace and re-joining drops every space, tab and newline.
    return len(''.join(x.split()))

def get_avg_wordlength(x):
    """Return the mean number of characters per whitespace-separated word.

    The character total excludes whitespace, i.e. it is the summed length of
    the tokens produced by ``str(x).split()``.

    Parameters
    ----------
    x : object
        Text to measure; converted with ``str()`` before splitting.

    Returns
    -------
    float
        Average token length, or ``0.0`` when ``x`` contains no tokens
        (empty or whitespace-only text), instead of raising
        ``ZeroDivisionError``.
    """
    words = str(x).split()
    if not words:
        # No words: nothing to average, and dividing by zero would crash
        # a whole ``DataFrame.apply`` run on a single empty row.
        return 0.0
    return sum(len(word) for word in words) / len(words)

def get_stopwords_counts(x):
    """Return the number of whitespace-separated tokens of ``x`` found in ``stopwords``.

    The comparison is exact (no case folding is applied to the tokens).
    """
    return sum(1 for word in x.split() if word in stopwords)

def get_hashtag_counts(x):
    """Return how many whitespace-separated tokens of ``x`` begin with ``#``."""
    return sum(1 for word in x.split() if word.startswith('#'))

def get_mentions_counts(x):
    """Return how many whitespace-separated tokens of ``x`` begin with ``@``."""
    return sum(1 for word in x.split() if word.startswith('@'))

def get_digit_counts(x):
    """Return the number of numeric tokens in the string ``x``.

    A numeric token is a run of ASCII digits that may contain single ``,`` or
    ``.`` separators between digit groups, so ``1,000.50`` counts once.

    The earlier pattern ``[0-9,.]+`` also matched bare punctuation, so a
    sentence such as ``"Hello, world."`` was reported as containing two
    numbers. The pattern below requires every match to start and end with a
    digit. (This function was previously defined twice in a row; the
    duplicate has been removed.)
    """
    numbers = re.findall(r'[0-9]+(?:[.,][0-9]+)*', x)
    return len(numbers)

def get_uppercase_counts(x):
    """Return how many whitespace-separated tokens of ``x`` satisfy ``str.isupper()``."""
    return sum(1 for word in x.split() if word.isupper())

def get_emails(x):
    """Find e-mail addresses in the string ``x``.

    Matching is case-insensitive. The character classes in the pattern only
    list lowercase letters, so without ``re.IGNORECASE`` an address such as
    ``John@Example.com`` was silently missed.

    Returns
    -------
    tuple
        ``(counts, emails)``: the number of addresses found and the list of
        matched address strings, in order of appearance.
    """
    emails = re.findall(
        r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+\b)',
        x,
        flags=re.IGNORECASE,
    )
    return len(emails), emails


def get_urls(x):
    """Find http/https/ftp/ssh URLs in the string ``x``.

    Returns
    -------
    tuple
        ``(counts, urls)``. Because the pattern has three capture groups,
        each entry of ``urls`` is a ``(scheme, host, path)`` tuple; ``path``
        is ``''`` when the URL has nothing after the host.
    """
    pattern = r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    matches = re.findall(pattern, x)
    return len(matches), matches

def make_base(x):
    """Lemmatise ``x`` with the module-level spaCy pipeline ``nlp``.

    Each token is replaced by its lemma, except when the lemma is the
    placeholder ``'-PRON-'`` or the verb ``'be'``; in those cases the original
    token text is kept. The pieces are joined with single spaces.
    """
    # NOTE(review): '-PRON-' looks like the spaCy 2.x pronoun placeholder;
    # confirm it is still produced by the installed spaCy version.
    doc = nlp(str(x))
    pieces = [
        token.text if token.lemma_ in ('-PRON-', 'be') else token.lemma_
        for token in doc
    ]
    return ' '.join(pieces)

def get_value_counts(df, col):
    """Return word frequencies over every row of the string column ``df[col]``.

    All rows are joined with spaces, split on whitespace, and counted; the
    result is a ``pandas.Series`` indexed by word, most frequent first.
    """
    words = ' '.join(df[col]).split()
    return pd.Series(words).value_counts()

def spelling_correction(x):
    """Return the result of ``TextBlob(x).correct()``.

    The value is returned exactly as TextBlob produces it; it is not
    converted to ``str`` here.
    """
    return TextBlob(x).correct()

def get_basic_features(df, col='text'):
    """Add basic text-statistics columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is modified in place. DataFrame subclasses
        are accepted (the previous ``type(df) ==`` check rejected them).
    col : str, default ``'text'``
        Name of the column containing the raw text. The default keeps the
        original behaviour; pass e.g. ``'clean_text'`` for other datasets.

    Returns
    -------
    None
        The new columns are ``char_counts``, ``word_counts``,
        ``avg_wordlength``, ``stopwords_counts``, ``hashtag_counts``,
        ``mentions_counts``, ``digits_counts`` and ``uppercase_counts``.
        If ``df`` is not a DataFrame an error message is printed and nothing
        is changed.
    """
    if not isinstance(df, pd.DataFrame):
        print('ERROR: This function takes only Pandas DataFrame')
        return

    text = df[col]
    # The helpers already take a single value, so no lambda wrappers are needed.
    df['char_counts'] = text.apply(get_charcounts)
    df['word_counts'] = text.apply(get_wordcounts)
    df['avg_wordlength'] = text.apply(get_avg_wordlength)
    df['stopwords_counts'] = text.apply(get_stopwords_counts)
    df['hashtag_counts'] = text.apply(get_hashtag_counts)
    df['mentions_counts'] = text.apply(get_mentions_counts)
    df['digits_counts'] = text.apply(get_digit_counts)
    df['uppercase_counts'] = text.apply(get_uppercase_counts)

In [5]:
# retrieving from Kaggle
# Copies the API token `kaggle.json` from the working directory into ~/.kaggle,
# restricts it to owner read/write (chmod 600), then downloads the dataset
# archive and unzips it into the working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d cosmos98/twitter-and-reddit-sentimental-analysis-dataset
!unzip "twitter-and-reddit-sentimental-analysis-dataset.zip"

Downloading twitter-and-reddit-sentimental-analysis-dataset.zip to /content
 50% 5.00M/10.0M [00:00<00:00, 24.5MB/s]
100% 10.0M/10.0M [00:00<00:00, 39.6MB/s]
Archive:  twitter-and-reddit-sentimental-analysis-dataset.zip
  inflating: Reddit_Data.csv         
  inflating: Twitter_Data.csv        


In [7]:
# Read the tweets relative to the working directory (where the archive was
# unzipped) rather than the Colab-only absolute path "/content/...", so the
# notebook also runs outside Colab.
df_tweet = pd.read_csv("Twitter_Data.csv")
df_tweet.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [9]:
# number of missing values in each column of the tweet data
df_tweet.isna().sum()

clean_text    4
category      7
dtype: int64

In [10]:
# discard every row that has a missing text or a missing label
df_tweet = df_tweet.dropna(axis=0)

In [11]:
# class balance of the tweet labels
# label key:
#    1 -> Positive
#    0 -> Neutral
#   -1 -> Negative
df_tweet['category'].value_counts()

 1.0    72249
 0.0    55211
-1.0    35509
Name: category, dtype: int64

In [14]:
# features (tweet text) and target (sentiment label: -1 / 0 / 1)
X_tweet = df_tweet['clean_text']
y_tweet = df_tweet['category']

# build the word -> integer index from the tweet texts
token = Tokenizer()
token.fit_on_texts(X_tweet)

# vocabulary size: word_index starts at 1, so +1 leaves index 0 free for padding
vocab_size = 1 + len(token.word_index)
print(f"Vocab size: {vocab_size}")

# turn each tweet into a list of word indices
encoded_text = token.texts_to_sequences(X_tweet)

# pad at the end (or cut) every sequence to a fixed length of 120 tokens
max_lenth = 120
X_tweet_final = pad_sequences(encoded_text, maxlen=max_lenth, padding='post')

print("Dataframe ", X_tweet_final.shape)

Vocab size: 113679
Dataframe  (162969, 120)


In [8]:
# Read the Reddit comments relative to the working directory (where the
# archive was unzipped) rather than the Colab-only absolute path
# "/content/...", so the notebook also runs outside Colab.
df_red = pd.read_csv("Reddit_Data.csv")
df_red.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [12]:
# number of missing values in each column of the Reddit data
df_red.isna().sum()

clean_comment    100
category           0
dtype: int64

In [13]:
# discard rows with missing values, then show the class balance
# label key:
#    1 -> Positive
#    0 -> Neutral
#   -1 -> Negative
df_red = df_red.dropna(axis=0)
df_red['category'].value_counts()

 1    15830
 0    13042
-1     8277
Name: category, dtype: int64

**DEEP LEARNING**

This is the first step of the analysis. Enjoy learning!

In [15]:
# 80/20 split of the padded tweets, stratified so both parts keep the same
# label proportions; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_tweet_final,
    y_tweet,
    test_size=0.2,
    random_state=42,
    stratify=y_tweet,
)

In [16]:
# deep learning model: embedding followed by three 1D-convolution blocks
vec_size = 300  # embedding dimension
model = Sequential()
# input_length must equal the padding length used for X_tweet_final, so reuse
# `max_lenth` instead of repeating the literal 120
model.add(Embedding(vocab_size, vec_size, input_length=max_lenth))

# conv block 1
model.add(Conv1D(32, 3, activation="relu"))
model.add(MaxPooling1D(3))
model.add(Dropout(0.2))

# conv block 2
model.add(Conv1D(64, 3, activation="relu"))
model.add(MaxPooling1D(3))
model.add(Dropout(0.2))

# conv block 3: global max pooling collapses the sequence axis to one vector
model.add(Conv1D(128, 3, activation="relu"))
model.add(GlobalMaxPooling1D())

# output layer
# NOTE(review): `category` takes three values (-1, 0, 1), but a single sigmoid
# unit models only two classes with targets in [0, 1]. A 3-unit softmax with
# labels remapped to 0..2 and a categorical loss at compile time looks like
# what is needed - confirm and change those pieces together.
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 300)          34103700  
_________________________________________________________________
conv1d (Conv1D)              (None, 118, 32)           28832     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 39, 32)            0         
_________________________________________________________________
dropout (Dropout)            (None, 39, 32)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 37, 64)            6208      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 12, 64)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 12, 64)            0

In [None]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

%time
history=model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=5)

In [None]:
# training curves for the most recent `model.fit` run
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# take the x-axis from the recorded history, so the plot stays valid when the
# number of epochs changes (or training stops early)
epochs = range(len(acc))

plt.plot(epochs, acc, 'b', label='Training Accuracy')
plt.plot(epochs, val_acc, 'r', label='Validation Accuracy')
plt.title('Accuracy per epoch')
plt.xlabel('Epoch')
plt.legend()
plt.figure()  # start a second figure for the loss curves

plt.plot(epochs, loss, 'b', label='Training Loss')
plt.plot(epochs, val_loss, 'r', label='Validation Loss')
plt.title('Loss per epoch')
plt.xlabel('Epoch')
plt.legend()
plt.show()

**RNN Analysis**

This is the second step of the analysis. Enjoy learning!

In [None]:
# report the dimensions of each part of the train/test split
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"shape of {label}: ", part.shape)

In [None]:
# simple RNN model: embedding -> SimpleRNN -> single output unit
# Fixes: `sequential()` -> `Sequential()`, `retun_sequences` ->
# `return_sequences`, and the undefined `optimizers.Adam` -> the `Adam` class
# imported at the top of the notebook.
vec_size = 300  # embedding dimension
model = Sequential()
# input_length must equal the padding length used for X_tweet_final
model.add(Embedding(vocab_size, vec_size, input_length=max_lenth))
# return_sequences=False: only the final hidden state is passed on
model.add(SimpleRNN(50, return_sequences=False))
# NOTE(review): `category` takes three values (-1, 0, 1), but a single sigmoid
# unit models only two classes - confirm the output layer, label encoding and
# loss together.
model.add(Dense(1, activation='sigmoid'))
model.summary()

# NOTE(review): the compile cell passes the string 'adam', so this optimizer
# object is currently unused; 0.001 is also Adam's default learning rate.
adam = Adam(learning_rate=0.001)

In [None]:
# configure training for the RNN: Adam optimizer, binary cross-entropy loss,
# accuracy reported after every epoch
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
%time
history=model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20)

In [None]:
# training curves for the most recent `model.fit` run
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# The model is trained for 20 epochs, so a hard-coded range(5) no longer
# matches the length of the history lists and plt.plot raises a shape
# mismatch. Take the x-axis from the history itself.
epochs = range(len(acc))

plt.plot(epochs, acc, 'b', label='Training Accuracy')
plt.plot(epochs, val_acc, 'r', label='Validation Accuracy')
plt.title('Accuracy per epoch')
plt.xlabel('Epoch')
plt.legend()
plt.figure()  # start a second figure for the loss curves

plt.plot(epochs, loss, 'b', label='Training Loss')
plt.plot(epochs, val_loss, 'r', label='Validation Loss')
plt.title('Loss per epoch')
plt.xlabel('Epoch')
plt.legend()
plt.show()

**LSTM Analysis**

This is the last step of the analysis, but not the least. Enjoy learning!

In [None]:
# LSTM support library
import tensorflow as tf
from tensorflow.keras import regularizers, layers, losses

In [None]:
# LSTM model: regularised embedding -> LSTM -> two dense layers -> output
vec_size = 300  # embedding dimension, also used as the number of LSTM units

# layer 1: embedding with L2 penalty
# (fix: the module is `regularizers`; `regularizer.l2` raised NameError)
model = Sequential()
# input_length must equal the padding length used for X_tweet_final
model.add(Embedding(vocab_size, vec_size, input_length=max_lenth,
                    embeddings_regularizer=regularizers.l2(0.005)))
model.add(Dropout(0.4))

# layer 2: LSTM that returns the full sequence, flattened for the dense layers
model.add(LSTM(vec_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=True,
               kernel_regularizer=regularizers.l2(0.005), bias_regularizer=regularizers.l2(0.005)))
model.add(Flatten())

# layer 3
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.4))

# layer 4
model.add(Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.001),
                bias_regularizer=regularizers.l2(0.001)))
model.add(Dropout(0.4))

# last layer
# NOTE(review): `category` takes three values (-1, 0, 1), but a single sigmoid
# unit models only two classes - confirm the output layer, label encoding and
# loss together.
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
%time
history=model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20)

In [None]:
# training curves for the most recent `model.fit` run
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# The model is trained for 20 epochs, so a hard-coded range(5) no longer
# matches the length of the history lists and plt.plot raises a shape
# mismatch. Take the x-axis from the history itself.
epochs = range(len(acc))

plt.plot(epochs, acc, 'b', label='Training Accuracy')
plt.plot(epochs, val_acc, 'r', label='Validation Accuracy')
plt.title('Accuracy per epoch')
plt.xlabel('Epoch')
plt.legend()
plt.figure()  # start a second figure for the loss curves

plt.plot(epochs, loss, 'b', label='Training Loss')
plt.plot(epochs, val_loss, 'r', label='Validation Loss')
plt.title('Loss per epoch')
plt.xlabel('Epoch')
plt.legend()
plt.show()