# Twitter Sentiment Analysis

## Faizan Hameed

### 1-Importing the required modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import tensorflow as tf
import re
from wordcloud import WordCloud
from keras.models import model_from_json
import string

In [None]:
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### 2-Loading Data

In [None]:
# Make data directory if it doesn't exist
# (-p: do not fail when the directory is already present)
!mkdir -p data
# Download the zipped training CSV into data/ (-nc: skip the download when the file is already there)
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/investigating-sentiment-analysis/data/training.1600000.processed.noemoticon.csv.zip -P data
# Extract the archive into data/ (-n: never overwrite files that were already extracted)
!unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

File ‘data/training.1600000.processed.noemoticon.csv.zip’ already there; not retrieving.

Archive:  data/training.1600000.processed.noemoticon.csv.zip


First we load the data; the columns are renamed in a later step.

In [None]:
# Load the raw tweets from the extracted CSV.
# The file has no header row, so header=None is required: without it pandas
# would consume the first tweet as the column names and silently drop that record.
# latin-1 encoding is used to read the file.
df = pd.read_csv("data/training.1600000.processed.noemoticon.csv",
                 encoding='latin-1',
                 header=None)

### 3-Preprocessing the Data

Visualizing the data

In [None]:
# Peek at the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


Renaming columns to make them easy to understand.

In [None]:
# Give the six positional columns descriptive names, then preview the result
column_names = ['target', 'id', 'date', 'query', 'user', 'text']
df.columns = column_names
df.head(n=5)

Unnamed: 0,target,id,date,query,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


Removing Unwanted Columns





In [None]:
# These columns do not affect the sentiment label, so they are removed;
# only 'target' and 'text' remain.
unused_columns = ['id', 'date', 'query', 'user']
df = df.drop(unused_columns, axis=1)
df.head(n=5)

Unnamed: 0,target,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


Now we want to check the classes.

In [None]:
# Count how many tweets fall into each class of the 'target' column
df['target'].value_counts()

4    800000
0    799999
Name: target, dtype: int64

In the output above, 0 stands for negative and 4 for positive. For a clearer representation, replace 4 with 1 so that 1 means positive and 0 means negative.

In [None]:
# Re-label the classes: 0 stays 0 (negative), 4 becomes 1 (positive)
df['target'] = df['target'].replace({0: 0, 4: 1})
df['target'].value_counts()

1    800000
0    799999
Name: target, dtype: int64

Cleaning the data 

In [None]:
#first we will clean unwanted text from tweets using regular expressions
def cleaning_tweets(tweet):
  tweet=tweet.lower()# converting into lower case
  tweet=re.sub(r'http\S+|https\S+|www\S+','',tweet,flags=re.MULTILINE) #removing urls
  tweet= tweet.translate(str.maketrans('','',string.punctuation)) #translate method is used to remove punctuations and replace with space
  tweet=re.sub(r'\@\w+|\#',"",tweet)#removing # and @ references from tweets
  tweet=re.sub(r'@[A-Za-z0-9]+','',tweet)
  tweet=re.sub(r'@[A-Za-zA-Z0-9]+','',tweet)
  tweet=re.sub(r'@[A-Za-z]+','',tweet)
  tweet=re.sub(r'@[-)]+','',tweet)
  return tweet


In [None]:
# Run the cleaner over every tweet and preview the cleaned text
df['text'] = df['text'].map(cleaning_tweets)
df['text'].head()

0    is upset that he cant update his facebook by t...
1    kenichan i dived many times for the ball manag...
2      my whole body feels itchy and like its on fire 
3    nationwideclass no its not behaving at all im ...
4                         kwesidei not the whole crew 
Name: text, dtype: object

In [None]:
# Cleaned tweets as a NumPy array (the display is truncated automatically)
df.text.values

array(['is upset that he cant update his facebook by texting it and might cry as a result  school today also blah',
       'kenichan i dived many times for the ball managed to save 50  the rest go out of bounds',
       'my whole body feels itchy and like its on fire ', ...,
       'are you ready for your mojo makeover ask me for details ',
       'happy 38th birthday to my boo of alll time tupac amaru shakur ',
       'happy charitytuesday thenspcc sparkscharity speakinguph4h '],
      dtype=object)

#### Tokenization

We tokenize the text so that each tweet can be converted into a sequence of integer word indices.

In [None]:
# Build the vocabulary from all cleaned tweets.
# num_words=1500 caps the vocabulary used when texts are later turned into sequences.
tokenize = Tokenizer(split=' ', num_words=1500)
tokenize.fit_on_texts(df.text.values)
# word -> integer index mapping learned from the corpus
word_index = tokenize.word_index


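Word embedding is a technique for representing words as numbers. Here each tweet is first converted into a sequence of integer word indices and padded to a common length; the model's Embedding layer later maps these indices to dense vectors.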

In [None]:
# Turn every tweet into a list of integer word indices
emb = tokenize.texts_to_sequences(df.text)
# Pad the lists to a common length; the resulting 2-D array is the model input "x"
x = pad_sequences(sequences=emb)

In [None]:
# Show the padded index matrix produced by pad_sequences (zeros are padding)
print(x)

[[   0    0    0 ...   42  261 1197]
 [   0    0    0 ...   38   34   12]
 [   0    0    0 ...   24   14 1172]
 ...
 [   0    0    0 ...  612   15   10]
 [   0    0    0 ...  500   12   49]
 [   0    0    0 ...    0    0  118]]


### 4-Building a Model

In [None]:
# Architecture hyper-parameters
embed_dim = 128  # length of each learned word vector
lstm_out = 196   # number of LSTM units
# Take the vocabulary size from the tokenizer so the Embedding layer always
# matches the index range used to build x (1500 as configured above).
vocabSize = tokenize.num_words

# Embedding -> LSTM -> small dense layer -> single sigmoid unit (binary sentiment).
# All layers come from the same `keras` package that provides Sequential;
# mixing in `tf.keras.layers` objects is avoided.
model = Sequential()
model.add(Embedding(vocabSize, embed_dim, input_length=x.shape[1]))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(6, activation='relu'))  # can remove this layer depending on our results
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 40, 128)           192000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 1182      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 7         
Total params: 447,989
Trainable params: 447,989
Non-trainable params: 0
_________________________________________________________________
None


### Compiling the Model

In [None]:
# Binary cross-entropy for the two-class problem; accuracy is tracked during training
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


A label encoder is used here to encode the sentiments. The input x holds the tweets, and the corresponding output (label) y is the sentiment. Each row of x is stored with its corresponding sentiment in y.


In [None]:
# Encode the sentiment column as integer class ids
leben = LabelEncoder()
leben.fit(df['target'])
y = leben.transform(df['target'])


### Splitting the Training and Test Set

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

### 5-Training our Model

In [None]:
# Training configuration
epochs = 10
batch_size = 32
# Train on the training split; the held-out split is used for per-epoch validation
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)

Epoch 1/10

#### Saving the model

In [None]:
# Persist the trained weights (HDF5) ...
model.save_weights("final_model.h5")
# ... and the architecture (JSON) so the model can be rebuilt later
model_json = model.to_json()
with open("model_final.json", "w") as architecture_file:
    architecture_file.write(model_json)

#### Model Evaluation

In [None]:
# Measure loss and accuracy of the trained model on the held-out split
model.evaluate(x=x_test, y=y_test)

#### Make Predictions

In [None]:
# Predict 15 held-out rows.
# `Sequential.predict_classes` was removed in TensorFlow 2.6; for a single
# sigmoid output the equivalent is thresholding the predicted probability at 0.5.
pred_classes = (model.predict(x_test[15:30]) > 0.5).astype("int32")
print("Prediction: ", pred_classes)

# Ground-truth labels of the same rows, for comparison
print("Labels: \n", y_test[15:30])