In [1]:
# Twitter Sentiment Analysis

# Twitter Sentiment Analysis

## Faizan Hameed

### 1-Importing the required modules

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import tensorflow as tf
from textblob import TextBlob
import re
import tweepy
from wordcloud import WordCloud
from keras.models import model_from_json
import string

In [4]:
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

### 2-Loading Data

In [5]:
# Make data directory if it doesn't exist
!mkdir -p data
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/investigating-sentiment-analysis/data/training.1600000.processed.noemoticon.csv.zip -P data
!unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

--2021-02-17 01:41:29--  https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/investigating-sentiment-analysis/data/training.1600000.processed.noemoticon.csv.zip
Resolving nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)... 162.243.189.2
Connecting to nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85088192 (81M) [application/zip]
Saving to: ‘data/training.1600000.processed.noemoticon.csv.zip’


2021-02-17 01:41:31 (35.0 MB/s) - ‘data/training.1600000.processed.noemoticon.csv.zip’ saved [85088192/85088192]

Archive:  data/training.1600000.processed.noemoticon.csv.zip
  inflating: data/training.1600000.processed.noemoticon.csv  


Firstst we import our data and rename our columns

In [6]:
#importing the data and renaming the columns

df = pd.read_csv("data/training.1600000.processed.noemoticon.csv",
                   encoding='latin-1')#Here latin-1 encoding is used to read the file

### 3-Preprocessing the Data

Visualizing the data

In [7]:
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


Renaming columns to make them easy to understand.

In [8]:
df.columns=['target','id','date','query','user','text']
df.head()

Unnamed: 0,target,id,date,query,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


Removing Unwanted Columns





In [9]:
df = df.drop(columns=['id', 'date', 'query', 'user']) # These columns will not effect our data so we are going to remove them
df.head()

Unnamed: 0,target,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


### Checking the classis

In [10]:
df.target.value_counts() #different classis in column target

4    800000
0    799999
Name: target, dtype: int64

In the upper code 0 is for negitive and 4 is for positive. For better representation change 4 with 1 so that 1 for positive and 0 for negitive representaton.

In [11]:
df.target=df.target.replace({0:0,4:1})
df.target.value_counts()

1    800000
0    799999
Name: target, dtype: int64

Cleaning the data 

In [12]:
#first we will clean unwanted text from tweets using regular expressions
def cleaning_tweets(tweet):
  tweet=tweet.lower()# converting into lower case
  tweet=re.sub(r'http\S+|https\S+|www\S+','',tweet,flags=re.MULTILINE) #removing urls
  tweet= tweet.translate(str.maketrans('','',string.punctuation)) #translate method is used to remove punctuations and replace with space
  tweet=re.sub(r'\@\w+|\#',"",tweet)#removing # and @ references from tweets
  tweet=re.sub(r'@[A-Za-z0-9]+','',tweet)
  tweet=re.sub(r'@[A-Za-zA-Z0-9]+','',tweet)
  tweet=re.sub(r'@[A-Za-z]+','',tweet)
  tweet=re.sub(r'@[-)]+','',tweet)
  return tweet


In [13]:
df.text=df.text.apply(cleaning_tweets)
df.text.head()

0    is upset that he cant update his facebook by t...
1    kenichan i dived many times for the ball manag...
2      my whole body feels itchy and like its on fire 
3    nationwideclass no its not behaving at all im ...
4                         kwesidei not the whole crew 
Name: text, dtype: object

In [14]:
df['text'].values

array(['is upset that he cant update his facebook by texting it and might cry as a result  school today also blah',
       'kenichan i dived many times for the ball managed to save 50  the rest go out of bounds',
       'my whole body feels itchy and like its on fire ', ...,
       'are you ready for your mojo makeover ask me for details ',
       'happy 38th birthday to my boo of alll time tupac amaru shakur ',
       'happy charitytuesday thenspcc sparkscharity speakinguph4h '],
      dtype=object)

#### Tokenization

We are tokenizing text to convert them into vectors

In [15]:
tokenize = Tokenizer(num_words=1500, split=' ')#Vocabulary size is 1500
tokenize.fit_on_texts(df['text'].values)
word_index = tokenize.word_index


In [16]:
word_index

{'i': 1,
 'to': 2,
 'the': 3,
 'a': 4,
 'my': 5,
 'and': 6,
 'you': 7,
 'is': 8,
 'it': 9,
 'for': 10,
 'in': 11,
 'of': 12,
 'im': 13,
 'on': 14,
 'me': 15,
 'so': 16,
 'have': 17,
 'that': 18,
 'but': 19,
 'just': 20,
 'with': 21,
 'be': 22,
 'at': 23,
 'its': 24,
 'not': 25,
 'was': 26,
 'this': 27,
 'now': 28,
 'good': 29,
 'up': 30,
 'day': 31,
 'get': 32,
 'all': 33,
 'out': 34,
 'like': 35,
 'are': 36,
 'no': 37,
 'go': 38,
 'dont': 39,
 'do': 40,
 'your': 41,
 'today': 42,
 'too': 43,
 'going': 44,
 'love': 45,
 'work': 46,
 'cant': 47,
 'got': 48,
 'time': 49,
 'from': 50,
 'back': 51,
 'lol': 52,
 'u': 53,
 'what': 54,
 'one': 55,
 'will': 56,
 'know': 57,
 'we': 58,
 'about': 59,
 'really': 60,
 'am': 61,
 'had': 62,
 'can': 63,
 'see': 64,
 'some': 65,
 'well': 66,
 'if': 67,
 'still': 68,
 'want': 69,
 'new': 70,
 'night': 71,
 'how': 72,
 'think': 73,
 'amp': 74,
 'thanks': 75,
 'home': 76,
 'as': 77,
 'when': 78,
 'there': 79,
 'oh': 80,
 '2': 81,
 'more': 82,
 'much': 8

Word Embedding is technique to represent words into numbers

In [17]:
emb = tokenize.texts_to_sequences(df['text'])
x = pad_sequences(emb)# after padding we assign it an "x" variable

In [18]:
print(x)

[[   0    0    0 ...   42  261 1197]
 [   0    0    0 ...   38   34   12]
 [   0    0    0 ...   24   14 1172]
 ...
 [   0    0    0 ...  612   15   10]
 [   0    0    0 ...  500   12   49]
 [   0    0    0 ...    0    0  118]]


In [28]:
model = Sequential()

model.add(Embedding(1500,32, embed_dim=128,input_length = x.shape[1]))# embed_dim is a hyperparameter we selected its value as 128, 1500 is vocab sizae or max features
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim,return_sequences=True)))
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.add(LSTM(lstm_out=196, dropout=0.2, recurrent_dropout=0.2))

TypeError: ignored