In [1]:
# for string matching
import re 

# for reading data
import pandas as pd

# for handling html data
from bs4 import BeautifulSoup

# for visualization
import matplotlib.pyplot as plt  

pd.set_option('display.max_colwidth', 200)

In [2]:
# load the labelled training tweets (columns shown below: id, label, tweet)
train = pd.read_csv('train.csv',encoding='latin-1')

# load the test tweets (id, tweet) that are scored at the end of the notebook
test = pd.read_csv('test.csv',encoding='latin-1')

In [3]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperiasâ¦ http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [4]:
def cleaner(text):
  """Normalise a raw tweet into lower-case, space-separated alphabetic tokens.

  Args:
    text: raw tweet string, possibly containing HTML markup.

  Returns:
    A string of lower-case words separated by single spaces. Every character
    that is not an ASCII letter (digits, punctuation, URL separators, ...)
    acts as a word separator and is dropped.
  """
  # take off html tags; the parser is named explicitly so the result does not
  # depend on which optional parsers happen to be installed, and bs4 does not
  # warn that no parser was specified on every call
  text = BeautifulSoup(text, "html.parser").get_text()

  # replace every non-alphabetic character with a space
  text = re.sub("[^a-zA-Z]", " ", text)

  # lower-case, then split/join to collapse runs of whitespace
  return " ".join(text.lower().split())

In [5]:
# call preprocessing function
train['cleaned_text'] = train['tweet'].apply(cleaner)



In [6]:
train.head()

Unnamed: 0,id,label,tweet,cleaned_text
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test https goo gl h mfqv android apps beautiful cute health igers iphoneonly iphonesia iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperiasâ¦ http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thanks to my uncle yay sony xperia s sonyexperias http instagram com p yget jc jm
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,we love this would you go talk makememories unplug relax iphone smartphone wifi connect http fb me n lsupcu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,i m wired i know i m george i was made that way iphone cute daventry home http instagr am p li ujs k
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple won t even talk to me about a question i have unless i pay them for their stupid support


In [7]:
# features: tweet id plus the cleaned tweet text; target: the 0/1 label column
x = train[['id', 'cleaned_text']]
y = train['label']

In [8]:
train['label'].value_counts()

0    5894
1    2026
Name: label, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

# hold out 20% of the labelled data for validation; the fixed seed keeps the
# shuffled split repeatable
x_tr, x_val, y_tr, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

In [10]:
import numpy as np

# labels as float32 column vectors (one row per tweet, a single column)
y_train = np.asarray(y_tr, dtype='float32').reshape(-1, 1)
y_testval = np.asarray(y_val, dtype='float32').reshape(-1, 1)

In [11]:
y_train

array([[0.],
       [1.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

In [12]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# NOTE(review): Tokenizer is imported from `keras` while pad_sequences comes
# from `tensorflow.keras` — consider importing both from the same package

# first pass: a tokenizer without a vocabulary cap, used to inspect word frequencies
x_tokenizer = Tokenizer() 

# fit on the training split only
x_tokenizer.fit_on_texts(x_tr['cleaned_text'])

In [13]:
x_tokenizer.word_index

{'iphone': 1,
 'http': 2,
 'com': 3,
 'apple': 4,
 'p': 5,
 'i': 6,
 'my': 7,
 'instagram': 8,
 'the': 9,
 'to': 10,
 'a': 11,
 'samsung': 12,
 'it': 13,
 'twitter': 14,
 'and': 15,
 's': 16,
 'for': 17,
 'new': 18,
 'me': 19,
 'you': 20,
 'https': 21,
 'phone': 22,
 'is': 23,
 'am': 24,
 'sony': 25,
 'instagr': 26,
 'follow': 27,
 'in': 28,
 't': 29,
 'on': 30,
 'of': 31,
 'this': 32,
 'pic': 33,
 'www': 34,
 'with': 35,
 'ipad': 36,
 'like': 37,
 'so': 38,
 'love': 39,
 'just': 40,
 'at': 41,
 'ios': 42,
 'have': 43,
 'android': 44,
 'life': 45,
 'm': 46,
 'rt': 47,
 'now': 48,
 'that': 49,
 'all': 50,
 'your': 51,
 'day': 52,
 'an': 53,
 'ly': 54,
 'can': 55,
 'not': 56,
 'photo': 57,
 'cute': 58,
 'case': 59,
 'get': 60,
 'photography': 61,
 'gain': 62,
 'galaxy': 63,
 'be': 64,
 'app': 65,
 'back': 66,
 'today': 67,
 'news': 68,
 'got': 69,
 'from': 70,
 'fun': 71,
 'd': 72,
 'music': 73,
 'itunes': 74,
 'bit': 75,
 'instagood': 76,
 'out': 77,
 'happy': 78,
 'time': 79,
 'fashion

In [14]:
len(x_tokenizer.word_index)

19422

In [15]:
# minimum number of occurrences for a word to be counted
thresh = 3

# number of distinct words seen at least `thresh` times in the training tweets
cnt = sum(1 for freq in x_tokenizer.word_counts.values() if freq >= thresh)

print(cnt)

3894


In [16]:
# prepare the tokenizer again, now capping the vocabulary at `cnt` and sending
# every other word to the 'unk' token
# NOTE(review): presumably the 'unk' token and the reserved index 0 also count
# towards num_words, so slightly fewer than `cnt` real words are kept — confirm
# against the Keras Tokenizer documentation
x_tokenizer = Tokenizer(num_words=cnt,oov_token='unk')

# fit the vocabulary on the training split only
x_tokenizer.fit_on_texts((x_tr['cleaned_text']))

In [17]:
x_tr

Unnamed: 0,id,cleaned_text
862,863,good birthday purchase happy with it applewatch spacegray applewatchsport sport apple https instagram com p pp ylpakb
6249,6250,iphone screwed hung on update now seems to be hung on restoring iphone software apple ihateapple macssuck for good measure
2761,2762,sunset one minute ago samsung europe zeeland brouwersdam stormhour earthandclouds mooieluchten pic twitter com ad ycobxh
7496,7497,yeah bb sucks iphone rt m nstre i love how since i ve had my bb i have yet to send images without a single issue f k you bb
2141,2142,how can the iphone seriously go from to in an hour you iphone apple
...,...,...
4931,4932,call me it android pc computers tuesdaythoughts apple technology tech news support help devices now service entrepreneur blackbusiness harmony balance techtuesday davidgift business art artist lif...
3264,3265,my phone sucks it s not sending or receiving my texts apple
1653,1654,there s pumpkin spice in my cup starbucks psl candykay iphoneography iphone iphone starbucks http instagr am p l ytp
2607,2608,experience a magical and fun filled rollercoaster ride on your samsung gearvr a world filled with dragons candies cute cats cakes that will make your day app link https goo gl ck l k oculus candyc...


In [18]:
# maximum sequence length allowed
max_len = 100

# convert the cleaned tweets into sequences of integer word ids, using the
# tokenizer fitted on the training split for both training and validation data
x_tr_seq = x_tokenizer.texts_to_sequences(x_tr['cleaned_text']) 
x_val_seq = x_tokenizer.texts_to_sequences(x_val['cleaned_text'])

# zero-pad at the end ('post') up to max_len entries per sequence
x_tr_seq = pad_sequences(x_tr_seq,  padding='post', maxlen=max_len)
x_val_seq = pad_sequences(x_val_seq, padding='post', maxlen=max_len)

In [19]:
x_tr_seq

array([[ 107,  101,  809, ...,    0,    0,    0],
       [   2, 2983,    1, ...,    0,    0,    0],
       [ 331,   97,  255, ...,    0,    0,    0],
       ...,
       [ 227,   17,    1, ...,    0,    0,    0],
       [1019,   12,    1, ...,    0,    0,    0],
       [  19,  792,   82, ...,    0,    0,    0]])

In [20]:
x_val_seq

array([[  26, 1379,  156, ...,    0,    0,    0],
       [ 928,   11,  115, ...,    0,    0,    0],
       [ 104,   41,  548, ...,    0,    0,    0],
       ...,
       [2714,   17,    1, ...,    0,    0,    0],
       [  19,   23,   26, ...,    0,    0,    0],
       [   5,    2,  533, ...,    0,    0,    0]])

In [21]:
from keras.models import *
from keras.layers import *
from keras.callbacks import *

In [22]:
# input dimension for the embedding layer: the tokenizer's num_words cap plus one
# (this is not the number of unique words in the corpus)
x_voc_size = x_tokenizer.num_words + 1
x_voc_size

3895

In [23]:
# sequential model: embedding -> simple RNN -> dense -> sigmoid output
model = Sequential([
    # embedding layer: 50-dimensional word vectors, with the zero padding masked
    Embedding(x_voc_size, 50, input_shape=(max_len,), mask_zero=True),
    # rnn layer
    SimpleRNN(128, activation='relu'),
    # dense layer
    Dense(128, activation='relu'),
    # output layer: a single sigmoid unit for the binary label
    Dense(1, activation='sigmoid'),
])

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 50)           194750    
                                                                 
 simple_rnn (SimpleRNN)      (None, 128)               22912     
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 234,303
Trainable params: 234,303
Non-trainable params: 0
_________________________________________________________________


In [25]:
#define optimizer and loss
model.compile(optimizer='adam',loss='binary_crossentropy')

In [26]:
# checkpoint to save best model during training
mc = ModelCheckpoint("weights.best.hdf5", monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [46]:
# train the model; the `mc` callback saves the weights with the lowest val_loss
# NOTE(review): the recorded log below never improves on 0.27615 and this cell's
# execution count jumps to 46, which suggests it was re-run on an already-trained
# model — restart the kernel and run all cells to get a reproducible result
model.fit(x_tr_seq, y_train, batch_size=128, epochs=10, verbose=1, validation_data=(x_val_seq, y_testval), callbacks=[mc])

Epoch 1/10
Epoch 1: val_loss did not improve from 0.27615
Epoch 2/10
Epoch 2: val_loss did not improve from 0.27615
Epoch 3/10
Epoch 3: val_loss did not improve from 0.27615
Epoch 4/10
Epoch 4: val_loss did not improve from 0.27615
Epoch 5/10
Epoch 5: val_loss did not improve from 0.27615
Epoch 6/10
Epoch 6: val_loss did not improve from 0.27615
Epoch 7/10
Epoch 7: val_loss did not improve from 0.27615
Epoch 8/10
Epoch 8: val_loss did not improve from 0.27615
Epoch 9/10
Epoch 9: val_loss did not improve from 0.27615
Epoch 10/10
Epoch 10: val_loss did not improve from 0.27615


<keras.callbacks.History at 0x2366e426dc0>

In [47]:
# restore the best checkpointed weights into the existing model object
model.load_weights("weights.best.hdf5")

# predicted probabilities for the validation sequences
pred_prob = model.predict(x_val_seq)



In [48]:
# turn predicted probabilities into 0/1 classes using a cut-off value
def classify(pred_prob,thresh):
  """Binarise a 2-D collection of probabilities.

  Args:
    pred_prob: iterable of rows, each row an iterable of probabilities.
    thresh: cut-off; a probability greater than or equal to it becomes 1.

  Returns:
    A list of lists with the same layout as `pred_prob`, holding 1 where the
    probability is >= `thresh` and 0 otherwise.
  """
  return [[1 if prob >= thresh else 0 for prob in row] for row in pred_prob]

In [49]:
from sklearn import metrics

In [50]:
import numpy as np
# candidate decision thresholds: 0.00, 0.01, ..., 0.49 (0.5 itself is excluded)
threshold  = np.arange(0,0.5,0.01)
threshold

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49])

In [51]:
from sklearn import metrics

# flatten the validation labels to a 1-D array
y_true = np.array(y_val).ravel()

# validation F1 score for every candidate threshold, in the same order as
# `threshold`. The iteration variable is deliberately not called `thresh`, so
# this cell no longer overwrites the word-frequency `thresh` defined earlier in
# the notebook.
score = [
    metrics.f1_score(y_true, np.array(classify(pred_prob, cand)).ravel())
    for cand in threshold
]

In [52]:
# threshold with the highest validation F1 (the first one if several tie)
opt = threshold[score.index(max(score))]
opt

0.29

In [53]:
#predictions for optimal threshold
y_pred_seq = classify(pred_prob,opt)
y_pred = np.array(y_pred_seq).ravel()

In [54]:
print(metrics.classification_report(y_true,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.88      0.92      1178
           1       0.72      0.89      0.79       406

    accuracy                           0.88      1584
   macro avg       0.84      0.88      0.86      1584
weighted avg       0.90      0.88      0.89      1584



In [55]:
from sklearn.metrics import f1_score

In [56]:
f1_score(y_true, y_pred)

0.7942794279427942

In [57]:
# call preprocessing function
test['cleaned_text'] = test['tweet'].apply(cleaner)



In [58]:
# NOTE(review): this tokenizer is fitted on the test tweets, but the test set is
# later encoded with `x_tokenizer` (the training tokenizer); it only feeds the
# word_index display below
test_tokenizer = Tokenizer() 

test_tokenizer.fit_on_texts(test['cleaned_text'])

In [59]:
test_tokenizer.word_index

{'iphone': 1,
 'http': 2,
 'com': 3,
 'apple': 4,
 'p': 5,
 'i': 6,
 'my': 7,
 'instagram': 8,
 'to': 9,
 'the': 10,
 'a': 11,
 'samsung': 12,
 'and': 13,
 'it': 14,
 's': 15,
 'new': 16,
 'me': 17,
 'for': 18,
 'twitter': 19,
 'am': 20,
 'you': 21,
 'phone': 22,
 'https': 23,
 'is': 24,
 'in': 25,
 'sony': 26,
 'of': 27,
 'instagr': 28,
 't': 29,
 'this': 30,
 'on': 31,
 'follow': 32,
 'with': 33,
 'www': 34,
 'pic': 35,
 'like': 36,
 'ipad': 37,
 'so': 38,
 'love': 39,
 'that': 40,
 'just': 41,
 'have': 42,
 'an': 43,
 'at': 44,
 'm': 45,
 'all': 46,
 'your': 47,
 'life': 48,
 'ly': 49,
 'now': 50,
 'get': 51,
 'ios': 52,
 'rt': 53,
 'day': 54,
 'can': 55,
 'music': 56,
 'app': 57,
 'today': 58,
 'cute': 59,
 'out': 60,
 'android': 61,
 'gain': 62,
 'photo': 63,
 'galaxy': 64,
 'not': 65,
 'back': 66,
 'be': 67,
 'but': 68,
 'got': 69,
 'are': 70,
 'happy': 71,
 'from': 72,
 'case': 73,
 'who': 74,
 'time': 75,
 'one': 76,
 'news': 77,
 'fun': 78,
 'bit': 79,
 'no': 80,
 'when': 81,


In [60]:
# prepare a capped tokenizer on the test tweets
# NOTE(review): nothing in the visible notebook uses this object afterwards —
# the test tweets are encoded with `x_tokenizer` in the next cell — so this
# cell looks like a leftover and is a candidate for removal
test_tokenizer = Tokenizer(num_words=cnt,oov_token='unk')

#prepare vocabulary
test_tokenizer.fit_on_texts((test['cleaned_text']))

In [61]:
# encode the test tweets with the tokenizer that was fitted on the training data
x_test_seq = x_tokenizer.texts_to_sequences(test['cleaned_text'])

# pad in the same cell so the display cell directly below shows the padded
# array on a top-to-bottom run; repeating the same pad_sequences call later
# leaves the array unchanged because every row already has max_len entries
x_test_seq = pad_sequences(x_test_seq, padding='post', maxlen=max_len)

In [64]:
x_test_seq

array([[   7,  158,   10, ...,    0,    0,    0],
       [1976,    1,    8, ...,    0,    0,    0],
       [   7,   73,   38, ...,    0,    0,    0],
       ...,
       [ 227,  693,   12, ...,    0,    0,    0],
       [   1,  130,    7, ...,    0,    0,    0],
       [ 106,    7,   70, ...,    0,    0,    0]])

In [63]:
#padding up with zero 
x_test_seq = pad_sequences(x_test_seq,  padding='post', maxlen=max_len)

In [65]:
pred_prob_test = model.predict(x_test_seq)



In [68]:
#predictions for optimal threshold
y_pred_seq_test = classify(pred_prob_test,opt)
y_pred_test = np.array(y_pred_seq_test).ravel()

In [69]:
y_pred_test

array([1, 0, 1, ..., 1, 1, 0])

In [70]:
test.head()

Unnamed: 0,id,tweet,cleaned_text
0,7921,I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks,i hate the new iphone upgrade won t let me download apps ugh apple sucks
1,7922,currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/,currently shitting my fucking pants apple imac cashmoney raddest swagswagswag http instagr am p uuis bibzo
2,7923,"I'd like to puts some CD-ROMS on my iPad, is that possible?' â Yes, but wouldn't that block the screen?\n",i d like to puts some cd roms on my ipad is that possible yes but wouldn t that block the screen
3,7924,"My ipod is officially dead. I lost all my pictures and videos from the 1D and 5sos concert,and from Vet Camp #hatinglife #sobbing",my ipod is officially dead i lost all my pictures and videos from the d and sos concert and from vet camp hatinglife sobbing
4,7925,Been fighting iTunes all night! I only want the music I $&@*# paid for,been fighting itunes all night i only want the music i paid for


In [71]:
test['label'] = y_pred_test

In [72]:
test.head()

Unnamed: 0,id,tweet,cleaned_text,label
0,7921,I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks,i hate the new iphone upgrade won t let me download apps ugh apple sucks,1
1,7922,currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/,currently shitting my fucking pants apple imac cashmoney raddest swagswagswag http instagr am p uuis bibzo,0
2,7923,"I'd like to puts some CD-ROMS on my iPad, is that possible?' â Yes, but wouldn't that block the screen?\n",i d like to puts some cd roms on my ipad is that possible yes but wouldn t that block the screen,1
3,7924,"My ipod is officially dead. I lost all my pictures and videos from the 1D and 5sos concert,and from Vet Camp #hatinglife #sobbing",my ipod is officially dead i lost all my pictures and videos from the d and sos concert and from vet camp hatinglife sobbing,1
4,7925,Been fighting iTunes all night! I only want the music I $&@*# paid for,been fighting itunes all night i only want the music i paid for,1


In [73]:
test_final = test.drop(['tweet','cleaned_text',],axis=1)

In [74]:
test_final.head()

Unnamed: 0,id,label
0,7921,1
1,7922,0
2,7923,1
3,7924,1
4,7925,1


In [75]:
# write the predictions to disk; index=False keeps the DataFrame's row index
# out of the file so the CSV holds only the `id` and `label` columns
test_final.to_csv('lokesh_sentiment_analysis.csv', index=False)