In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Starbucks.csv")

In [3]:
df.head()

Unnamed: 0,Tweets,Sentiments
0,I wish there was an app that told me how long ...,0
1,That s awesome No wonder its such a huge succe...,1
2,Damn i wish my girlfriend didn t still work at...,0
3,I think I deserve Starbucks today,1
4,starbucks reserve roastery at tropicana garden...,1


In [4]:
df['Tweets'][0]

'I wish there was an app that told me how long the line is at Starbuck'

In [5]:
df['Sentiments'][0]

0

In [7]:
df['Sentiments'].value_counts()

## here 0 represent negative sentiment about that tweet and 1 represent positive tweet 

1    134
0     66
Name: Sentiments, dtype: int64

In [8]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AASHU\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### USING NLTK LIBRARY FOR DATA PREPROCESSING

In [13]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

In [14]:
ps = PorterStemmer()
## object used for stemming

In [21]:
## begin with the preprocessing with the text data

corpus = []

for i in range(0,len(df)):
    review = re.sub('[^a-zA-Z]',' ',df['Tweets'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word)for word in review if not word in stopwords.words('english')]
    
    review = ' '.join(review)
    
    corpus.append(review)
    
    
    
    

In [22]:
corpus

['wish app told long line starbuck',
 'awesom wonder huge success everyon know starbuck never co au oh wait',
 'damn wish girlfriend still work starbuck much dirt mani peopl woke feel messi today smh',
 'think deserv starbuck today',
 'starbuck reserv roasteri tropicana garden soon',
 'whoever paid breakfast coffe drive thru front starbuck love',
 'want breakfast starbuck',
 'sign door williamsvil store close unexpectedli',
 'starbuck near job close dayssss covid',
 'treat starbuck',
 'want work starbuck hire england click link bio detail job shi',
 'okay guy starbuck take order made whole day',
 'work home week want run get starbuck coffe drink coffe home',
 'mayb wear mask protect other realli much ask popul will risk run',
 'could alway ask boy learn recip thank later',
 'starbuck sbux short shorten coffe bean without go cibai cibai',
 'first time left sauga nearli month drop mom work guzzl starbuck coffe feel',
 'wanna rock black live matter shirt better starbuck',
 'starbuck al kh

### now we are using stop words which is present in count vectorizer

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 2000)

#### Now dividing our data with independent and dependent feature

In [24]:
X = cv.fit_transform(corpus).toarray()

In [25]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [28]:
y = df['Sentiments'].values

In [29]:
y

array([0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1], dtype=int64)

Applying multinomial naive bayes algorithm to separate both classes

In [31]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y , test_size = 0.25 , random_state = 10)

In [34]:
from sklearn.naive_bayes import MultinomialNB

sentiment_MNB_model = MultinomialNB()

sentiment_MNB_model.fit(X_train , y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [35]:
y_pred = sentiment_MNB_model.predict(X_test)

In [36]:
from sklearn.metrics import accuracy_score

In [37]:
accuracy_score(y_test,y_pred)

0.6

In [38]:
from sklearn.metrics import confusion_matrix

In [39]:
confusion_matrix(y_test,y_pred)

array([[ 7, 11],
       [ 9, 23]], dtype=int64)

### NOW USING LSTMS FOR BETTER PREDICTIONS

In [51]:
import tensorflow as tf

from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout


In [52]:
vocabulary_size = 5000

In [53]:
one_hot_representation = [one_hot(words,vocabulary_size) for words in corpus]

In [54]:
one_hot_representation

[[468, 1813, 2521, 778, 2719, 1374],
 [2741, 477, 4279, 4938, 2990, 4981, 1374, 4681, 240, 2574, 2540, 388],
 [3439,
  468,
  3102,
  2976,
  3165,
  1374,
  4172,
  1824,
  1418,
  673,
  3881,
  3747,
  2444,
  1457,
  4415],
 [2145, 2329, 1374, 1457],
 [1374, 2, 94, 2382, 1422, 3320],
 [3511, 3504, 461, 2654, 3209, 2224, 535, 1374, 1002],
 [1627, 461, 1374],
 [1277, 4459, 1896, 3062, 1506, 3472],
 [1374, 437, 1083, 1506, 3831, 145],
 [3863, 1374],
 [1627, 3165, 1374, 4964, 3163, 4073, 3660, 4221, 970, 1083, 1644],
 [3401, 702, 1374, 1176, 370, 3787, 1966, 324],
 [3165, 232, 4485, 1627, 3506, 3951, 1374, 2654, 4825, 2654, 232],
 [1820, 4117, 3117, 2867, 1099, 3350, 4172, 1636, 1196, 3646, 386, 3506],
 [1763, 2034, 1636, 624, 3410, 1645, 3007, 957],
 [1374, 4664, 196, 115, 2654, 1494, 565, 805, 3641, 3641],
 [3858,
  1829,
  4202,
  2628,
  1709,
  2644,
  2493,
  4768,
  3165,
  3984,
  1374,
  2654,
  3747],
 [1543, 3295, 3272, 873, 3095, 2516, 3803, 1374],
 [1374, 4939, 558, 4939, 

In [55]:
sentence_length = 20
embedded_values = pad_sequences(one_hot_representation , padding = 'post' , maxlen = sentence_length)

In [56]:
embedded_values

array([[ 468, 1813, 2521, ...,    0,    0,    0],
       [2741,  477, 4279, ...,    0,    0,    0],
       [3439,  468, 3102, ...,    0,    0,    0],
       ...,
       [2866, 3350,  468, ...,    0,    0,    0],
       [3429, 3863, 2647, ...,    0,    0,    0],
       [1374, 2654,  411, ...,    0,    0,    0]])

In [57]:
embedded_vector_features = 40
model = Sequential()
model.add(Embedding(vocabulary_size , embedded_vector_features , input_length = sentence_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1 , activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy' , optimizer = 'adam' , metrics  =['accuracy'])
model.summary()

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
dropout (Dropout)            (None, 20, 40)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________


In [58]:
import numpy as np

x_final = np.array(embedded_values)
y_final = np.array(y)

In [59]:
x_final

array([[ 468, 1813, 2521, ...,    0,    0,    0],
       [2741,  477, 4279, ...,    0,    0,    0],
       [3439,  468, 3102, ...,    0,    0,    0],
       ...,
       [2866, 3350,  468, ...,    0,    0,    0],
       [3429, 3863, 2647, ...,    0,    0,    0],
       [1374, 2654,  411, ...,    0,    0,    0]])

In [60]:
y_final

array([0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1], dtype=int64)

In [70]:
from sklearn.model_selection import train_test_split
x_train2,x_test2 , y_train2 , y_test2 = train_test_split(x_final , y_final ,test_size = 0.25 ,random_state =30)

In [71]:
model.fit(x_train2,y_train2,validation_data = (x_test2 , y_test2) , epochs =20 , batch_size =32)

Train on 150 samples, validate on 50 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x223dfc04348>

In [72]:
y_pred2 = model.predict(x_test2)

In [74]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test2 , y_pred2)

0.9781746031746031

WE GOT A WONDERFUL ACCURACY USING LSTMS AND TRAIN IT OVER 20 EPOCHS