## Assignment 05 -  Sentiment Analysis

In [1]:
#!pip install nltk
#!pip install gensim

##  Importing the Library

In [2]:
import pandas as pd
import re
import nltk
import gensim
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## Load the Data

In [5]:
data=pd.read_csv("/content/tweets.csv")

In [6]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [7]:
data.isnull().sum()

Unnamed: 0,0
id,0
label,0
tweet,0


In [8]:
data['label'].unique()

array([0, 1])

In [9]:
len(data)

7920

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7920 non-null   int64 
 1   label   7920 non-null   int64 
 2   tweet   7920 non-null   object
dtypes: int64(2), object(1)
memory usage: 185.8+ KB


## Cleaning the data

In [11]:
data=data[['label','tweet']]

In [12]:
data

Unnamed: 0,label,tweet
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,0,Finally a transparant silicon case ^^ Thanks t...
2,0,We love this! Would you go? #talk #makememorie...
3,0,I'm wired I know I'm George I was made that wa...
4,1,What amazing service! Apple won't even talk to...
...,...,...
7915,0,Live out loud #lol #liveoutloud #selfie #smile...
7916,0,We would like to wish you an amazing day! Make...
7917,0,Helping my lovely 90 year old neighbor with he...
7918,0,Finally got my #smart #pocket #wifi stay conne...


In [13]:
def preprocess_text(text):
  sentences=re.split(r'[.!?]', text)
  lemmatizer=WordNetLemmatizer()
  processed_sentence=[]
  for sentence in sentences:
    tokens=word_tokenize(sentence)
    tokens=[word.lower() for word in tokens if word.isalpha()]
    tokens=[word for word in tokens if word not in stopwords.words('english')]
    tokens=[lemmatizer.lemmatize(word) for word in tokens]
    if tokens:
      processed_sentence.append(" ".join(tokens))
  return processed_sentence


In [14]:
data["Cleaned_text"]=data['tweet'].apply(lambda x: preprocess_text(x))

In [15]:
data.head()

Unnamed: 0,label,tweet,Cleaned_text
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,"[fingerprint pregnancy test http, android apps..."
1,0,Finally a transparant silicon case ^^ Thanks t...,[finally transparant silicon case thanks uncle...
2,0,We love this! Would you go? #talk #makememorie...,"[love, would go, talk makememories unplug rela..."
3,0,I'm wired I know I'm George I was made that wa...,[wired know george made way iphone cute davent...
4,1,What amazing service! Apple won't even talk to...,"[amazing service, apple wo even talk question ..."


## Splitting the data

In [16]:
x_train, x_test, y_train, y_test= train_test_split(
    data['Cleaned_text'], data['label'],
    test_size=0.2,
    random_state=13
)

In [17]:
x_train

Unnamed: 0,Cleaned_text
1330,"[available tpu ca, check http, co, iphone case..."
6222,[hateapple never work like suppose useless cus...
1365,"[view, travel traveldiaries travelphotography ..."
4924,"[apple lost customer, month ipamini still nigh..."
928,"[invested player match tv amazon prime deal, d..."
...,...
2790,"[http, justinbieber apple ipad iphone may, http]"
7696,"[caught vortex software update o discrepancy, ..."
74,"[happy sunday, samsung phone smile inception w..."
6320,[ask digital camera flash stop working need ca...


In [18]:
x_train.shape

(6336,)

In [19]:
x_test.shape

(1584,)

In [20]:
y_train

Unnamed: 0,label
1330,0
6222,1
1365,0
4924,1
928,0
...,...
2790,0
7696,1
74,0
6320,1


## Vectorizer the Data

In [21]:
model=gensim.models.Word2Vec(x_train, min_count=1)

In [22]:
words= model.wv.index_to_key
words

['twitter',
 'http',
 'instagram',
 'apple',
 'pic',
 'news photography fashion health fail tech ipad iphone funny lol',
 'iphone',
 'gain follower rt must follow follow back follow everyone rts gain iphone sougofollow',
 'would like wish amazing day',
 'sup surf fun capetown funny sexy samsung pic',
 'follow capetownsup instagram http',
 'co',
 'love',
 'rhyme iphone',
 'keep ya thing',
 'hey guy',
 'iphone http',
 'html андроид android game news io apple iphone',
 'look http',
 'android http',
 'sale',
 'android apps unitedstate cute color igers iphoneonly guitarplayer iphone',
 'android apps beautiful cute color igers iphoneonly iphone',
 'ebay',
 'redbubble',
 'prophet husband',
 'new phone',
 'youtube',
 'samsung',
 'apple iphone',
 'html',
 'gain follower rt must follow follow back follow everyone rts gain iphone sougofollow ff',
 'exquisite squishy random charm http',
 'check http',
 'iphone strap toy decor free shop style sale today giveawaypic',
 'happy',
 'fuck',
 'com',
 'mo

In [23]:
model.wv['apple']

array([-8.67612381e-03,  1.11035015e-02,  1.17128494e-03,  4.42312303e-04,
        5.77644119e-03, -4.63615777e-03,  2.65751150e-03,  9.17621981e-03,
        4.17224783e-03, -1.01391105e-02,  1.00548202e-02,  3.03418073e-03,
        3.38136079e-03, -5.46626002e-03,  1.00850817e-02, -2.69720331e-03,
        1.06156180e-02, -5.65441139e-03, -9.56771243e-03,  4.83175786e-03,
        1.87904842e-03, -3.12777306e-03,  1.10531235e-02,  9.35880281e-03,
       -8.65739770e-03,  2.75340956e-03,  6.89980853e-03,  3.38952756e-03,
        5.89843257e-04, -8.02302238e-05,  2.29967805e-03, -5.35097392e-03,
       -7.73315132e-03, -3.15018045e-03,  2.51408131e-03,  9.60680377e-03,
        1.18148755e-02, -6.93685049e-03, -9.72571597e-03,  8.56227800e-03,
        2.91755563e-03,  4.51191235e-03,  5.62527031e-03, -3.12775187e-03,
        8.30421969e-03,  2.31963862e-03,  1.65905594e-03, -3.16052046e-03,
       -2.88119004e-03, -9.26915323e-04,  6.11332059e-03, -1.22857175e-03,
       -1.10851834e-02, -

In [24]:
x_train_vec = []
for ls in x_train:
  vectors=[]
  for word in ls:
    if word in words:
      vectors.append(model.wv[word])
  x_train_vec.append(np.array(vectors))


In [25]:
x_test_vec=[]
for ls in x_test:
  vector=[]
  for word in ls:
    if word in words:
      vector.append(model.wv[word])
  x_test_vec.append(np.array(vector))

In [26]:
x_train_vec_avg=[]
for v in x_train_vec:
  if v.size:
    x_train_vec_avg.append(v.mean(axis=0))
  else:
    x_train_vec_avg.append(np.zeros(100, dtype=float))

In [27]:
x_test_vec_avs=[]
for v in x_test_vec:
  if v.size:
    x_test_vec_avs.append(v.mean(axis=0))
  else:
    x_test_vec_avs.append(np.zeros(100,dtype=float))

## Creating the model

In [28]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(x_train_vec_avg, y_train)

In [29]:
from sklearn.metrics import accuracy_score
y_pred=rf.predict(x_test_vec_avs)
accuracy_score(y_test, y_pred)

0.759469696969697

## Another way of doing


In [50]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report


In [39]:
clf=Pipeline([
    ("Vectorizer", TfidfVectorizer()),
    ("Random Forest", (RandomForestClassifier()))
])

In [40]:
print(type(x_train[0]))
print(x_train[0])

<class 'str'>
available tpu ca check http co iphone case music discount


# changing the list into string for the TfidfVectorizer

In [47]:
x_train = [' '.join(tokens) if isinstance(tokens, list) else tokens for tokens in x_train]


In [48]:
x_test = [' '.join(tokens) if isinstance(tokens, list) else tokens for tokens in x_test]

In [42]:
clf.fit(x_train, y_train)

In [49]:
y_pred=clf.predict(x_test)

In [51]:
print(accuracy_score(y_test, y_pred))

0.8825757575757576


In [52]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1188
           1       0.77      0.76      0.76       396

    accuracy                           0.88      1584
   macro avg       0.84      0.84      0.84      1584
weighted avg       0.88      0.88      0.88      1584

