#Importing Libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import string
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')

nltk.download('wordnet')

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# Importing Dataset

In [4]:
data = pd.read_csv('/content/tweets.csv',index_col='id')
print(data)

      label                                              tweet
id                                                            
1         0  #fingerprint #Pregnancy Test https://goo.gl/h1...
2         0  Finally a transparant silicon case ^^ Thanks t...
3         0  We love this! Would you go? #talk #makememorie...
4         0  I'm wired I know I'm George I was made that wa...
5         1  What amazing service! Apple won't even talk to...
...     ...                                                ...
7916      0  Live out loud #lol #liveoutloud #selfie #smile...
7917      0  We would like to wish you an amazing day! Make...
7918      0  Helping my lovely 90 year old neighbor with he...
7919      0  Finally got my #smart #pocket #wifi stay conne...
7920      0  Apple Barcelona!!! #Apple #Store #BCN #Barcelo...

[7920 rows x 2 columns]


In [5]:
pd.set_option('display.max_colwidth', None)

data.head()

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


# Creating  Functions for text processing

In [6]:
# Removing punctuations

def remove_punctuations(document):
    '''Function to remove punctuations in a document.
    (here document corresponds to a single review)'''

    punc_free = []
    for letter in document:
        if letter not in string.punctuation:
            punc_free.append(letter)
    return ''.join(punc_free)  #creating  the document without punctuations

In [7]:
# Tokenization

def tokenise(document):
    ''' Converts a document/review to a list of words'''
    tokens = nltk.word_tokenize(document)

    return tokens  #returns a list of words

In [8]:
# Removal of stopwords

stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(words_list):
    '''Function to remove the stop words from a list of words after tokenization'''

    stopword_removed = []
    for word in words_list:
        if word not in stopwords:
            stopword_removed.append(word)

    return stopword_removed #returns a list of words

In [9]:
# Lemmatization

def lemmatizer(words_list):
    '''Function to lemmatize a list of words'''
    lemmatized = []
    for word in words_list:
        lemmatized.append(nltk.WordNetLemmatizer().lemmatize(word))

    return lemmatized  #returns a list of lemmatized words


In [10]:
# Combining all the functions

def text_preprocessor(corpus):

    preprocessed = []

    for document in corpus:
        new_doc = remove_punctuations(document)
        new_doc = new_doc.lower()
        words_list = tokenise(new_doc)
        words_list = remove_stopwords(words_list)
        words_list = lemmatizer(words_list)

        preprocessed.append(' '.join(words_list))

    return preprocessed

# Preprocessing

In [11]:

data.isna().sum()

Unnamed: 0,0
label,0
tweet,0


In [12]:
data.shape

(7920, 2)

In [13]:
data['preprocessed'] = text_preprocessor(data['tweet'])

In [14]:
print(data)

      label  \
id            
1         0   
2         0   
3         0   
4         0   
5         1   
...     ...   
7916      0   
7917      0   
7918      0   
7919      0   
7920      0   

                                                                                                                                       tweet  \
id                                                                                                                                             
1           #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone   
2        Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/   
3                We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu   
4                           I'm wired I know I'm George I was made that way ;) #ipho

# Creating BOW Model

In [23]:
x_bow = CountVectorizer().fit_transform(data['preprocessed'])
y = data['label']

In [24]:
#Train Test Split
x_train_bow , x_test_bow, y_train, y_test = train_test_split(x_bow,y,test_size = 0.2, random_state = 30)

In [25]:
#Fitting the data to a model
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(x_train_bow,y_train)

In [26]:
#Prediction and Checking accuracy
y_pred_bow = classifier.predict(x_test_bow)

In [27]:
accuracy_score(y_pred_bow,y_test)

0.8737373737373737

# TF-IDF Model

# Creating Word 2 Vec Model

In [32]:
x = data['tweet'].apply(lambda x: simple_preprocess(x))
y = data['label']

In [33]:
print(x)

id
1                 [fingerprint, pregnancy, test, https, goo, gl, mfqv, android, apps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]
2                  [finally, transparant, silicon, case, thanks, to, my, uncle, yay, sony, xperia, sonyexperias, http, instagram, com, yget, jc, jm]
3                       [we, love, this, would, you, go, talk, makememories, unplug, relax, iphone, smartphone, wifi, connect, http, fb, me, lsupcu]
4                                                  [wired, know, george, was, made, that, way, iphone, cute, daventry, home, http, instagr, am, li_]
5                    [what, amazing, service, apple, won, even, talk, to, me, about, question, have, unless, pay, them, for, their, stupid, support]
                                                                            ...                                                                     
7916                                    [live, out, loud, lol, liveoutloud, selfie, smile, sony, music,

###

In [34]:
#Applying W2V
w2v = Word2Vec(x, min_count=1)

In [35]:
for i in range(10):
    print(w2v.wv.index_to_key[i], w2v.wv[i])   #index to key returns the list of words corresponding to keys

iphone [-0.7075126   1.0667526   0.10219511  0.4134753   0.3291784  -1.167353
  0.8406994   3.0289958  -0.9716496  -1.1035286  -0.38628516 -1.7293599
 -0.04680045  0.5034943   0.6019397  -0.39447215  0.5862596  -0.5541796
 -0.75063515 -2.478635    0.6518495   0.26246414  0.4430258  -1.1248001
 -0.33226871 -0.32045987 -1.0563687  -1.0577935  -0.13691929  0.4438361
  1.2842072  -0.5773317   0.79315263 -1.6264483  -0.45915672  1.1814289
 -0.05243497 -0.3533681  -0.4255706  -2.0459247  -0.22642562 -0.7329858
  0.02673694  0.667948    0.22948372 -0.02875256 -0.904885   -0.01707415
  0.5502244   1.1634775   0.40261093 -1.087774   -0.5879634  -0.0483977
 -0.32172078  0.08655506  0.7757845  -0.23647998 -0.46207592  0.6690599
  0.7053063   0.76335526 -0.13626605  0.11113198 -1.5644227   1.2943697
  0.16713907  1.1242101  -1.027551    0.92003435 -0.46760184  0.8338782
  1.3053452  -0.45496446  1.3331914  -0.20583268  0.33388707  0.45990157
 -0.7161988   0.1654799  -1.1178607   0.14314234 -0.9632

In [36]:

words= w2v.wv.index_to_key
x_w2v = np.array([np.array([w2v.wv[word] for word in doc if word in words]).mean(axis = 0) for doc in x])
x_w2v.shape



(7920, 100)

In [37]:
#Train test split
x_train_w2v , x_test_w2v, y_train, y_test = train_test_split(x_w2v,y,test_size = 0.2, random_state = 42)

In [38]:
#Model fitting and Prediction
classifier.fit(x_train_w2v,y_train)
y_pred = classifier.predict(x_test_w2v)

In [39]:
accuracy_score(y_pred,y_test)

0.8667929292929293

In [40]:
x_tfidf = TfidfVectorizer().fit_transform(data['preprocessed'])
y = data['label']

In [41]:
#Train test split
x_train_tfidf , x_test_tfidf, y_train, y_test = train_test_split(x_tfidf,y,test_size = 0.2, random_state =42 )

In [42]:
#Model fitting and prediction
classifier.fit(x_train_tfidf,y_train)
y_pred = classifier.predict(x_test_tfidf)

In [43]:
accuracy_score(y_pred,y_test)

0.8693181818181818