In [69]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [70]:
df = pd.read_csv('tweets.csv')
df

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...
...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...
7916,7917,0,We would like to wish you an amazing day! Make...
7917,7918,0,Helping my lovely 90 year old neighbor with he...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...


In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7920 non-null   int64 
 1   label   7920 non-null   int64 
 2   tweet   7920 non-null   object
dtypes: int64(2), object(1)
memory usage: 185.8+ KB


In [72]:
df['label'].value_counts()

0    5894
1    2026
Name: label, dtype: int64

In [73]:
df.isna().sum()

id       0
label    0
tweet    0
dtype: int64

In [74]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
import unidecode

[nltk_data] Downloading package stopwords to /home/neeraj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\^^", "", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text

In [76]:
def clean_keywords(word):
    return re.sub(r'%20', ' ', word)
def to_lowercase(word):
    return word.lower()
def remove_accents(word):
    return unidecode.unidecode(word)
def remove_punctuation(word):
    return re.sub(r"[!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n -' ]"," ",word)

In [77]:
def cleaning_URLs(word):
    return re.sub('((www.[^s]+)|(https?:\/\/.*?[\s+]))',' ',word)
def remove_mentions(word):
    return re.sub('@[\w]*',' ',word)

In [78]:
df['cleaned_tweet'] = df['tweet'].apply(lambda x: cleaning_URLs(x))

In [79]:
df['cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: remove_special_characters(x))

In [80]:
df['cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: remove_mentions(x))

In [81]:
df['cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: remove_punctuation(x))

In [82]:
df['cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: remove_accents(x))

In [83]:
df['cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: to_lowercase(x))

In [84]:
df

Unnamed: 0,id,label,tweet,cleaned_tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy test android apps beauti...
1,2,0,Finally a transparant silicon case ^^ Thanks t...,finally a transparant silicon case thanks ...
2,3,0,We love this! Would you go? #talk #makememorie...,we love this would you go talk makememories un...
3,4,0,I'm wired I know I'm George I was made that wa...,im wired i know im george i was made that way ...
4,5,1,What amazing service! Apple won't even talk to...,what amazing service apple wont even talk to m...
...,...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...,live out loud lol liveoutloud selfie smile son...
7916,7917,0,We would like to wish you an amazing day! Make...,we would like to wish you an amazing day make ...
7917,7918,0,Helping my lovely 90 year old neighbor with he...,helping my lovely 90 year old neighbor with he...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...,finally got my smart pocket wifi stay connecte...


In [85]:
from gensim.models import Word2Vec as wtv

In [86]:
preprocessed_text = df['cleaned_tweet'].apply(lambda x: x.split())

In [87]:
preprocessed_text

0       [fingerprint, pregnancy, test, android, apps, ...
1       [finally, a, transparant, silicon, case, thank...
2       [we, love, this, would, you, go, talk, makemem...
3       [im, wired, i, know, im, george, i, was, made,...
4       [what, amazing, service, apple, wont, even, ta...
                              ...                        
7915    [live, out, loud, lol, liveoutloud, selfie, sm...
7916    [we, would, like, to, wish, you, an, amazing, ...
7917    [helping, my, lovely, 90, year, old, neighbor,...
7918    [finally, got, my, smart, pocket, wifi, stay, ...
7919    [apple, barcelona, apple, store, bcn, barcelon...
Name: cleaned_tweet, Length: 7920, dtype: object

In [88]:
cbow_w2v_model = wtv(preprocessed_text, vector_size=300, window=6, min_count=3, sg=0)
skgram_w2v_model = wtv(preprocessed_text, vector_size=300, window=6, min_count=3, sg=1)

In [89]:
print("cbow vocalulary size:", len(cbow_w2v_model.wv.index_to_key))
print("skipgram vocalulary size:", len(skgram_w2v_model.wv.index_to_key))

cbow vocalulary size: 4153
skipgram vocalulary size: 4153


In [90]:
skgram_w2v_model.wv.most_similar('fingerprint')

[('rumors', 0.9972054958343506),
 ('history', 0.9967876076698303),
 ('record', 0.9965904355049133),
 ('preparing', 0.9963428974151611),
 ('sex', 0.9960336685180664),
 ('andriod', 0.9959642887115479),
 ('gb', 0.9958667755126953),
 ('overit', 0.995806097984314),
 ('learning', 0.9957908391952515),
 ('loop', 0.9956794381141663)]

In [91]:
def get_embeddimng_w2v(doc_tokens, model):
    embeddings = []
    for tok in doc_tokens:
      if tok in model.wv.index_to_key:
          embeddings.append(model.wv.get_vector(tok))
    return np.mean(embeddings, axis=0)

In [92]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score

## Skipgram model

In [93]:
X_x2v_model = preprocessed_text.apply(lambda x: get_embeddimng_w2v(x, skgram_w2v_model))
X_df_sg = pd.DataFrame(X_x2v_model.to_list())

In [94]:
x_train, x_test, y_train, y_test = train_test_split(X_df_sg, df['label'], test_size=0.15, random_state=134)

In [95]:
log = LogisticRegression()
log.fit(x_train, y_train)

In [96]:
y_pred_log = log.predict(x_test)
print("accuracy score :",accuracy_score(y_test, y_pred_log))

accuracy score : 0.8872053872053872


In [97]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)

In [98]:
y_pred_gnb = gnb.predict(x_test)
accuracy_score(y_test, y_pred_gnb)

0.8291245791245792

# Cbow model

In [99]:
X_x2v_model = preprocessed_text.apply(lambda x: get_embeddimng_w2v(x, cbow_w2v_model))
X_df_cbow = pd.DataFrame(X_x2v_model.to_list())

In [100]:
x_train, x_test, y_train, y_test = train_test_split(X_df_cbow, df['label'], test_size=0.15, random_state=134)

In [101]:
log2 = LogisticRegression()
log2.fit(x_train, y_train)

In [102]:
y_pred_log2 = log2.predict(x_test)
print("accuracy score :",accuracy_score(y_test, y_pred_log2))

accuracy score : 0.8476430976430976


In [103]:
gnb2 = GaussianNB()
gnb2.fit(x_train, y_train)

In [104]:
y_pred_gnb2 = gnb2.predict(x_test)
accuracy_score(y_test, y_pred_gnb2)

0.7777777777777778