In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
df = pd.read_csv('tweets.csv')
df

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...
...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...
7916,7917,0,We would like to wish you an amazing day! Make...
7917,7918,0,Helping my lovely 90 year old neighbor with he...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7920 non-null   int64 
 1   label   7920 non-null   int64 
 2   tweet   7920 non-null   object
dtypes: int64(2), object(1)
memory usage: 185.8+ KB


In [5]:
df['label'].value_counts()

0    5894
1    2026
Name: label, dtype: int64

In [6]:
df.isna().sum()

id       0
label    0
tweet    0
dtype: int64

# Tweets preprocessing

In [7]:
import nltk
import re
import unidecode
from nltk.tokenize.toktok import ToktokTokenizer

In [8]:
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\^^", "", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text

In [9]:
def clean_keywords(word):
    return re.sub(r'%20', ' ', word)
def to_lowercase(word):
    return word.lower()
def remove_accents(word):
    return unidecode.unidecode(word)
def remove_punctuation(word):
    return re.sub(r"[!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n -' ]"," ",word)

In [10]:
def cleaning_URLs(word):
    return re.sub('((www.[^s]+)|(https?:\/\/.*?[\s+]))',' ',word)
def remove_mentions(word):
    return re.sub('@[\w]*',' ',word)

In [11]:
#Setting English stopwords
tokenizer1 = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer1.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

Removing all hyperlinks

In [12]:
df['cleaned_tweet'] = df['tweet'].apply(lambda x: cleaning_URLs(x))

Removing and replacing certain patterns

In [13]:
df['cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: remove_special_characters(x))

Removing @mentions of users

In [14]:
df['cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: remove_mentions(x))

Removing all special characters

In [15]:
df['cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: remove_punctuation(x))

Converting everything to unicode characters

In [16]:
df['cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: remove_accents(x))

Convert everything to lowercase

In [17]:
df['cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: to_lowercase(x))

Removing stopwords using NLTK corpus library

In [18]:
df['final_cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: remove_stopwords(x, True))

In [19]:
df

Unnamed: 0,id,label,tweet,cleaned_tweet,final_cleaned_tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy test android apps beauti...,fingerprint pregnancy test android apps beauti...
1,2,0,Finally a transparant silicon case ^^ Thanks t...,finally a transparant silicon case thanks ...,finally transparant silicon case thanks uncle ...
2,3,0,We love this! Would you go? #talk #makememorie...,we love this would you go talk makememories un...,love would go talk makememories unplug relax i...
3,4,0,I'm wired I know I'm George I was made that wa...,im wired i know im george i was made that way ...,im wired know im george made way iphone cute d...
4,5,1,What amazing service! Apple won't even talk to...,what amazing service apple wont even talk to m...,amazing service apple wont even talk question ...
...,...,...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...,live out loud lol liveoutloud selfie smile son...,live loud lol liveoutloud selfie smile sony mu...
7916,7917,0,We would like to wish you an amazing day! Make...,we would like to wish you an amazing day make ...,would like wish amazing day make every minute ...
7917,7918,0,Helping my lovely 90 year old neighbor with he...,helping my lovely 90 year old neighbor with he...,helping lovely 90 year old neighbor ipad morni...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...,finally got my smart pocket wifi stay connecte...,finally got smart pocket wifi stay connected a...


# Bag of Words model

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
bow_model = CountVectorizer(stop_words="english", ngram_range=(1,1))
bow_vector = bow_model.fit_transform(df['final_cleaned_tweet']).todense()

In [22]:
bow_df = pd.DataFrame(bow_vector)
bow_df.columns = sorted(bow_model.vocabulary_)
bow_df.head()

Unnamed: 0,000,00000,002,004,0051,007,008,01,010111,0101am,...,zs,zsofimonster,ztjeq,zumies,zune,zunehd,zurich,zv7tuur,zw1ck,zx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [24]:
x_train, x_test, y_train, y_test = train_test_split(bow_df, df['label'], test_size=0.15, random_state=134)

Using Logistic Regression

In [25]:
bow_log = LogisticRegression(fit_intercept=False)
bow_log.fit(x_train, y_train)
y_pred_bow_log = bow_log.predict(x_test)
print("Accuracy score of Bag of words model using logistic regression: " + str(round(accuracy_score(y_test, y_pred_bow_log) * 100, 2)) + "%")

Accuracy score of Bag of words model using logistic regression: 88.05%


Using Decision Tree Classifier

In [26]:
bow_dt = DecisionTreeClassifier()
bow_dt.fit(x_train, y_train)
y_pred_bow_dt = bow_dt.predict(x_test)
print("Accuracy score of Bag of words model using Decision Tree Classifier: " + str(round(accuracy_score(y_test, y_pred_bow_dt) * 100, 2)) + "%")

Accuracy score of Bag of words model using Decision Tree Classifier: 85.69%


Using Gaussian Naive Bayes

In [27]:
bow_gnb = GaussianNB()
bow_gnb.fit(x_train, y_train)
y_pred_bow_gnb = bow_gnb.predict(x_test)
print("Accuracy score of Bag of words model using Gaussian Naive Bayes: " + str(round(accuracy_score(y_test, y_pred_bow_gnb) * 100, 2)) + "%")

Accuracy score of Bag of words model using Gaussian Naive Bayes: 79.21%


# TFIDF Model

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
tfidf = TfidfVectorizer()
tdfif_dense = tfidf.fit_transform(df['final_cleaned_tweet']).todense()

In [30]:
tfidf_df = pd.DataFrame(tdfif_dense)
tfidf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21424,21425,21426,21427,21428,21429,21430,21431,21432,21433
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
x_train, x_test, y_train, y_test = train_test_split(tfidf_df, df['label'], test_size=0.15, random_state=134)

Using Logistic Regression

In [32]:
tfidf_log = LogisticRegression(fit_intercept=False)
tfidf_log.fit(x_train, y_train)
y_pred_tfidf_log = tfidf_log.predict(x_test)
print("Accuracy score of TFIDF model using logistic regression: " + str(round(accuracy_score(y_test, y_pred_tfidf_log) * 100, 2)) + "%")

Accuracy score of TFIDF model using logistic regression: 88.13%


Using Decision Tree Classifier

In [33]:
tfidf_dt = DecisionTreeClassifier()
tfidf_dt.fit(x_train, y_train)
y_pred_tfidf_dt = tfidf_dt.predict(x_test)
print("Accuracy score of TFIDF model using Decision Tree Classifier: " + str(round(accuracy_score(y_test, y_pred_tfidf_dt) * 100, 2)) + "%")

Accuracy score of TFIDF model using Decision Tree Classifier: 83.67%


Using Gaussian Naive Bayes

In [34]:
tfidf_gnb = GaussianNB()
tfidf_gnb.fit(x_train, y_train)
y_pred_tfidf_gnb = tfidf_gnb.predict(x_test)
print("Accuracy score of TFIDF model using Gaussian Naive Bayes: " + str(round(accuracy_score(y_test, y_pred_tfidf_gnb) * 100, 2)) + "%")

Accuracy score of TFIDF model using Gaussian Naive Bayes: 79.38%


# Word Embeddings Models

In [35]:
from gensim.models import Word2Vec as wtv

In [36]:
preprocessed_text = df['cleaned_tweet'].apply(lambda x: x.split())
preprocessed_text

0       [fingerprint, pregnancy, test, android, apps, ...
1       [finally, a, transparant, silicon, case, thank...
2       [we, love, this, would, you, go, talk, makemem...
3       [im, wired, i, know, im, george, i, was, made,...
4       [what, amazing, service, apple, wont, even, ta...
                              ...                        
7915    [live, out, loud, lol, liveoutloud, selfie, sm...
7916    [we, would, like, to, wish, you, an, amazing, ...
7917    [helping, my, lovely, 90, year, old, neighbor,...
7918    [finally, got, my, smart, pocket, wifi, stay, ...
7919    [apple, barcelona, apple, store, bcn, barcelon...
Name: cleaned_tweet, Length: 7920, dtype: object

Creating Cbow & skipgram models

In [37]:
cbow_w2v_model = wtv(preprocessed_text, vector_size=800, window=5, min_count=3, sg=0)
skgram_w2v_model = wtv(preprocessed_text, vector_size=800, window=5, min_count=3, sg=1)

In [38]:
print("cbow vocabulary size:", len(cbow_w2v_model.wv.index_to_key))
print("skipgram vocabulary size:", len(skgram_w2v_model.wv.index_to_key))

cbow vocabulary size: 4153
skipgram vocabulary size: 4153


Function to return average word embedding vector value

In [39]:
def get_embedding_w2v(doc_tokens, model):
    embeddings = []
    for tok in doc_tokens:
      if tok in model.wv.index_to_key:
          embeddings.append(model.wv.get_vector(tok))
    return np.mean(embeddings, axis=0)

### Skipgram model

In [40]:
X_x2v_model = preprocessed_text.apply(lambda x: get_embedding_w2v(x, skgram_w2v_model))
X_df_sg = pd.DataFrame(X_x2v_model.to_list())

In [41]:
x_train, x_test, y_train, y_test = train_test_split(X_df_sg, df['label'], test_size=0.15, random_state=134)

Using Logistic Regression

In [42]:
sg_log = LogisticRegression(fit_intercept=False)
sg_log.fit(x_train, y_train)
y_pred_sg_log = sg_log.predict(x_test)
print("Accuracy score of Skipgram model using logistic regression: " + str(round(accuracy_score(y_test, y_pred_sg_log) * 100, 2)) + "%")

Accuracy score of Skipgram model using logistic regression: 87.71%


Using Decision tree classifier

In [43]:
sg_dt = DecisionTreeClassifier()
sg_dt.fit(x_train, y_train)
y_pred_sg_dt = sg_dt.predict(x_test)
print("Accuracy score of Skipgram model using Decision Tree Classifier: " + str(round(accuracy_score(y_test, y_pred_sg_dt) * 100, 2)) + "%")

Accuracy score of Skipgram model using Decision Tree Classifier: 83.25%


Using Gaussian Naive Bayes

In [44]:
sg_gnb = GaussianNB()
sg_gnb.fit(x_train, y_train)
y_pred_sg_gnb = sg_gnb.predict(x_test)
print("Accuracy score of Skipgram model using Gaussian Naive Bayes: " + str(round(accuracy_score(y_test, y_pred_sg_gnb) * 100, 2)) + "%")

Accuracy score of Skipgram model using Gaussian Naive Bayes: 82.58%


### Cbow model

In [45]:
X_x2v_model = preprocessed_text.apply(lambda x: get_embedding_w2v(x, cbow_w2v_model))
X_df_cbow = pd.DataFrame(X_x2v_model.to_list())

In [46]:
x_train, x_test, y_train, y_test = train_test_split(X_df_cbow, df['label'], test_size=0.15, random_state=134)

In [47]:
cbow_log = LogisticRegression(fit_intercept=False)
cbow_log.fit(x_train, y_train)
y_pred_cbow_log = cbow_log.predict(x_test)
print("Accuracy score of Cbow model using logistic regression: " + str(round(accuracy_score(y_test, y_pred_cbow_log) * 100, 2)) + "%")

Accuracy score of Cbow model using logistic regression: 84.34%


In [48]:
cbow_dt = DecisionTreeClassifier()
cbow_dt.fit(x_train, y_train)
y_pred_cbow_dt = cbow_dt.predict(x_test)
print("Accuracy score of Cbow model using Decision Tree Classifier: " + str(round(accuracy_score(y_test, y_pred_cbow_dt) * 100, 2)) + "%")

Accuracy score of Cbow model using Decision Tree Classifier: 79.97%


In [49]:
cbow_gnb = GaussianNB()
cbow_gnb.fit(x_train, y_train)
y_pred_cbow_gnb = cbow_gnb.predict(x_test)
print("Accuracy score of Cbow model using Gaussian Naive Bayes: " + str(round(accuracy_score(y_test, y_pred_cbow_gnb) * 100, 2)) + "%")

Accuracy score of Cbow model using Gaussian Naive Bayes: 76.68%


# Conclusion

In [50]:
predictions = ((y_pred_bow_log, y_pred_bow_dt, y_pred_bow_gnb),
           (y_pred_tfidf_log, y_pred_tfidf_dt, y_pred_tfidf_gnb),
           (y_pred_sg_log, y_pred_sg_dt, y_pred_sg_gnb),
           (y_pred_cbow_log, y_pred_cbow_dt, y_pred_cbow_gnb))

rounded_accuracy_scores = []
for item in predictions:
    temp = []
    for val in item:
        temp.append(round(accuracy_score(y_test, val) * 100, 2))
    rounded_accuracy_scores.append(temp)

In [51]:
algorithms = ("Logistic Regression", "Decision Tree", "Naive Bayes")
models = ("Bag of Words", "TFIDF", "Skipgram", "Cbow")

results_df = pd.DataFrame(rounded_accuracy_scores, columns=algorithms)
results_df['models'] = models
results_df.insert(0, 'models', results_df.pop("models"))
results_df.set_index('models', inplace=True)

Accuracy scores dataframe

In [52]:
results_df

Unnamed: 0_level_0,Logistic Regression,Decision Tree,Naive Bayes
models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bag of Words,88.05,85.69,79.21
TFIDF,88.13,83.67,79.38
Skipgram,87.71,83.25,82.58
Cbow,84.34,79.97,76.68


### From the data the TFIDF and Bag of words models trained using Logistic Regression gives the best accuracy