In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import spacy
import string
import warnings
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from autocorrect import Speller
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity
from catboost import CatBoostClassifier
import pymorphy2
from textblob import TextBlob
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/lastsign/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lastsign/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lastsign/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/lastsign/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
df_positive = pd.read_csv("data/p00_tweets/processedNegative.csv", squeeze=True).T.reset_index()
df_neutral = pd.read_csv("data/p00_tweets/processedNeutral.csv", squeeze=True).T.reset_index()
df_negative = pd.read_csv("data/p00_tweets/processedPositive.csv", squeeze=True).T.reset_index()
df_positive.head()

Unnamed: 0,index
0,How unhappy some dogs like it though
1,talking to my over driver about where I'm goin...
2,Does anybody know if the Rand's likely to fall...
3,I miss going to gigs in Liverpool unhappy
4,There isnt a new Riverdale tonight ? unhappy


In [3]:
df_positive['sentiments'] = 1
df_neutral['sentiments'] = 0
df_negative['sentiments'] = -1

In [4]:
df = pd.concat([df_negative, df_positive, df_neutral], axis=0, ignore_index=True)
df.columns = ['tweets','sentiments']
df['sentiments'].value_counts()

 0    1570
-1    1186
 1    1117
Name: sentiments, dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3873 entries, 0 to 3872
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweets      3873 non-null   object
 1   sentiments  3873 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 60.6+ KB


## Data preparation

In [6]:
def remove_symbols(text):
    # remove hyperlinks
    document = re.sub(r'http\S+', '', text.lower())
    # remove hashtags symbols
    document = re.sub(r'#', '', document)        
    ## remove repeating characters
    pattern = re.compile(r'(.)\1{2,}', re.DOTALL)
    document = pattern.sub(r"\1\1", document)
    ## remove usernames
    document = re.sub(r'@\S+', '', document)
    ## remove all digits and numbers
    document = re.sub(r'\d+', '', document)
    # remove special symbols 
    document = re.sub(r'[^\w\s]', '', document)
    # replace few spaces to a single one
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    return document

In [7]:
df['tweets'] = df['tweets'].apply(remove_symbols)
df = df.drop_duplicates()

In [8]:
df['sentiments'].value_counts()

 0    1472
-1     979
 1     975
Name: sentiments, dtype: int64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df['tweets'], df['sentiments'], test_size=0.2, random_state=0)

In [10]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2740,), (686,), (2740,), (686,))

In [11]:
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
spell = Speller(lang='en')

def tokenize_tweet(tweet):
    tokens = tknzr.tokenize(tweet)
    return tokens

def stemming(tokens):
    return [ps.stem(token) for token in tokens]

def lemmanization(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

def misspelling(tokens):
    return [spell(token) for token in tokens]

def stop_words_remove(tokens):
    return [token for token in tokens if not token.lower() in stop_words]

def preprocessing(tweet, add_mothods):
    data = tokenize_tweet(tweet)
    for method in add_mothods:
        data = method(data)
    return data

def get_tokenizer(methods):
    def wrapper(tweet):
        return preprocessing(tweet, methods)
    
    return wrapper

In [12]:
preproc_methods = {
    'tokenize': [],
    'stemming': [stemming],
    'lemmanization': [lemmanization],
    'stemming_stop_words_remove': [stemming, stop_words_remove],
    'misspelling': [misspelling],
    'lemmanization_misspelling': [lemmanization, misspelling]
}

In [13]:
def preprocessCV(methods: dict):
    best_accuracy_bin_vec = 0
    best_accuracy_count_vec = 0
    best_accuracy_tfidf_vec = 0
    results = {}
    for key, methods in methods.items():
        
        tokenizer_method = get_tokenizer(methods)
        
        clf = CatBoostClassifier(verbose=False)
        
        bin_vectorizer = CountVectorizer(tokenizer=lambda x: tokenizer_method(x), binary=True)
        count_vectorizer = CountVectorizer(tokenizer=lambda x: tokenizer_method(x))
        tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: tokenizer_method(x))
        
        grid_pipeline_bin_vec = Pipeline([
            ('vectorizer', bin_vectorizer),
            ('model', clf)
        ])
        
        grid_pipeline_count_vec = Pipeline([
            ('vectorizer', count_vectorizer),
            ('model', clf)
        ])
        
        grid_pipeline_tfidf_vec = Pipeline([
            ('vectorizer', tfidf_vectorizer),
            ('model', clf)
        ])
        
        grid_pipeline_bin_vec.fit(X_train, y_train)
        grid_pipeline_count_vec.fit(X_train, y_train)
        grid_pipeline_tfidf_vec.fit(X_train, y_train)

        
        y_pred_bin_vec = grid_pipeline_bin_vec.predict(X_test)
        y_pred_count_vec = grid_pipeline_count_vec.predict(X_test)
        y_pred_tfidf_vec = grid_pipeline_tfidf_vec.predict(X_test)
        
        accuracy_bin_vec = accuracy_score(y_test, y_pred_bin_vec)
        accuracy_count_vec = accuracy_score(y_test, y_pred_count_vec)
        accuracy_tfidf_vec = accuracy_score(y_test, y_pred_tfidf_vec)
        
        if accuracy_bin_vec > best_accuracy_bin_vec:
            best_accuracy_bin_vec = accuracy_bin_vec
            results['accuracy_bin_vec'] = accuracy_bin_vec
            results['methods_names'] = key
            results['model_bin_vec'] = grid_pipeline_bin_vec
            
        if accuracy_count_vec > best_accuracy_count_vec:
            best_accuracy_count_vec = accuracy_count_vec
            results['accuracy_count_vec'] = accuracy_count_vec
            results['methods_names'] = key
            results['model_count_vec'] = grid_pipeline_count_vec

        if accuracy_tfidf_vec > best_accuracy_tfidf_vec:
            best_accuracy_tfidf_vec = accuracy_tfidf_vec
            results['accuracy_tfidf_vec'] = accuracy_tfidf_vec
            results['methods_names'] = key
            results['model_tfidf_vec'] = grid_pipeline_tfidf_vec
        print(f'{key} scores {accuracy_bin_vec}; {accuracy_count_vec}; {accuracy_tfidf_vec}')
    return results

In [24]:
%%time
res = preprocessCV(preproc_methods)

tokenize scores 0.8731778425655977; 0.8731778425655977; 0.8644314868804664
stemming scores 0.8615160349854227; 0.8615160349854227; 0.8600583090379009
lemmanization scores 0.8688046647230321; 0.8688046647230321; 0.8688046647230321
stemming_stop_words_remove scores 0.8513119533527697; 0.8513119533527697; 0.8454810495626822
misspelling scores 0.8746355685131195; 0.8746355685131195; 0.8702623906705539
lemmanization_misspelling scores 0.8717201166180758; 0.8717201166180758; 0.8615160349854227
CPU times: user 20min 19s, sys: 2min 13s, total: 22min 33s
Wall time: 6min 56s


In [25]:
res

{'accuracy_bin_vec': 0.8746355685131195,
 'methods_names': 'misspelling',
 'model_bin_vec': Pipeline(steps=[('vectorizer',
                  CountVectorizer(binary=True,
                                  tokenizer=<function preprocessCV.<locals>.<lambda> at 0x7f5c657dc3a0>)),
                 ('model',
                  <catboost.core.CatBoostClassifier object at 0x7f5c65728dc0>)]),
 'accuracy_count_vec': 0.8746355685131195,
 'model_count_vec': Pipeline(steps=[('vectorizer',
                  CountVectorizer(tokenizer=<function preprocessCV.<locals>.<lambda> at 0x7f5b2af50af0>)),
                 ('model',
                  <catboost.core.CatBoostClassifier object at 0x7f5c65728dc0>)]),
 'accuracy_tfidf_vec': 0.8702623906705539,
 'model_tfidf_vec': Pipeline(steps=[('vectorizer',
                  TfidfVectorizer(tokenizer=<function preprocessCV.<locals>.<lambda> at 0x7f5b2af50c10>)),
                 ('model',
                  <catboost.core.CatBoostClassifier object at 0x7f5c65728dc0

In [32]:
%%time
texts_diff_prep = {}
for key, methods in preproc_methods.items():
        tokenizer_method = get_tokenizer(methods)        
        tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: tokenizer_method(x), ngram_range=(1, 3), max_df=0.5, max_features=1000)
        texts_diff_prep[key] = tfidf_vectorizer.fit_transform(df['tweets'])

for key in texts_diff_prep.keys():
    cosine_sim_tweets = cosine_similarity(texts_diff_prep[key])

    cosine_sim_dict = {}
    for i in range(len(cosine_sim_tweets)):
        for j in range(len(cosine_sim_tweets)):
            if i > j:
                if cosine_sim_tweets[i][j] < 1. and cosine_sim_tweets[i][j] != 0.:
                    cosine_sim_dict[f'{cosine_sim_tweets[i][j]}'] = (i, j)
                    
    cosine_sim_val = list(cosine_sim_dict.keys())
    sorted_cosine_sim_val = np.sort(cosine_sim_val, kind='mergesort')
    
    for i, val in enumerate(sorted_cosine_sim_val[-10:]):
        print(f'cosines {val} for method {key}')

        left_cosine = df['tweets'].iloc[cosine_sim_dict[val][0]]
        right_cosine = df['tweets'].iloc[cosine_sim_dict[val][1]]
        print(f'{left_cosine}\n{right_cosine}')
    
    print('--------------------------------------------------------------------------------------------------')

cosines 0.9777584952731331 for method tokenize
share the love high value members of this week happy insight by
share the love high value members of this week happy
cosines 0.9783643251697532 for method tokenize
thanks for the recent follow happy to connect happy have a great thursday get free
thanks for the recent follow happy to connect happy have a great thursday get this
cosines 0.9789324358406472 for method tokenize
i miss so much unhappy 
i miss temperance brennan and seeley booth so much unhappy 
cosines 0.9802677095775874 for method tokenize
hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for us to cont
hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for cont
cosines 0.9834113250021594 for method tokenize
thanks for the recent follow much appreciated happy want this
thanks for the recent follow much appreciated happy want this for its magi

cosines 0.978367413534302 for method misspelling
thanks for the recent follow happy to connect happy have a great thursday get free
thanks for the recent follow happy to connect happy have a great thursday get this
cosines 0.978950756965837 for method misspelling
i miss so much unhappy 
i miss temperance brennan and seeley booth so much unhappy 
cosines 0.9803339716150034 for method misspelling
hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for us to cont
hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for cont
cosines 0.981725257365042 for method misspelling
share the love high value members of this week happy insight by
share the love high value members of this week happy
cosines 0.9833954950715944 for method misspelling
thanks for the recent follow much appreciated happy want this
thanks for the recent follow much appreciated happy want this 

In [16]:
import gensim.downloader as api
embeddings_pretrained = api.load('glove-twitter-25')

In [17]:
from gensim.models import Word2Vec

In [18]:
def preproc_nltk(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in tknzr.tokenize(text.lower()) if word not in stop_words])

In [19]:
proc_words = [preproc_nltk(text).split() for text in df['tweets']]
embeddings_trained = Word2Vec(proc_words, # data for model to train on
                 vector_size=100,                 # embedding vector size
                 min_count=3,             # consider words that occured at least 5 times
                 window=3).wv

In [20]:
def vectorize_sum(comment, embeddings):
    """
    implement a function that converts preprocessed comment to a sum of token vectors
    """
    embedding_dim = embeddings.vectors.shape[1]
    features = np.zeros([embedding_dim], dtype='float32')

    for word in preproc_nltk(comment).split():
        if word in embeddings:
            features += embeddings[f'{word}']
    
    return features

In [21]:
len(embeddings_trained.index_to_key)

1289

In [22]:
X_wv = np.stack([vectorize_sum(text, embeddings_pretrained) for text in df['tweets']])
X_train_wv, X_test_wv, y_train, y_test = train_test_split(X_wv, df['sentiments'], test_size=0.2, random_state=0)
X_train_wv.shape, X_test_wv.shape

((2740, 25), (686, 25))

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
clf = LogisticRegression(max_iter=5000)
wv_model = clf.fit(X_train_wv, y_train)
accuracy_score(y_test, wv_model.predict(X_test_wv))

0.8250728862973761

In [25]:
X_wv = np.stack([vectorize_sum(text, embeddings_trained) for text in df['tweets']])
X_train_wv, X_test_wv, y_train, y_test = train_test_split(X_wv, df['sentiments'], test_size=0.2, random_state=0)
X_train_wv.shape, X_test_wv.shape

((2740, 100), (686, 100))

In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
clf = LogisticRegression(max_iter=10000)
wv_model = clf.fit(X_train_wv, y_train)
accuracy_score(y_test, wv_model.predict(X_test_wv))

0.7842565597667639

In [28]:
clf = CatBoostClassifier(verbose=False)
wv_model = clf.fit(X_train_wv, y_train)
accuracy_score(y_test, wv_model.predict(X_test_wv))

0.8279883381924198

In [30]:
svc = svm.SVC()
wv_model = svc.fit(X_train_wv, y_train)
accuracy_score(y_test, wv_model.predict(X_test_wv))

0.8061224489795918

In [31]:
sgd = SGDClassifier()
wv_model = sgd.fit(X_train_wv, y_train)
accuracy_score(y_test, wv_model.predict(X_test_wv))

0.8017492711370262