In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_palette('muted')
sns.set_color_codes('muted')
sns.set_style('white')

import warnings
warnings.filterwarnings('ignore')

In [2]:
%config InlineBackend.figure_format = 'retina'

Group: Bakalets D., Nesterov M., Shagieva Z. and Fomich A.

Competition and task description are available on [kaggle](https://www.kaggle.com/competitions/gsom-23sm1-ml-hometask-2).

## Dataset

In [3]:
train = pd.read_csv('corona_train.csv', encoding='ISO-8859-1', index_col=0)

In [4]:
test= pd.read_csv('corona_test.csv', encoding='ISO-8859-1', index_col=0)

In [5]:
train['type']='train'
test['type']='test'
test['Sentiment']=''

In [6]:
train=train.append(test)

## Check duplicates

In [7]:
train.drop_duplicates(subset='OriginalTweet').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44955 entries, 0 to 17981
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       44955 non-null  int64 
 1   ScreenName     44955 non-null  int64 
 2   Location       35531 non-null  object
 3   TweetAt        44955 non-null  object
 4   OriginalTweet  44955 non-null  object
 5   Sentiment      44955 non-null  object
 6   type           44955 non-null  object
dtypes: int64(2), object(5)
memory usage: 2.7+ MB


### Text correction

In [122]:
# Data cleaning 
import re

def tweet_cleaner(tweet):
    
    # remove urls
    tweet = re.sub(r'http\S+', ' ', tweet)
    
    # remove html tags
    #tweet = re.sub(r'<.*?>', ' ', tweet)
    
    # remove digits
    #tweet = re.sub(r'\d+', ' ', tweet)
    
    # remove hashtags
    tweet = re.sub(r'#\w+', ' ', tweet)
    
    # remove mentions
    tweet = re.sub(r'@\w+', ' ', tweet)
    
    # remove whitespaces
    tweet = ' '.join(tweet.split())

    return tweet
    
 
train['OriginalTweet'] = train['OriginalTweet'].apply(lambda x: tweet_cleaner(x)) 
train['CleanTweet'] = train['OriginalTweet'].apply(lambda x: x.replace('\n', ' '))
train['CleanTweet'] = train['CleanTweet'].str.lower()  

In [125]:
# lemmatization
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dariabakalets/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [126]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/dariabakalets/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [128]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dariabakalets/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [129]:
from nltk.corpus import wordnet
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [130]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dariabakalets/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [131]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def lemmatize_text(text):
    # Split the text into words
    words = nltk.word_tokenize(text)
    # Lemmatize each word and join them back into a string
    return ' '.join([wnl.lemmatize(word, get_wordnet_pos(word)) for word in words])

# Apply the lemmatization function to the text data
train['CleanTweet'] = train['CleanTweet'].apply(lemmatize_text)

In [133]:
def cleaning_repeating_char(text):
    return re.sub(r'(.)1+', r'1', text)
train['CleanTweet'] = train['CleanTweet'].apply(lambda x: cleaning_repeating_char(x))

### Model

In [135]:
X_train_text=train[train['type']=='train']['CleanTweet']
X_test_text=train[train['type']=='test']['CleanTweet']
y_train_text=train[train['type']=='train']['Sentiment']

#### Find the best params for pipeline

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
from sklearn.svm import SVC

params_grid = dict(
                   C=[1, 10, 100],
                   gamma=[0.1, 0.01, 0.001],
                   kernel=['linear', 'rbf'],
                   min_df=[.0001, .0005, .0007, .001, .005, .01], 
                   max_df=[.7, .75, .8, .85, .9])

results = []

for params in tqdm(ParameterGrid(params_grid)):
    pipe = Pipeline(steps=[('tf_idf_vec', TfidfVectorizer(token_pattern=r'[A-Za-z]{2,}',
                                                          max_df=params['max_df'],
                                                          min_df=params['min_df'],
                                                          stop_words='english')
                            ),
                           ('classifier', SVC(C=params['C'],
                                               gamma=params['gamma'],
                                               kernel=params['kernel']))
                           ])

    pipe.fit(X_train_text['CleanTweet'], y_train_text)

    pipe_preds_train = pipe.predict(X_train_text.CleanTweet)
    pipe_preds_test = pipe.predict(X_test_text.CleanTweet)

    results.append(dict(

        params=params,

        precision_train=precision_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),


        recall_train=recall_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),


        f1_train=f1_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),


        accuracy_train=accuracy_score(y_true=y_train_text, y_pred=pipe_preds_train),

    ))

In [None]:
results = pd.DataFrame(results)
results.sort_values('accuracy_train', ascending=False).head(10).style.bar(vmin=0, vmax=1)

Use the best params for final calculations 

In [138]:
from sklearn.svm import SVC
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
params_grid = dict(min_df=[0.0005],
                   max_df=[0.85],
                   C=[10],
                   gamma=[0.1],
                   kernel=['rbf'])

results = []

for params in tqdm(ParameterGrid(params_grid)):
    pipe = Pipeline(steps=[('tf_idf_vec', TfidfVectorizer(token_pattern=r'[A-Za-z]{2,}',
                                                          max_df=params['max_df'],
                                                          min_df=params['min_df'],
                                                          stop_words='english')
                            ),
                           ('classifier', SVC(C=params['C'],
                                               gamma=params['gamma'],
                                               kernel=params['kernel']))
                           ])

    pipe.fit(X_train_text, y_train_text)

    pipe_preds_train = pipe.predict(X_train_text)
    pipe_preds_test = pipe.predict(X_test_text)


100%|████████████████████████████████████████████| 1/1 [02:29<00:00, 149.34s/it]


In [None]:
results.append(dict(
params=params,
precision_train=precision_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
recall_train=recall_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
f1_train=f1_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
accuracy_train=accuracy_score(y_true=y_train_text, y_pred=pipe_preds_train),
))

In [141]:
pd.DataFrame(pipe_preds_test).to_csv('result_tweet.csv',sep=',')

In [143]:
results = pd.DataFrame(results)
results.sort_values('accuracy_train', ascending=False).head(10).style.bar(vmin=0, vmax=1)

Unnamed: 0,precision_train,recall_train,f1_train,accuracy_train
0,0.827942,0.811567,0.818528,0.814185
