In [70]:
from zipfile import ZipFile
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [71]:
# import cupy 
# import cudf

In [72]:
# Read the development and evaluation CSVs and stack them into one frame so the
# same preprocessing can be applied to both.
df_dev = pd.read_csv(Path.cwd()/'data'/'DSL2122_january_dataset'/'development.csv')
df_eval = pd.read_csv(Path.cwd()/'data'/'DSL2122_january_dataset'/'evaluation.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat with default arguments
# stacks the rows the same way (original row labels are kept, not renumbered).
df = pd.concat([df_dev, df_eval])

## Preprocessing

Removal of the `&amp;` HTML entity

In [73]:
def text_noAmpEnt(df, field_IN, field_OUT):
    """Strip every ``&amp;`` entity (matched case-insensitively) from a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is modified in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the cleaned strings (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be chained with ``DataFrame.pipe``.
    """
    source_text = df[field_IN]
    without_entity = source_text.str.replace('&amp;', '', case=False)
    df[field_OUT] = without_entity
    return df

Removal of the `&quot;` HTML entity

In [74]:
def text_noQuotEnt(df, field_IN, field_OUT):
    """Strip every ``&quot;`` entity (matched case-insensitively) from a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is modified in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the cleaned strings (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be chained with ``DataFrame.pipe``.
    """
    source_text = df[field_IN]
    without_entity = source_text.str.replace('&quot;', '', case=False)
    df[field_OUT] = without_entity
    return df

Removal of '@words'

In [75]:
def text_noAt(df, field_IN, field_OUT):
    """Drop every whitespace-separated token that begins with ``@``.

    The text is split on whitespace and re-joined with single spaces, so runs
    of whitespace are collapsed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is modified in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the filtered text (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for use with ``DataFrame.pipe``.
    """
    def _without_mentions(tokens):
        return [tok for tok in tokens if not tok.startswith('@')]

    token_lists = df[field_IN].str.split()
    kept_tokens = token_lists.apply(_without_mentions)
    df[field_OUT] = kept_tokens.str.join(' ')
    return df

Removal of '&words'

In [76]:
def text_noAmp(df, field_IN, field_OUT):
    """Drop every whitespace-separated token that begins with ``&``.

    The text is split on whitespace and re-joined with single spaces, so runs
    of whitespace are collapsed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is modified in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the filtered text (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for use with ``DataFrame.pipe``.
    """
    def _without_amp_tokens(tokens):
        return [tok for tok in tokens if not tok.startswith('&')]

    token_lists = df[field_IN].str.split()
    kept_tokens = token_lists.apply(_without_amp_tokens)
    df[field_OUT] = kept_tokens.str.join(' ')
    return df

Removal of 'http:words'

In [77]:
def text_noHttp(df, field_IN, field_OUT):
    """Drop every whitespace-separated token that begins with ``http``.

    The prefix check is case-sensitive and applies to any token with that
    prefix, not only well-formed URLs. The text is split on whitespace and
    re-joined with single spaces, so runs of whitespace are collapsed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is modified in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the filtered text (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for use with ``DataFrame.pipe``.
    """
    def _without_links(tokens):
        return [tok for tok in tokens if not tok.startswith('http')]

    token_lists = df[field_IN].str.split()
    kept_tokens = token_lists.apply(_without_links)
    df[field_OUT] = kept_tokens.str.join(' ')
    return df

Removal of repeated letters

In [78]:
import re 
def text_noDuplLetters(df, field_IN, field_OUT):
    """Collapse each run of a repeated character inside a token to one character.

    Any character counts, not only letters, and a run of two is already
    collapsed (``'goood'`` and ``'good'`` both become ``'god'``). The text is
    split on whitespace and re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is modified in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the squeezed text (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for use with ``DataFrame.pipe``.
    """
    repeated_run = re.compile(r'(.)\1+')

    def _squeeze(tokens):
        return [repeated_run.sub(r'\1', tok) for tok in tokens]

    token_lists = df[field_IN].str.split()
    squeezed = token_lists.apply(_squeeze)
    df[field_OUT] = squeezed.str.join(' ')
    return df


Remove punctuation

In [79]:
import string as py_string
def text_noPunctuation(df, field_IN, field_OUT):
    """Delete every character listed in ``string.punctuation`` from a text column.

    Characters are removed, not replaced by spaces, so ``"don't"`` becomes
    ``"dont"`` and ``"a.b"`` becomes ``"ab"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is modified in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the cleaned text (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for use with ``DataFrame.pipe``.
    """
    # Translation table whose only job is deleting the punctuation characters
    deletion_table = str.maketrans('', '', py_string.punctuation)
    cleaned = df[field_IN].str.translate(deletion_table)
    df[field_OUT] = cleaned
    return df

Stemming and lemmatization

In [96]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk import download as nltk_download
# Fetch the NLTK data packages used below (presumably required by
# WordNetLemmatizer -- confirm against the NLTK docs).
for _nltk_package in ('wordnet', 'omw-1.4'):
    nltk_download(_nltk_package)

def text_stem(df, field_IN, field_OUT, stemmer):
    """Stem every whitespace-separated token of a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is modified in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the stemmed text (may equal ``field_IN``).
    stemmer : type
        Stemmer class (not an instance). ``SnowballStemmer`` is instantiated
        with ``'english'``; any other class is instantiated with no arguments.
        The instance must expose ``stem(word)``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for use with ``DataFrame.pipe``.
    """
    # One stemmer instance is shared by all rows
    stemmer_to_use = SnowballStemmer('english') if stemmer == SnowballStemmer else stemmer()

    def _stem_tokens(tokens):
        return [stemmer_to_use.stem(tok) for tok in tokens]

    token_lists = df[field_IN].str.split()
    stemmed = token_lists.apply(_stem_tokens)
    df[field_OUT] = stemmed.str.join(' ')
    return df

def text_lemm(df, field_IN, field_OUT, lemmatizer):
    """Lemmatize every whitespace-separated token of a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is modified in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the lemmatized text (may equal ``field_IN``).
    lemmatizer : type
        Lemmatizer class (not an instance), instantiated with no arguments.
        The instance must expose ``lemmatize(word)``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for use with ``DataFrame.pipe``.
    """
    # Build the lemmatizer once per call; constructing a fresh instance for
    # every single word is loop-invariant work.
    lemmatizer_to_use = lemmatizer()
    df[field_OUT] = (
        df[field_IN]
        .str.split()
        .apply(lambda tokens: [lemmatizer_to_use.lemmatize(word) for word in tokens])
        .str.join(' ')
    )
    return df

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\edo_c\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\edo_c\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Negation

In [81]:
from nltk.sentiment.util import mark_negation
def text_neg(df, field_IN, field_OUT):
    """Run NLTK's ``mark_negation`` over the token list of each row.

    Each text is split on whitespace, the resulting token list is passed to
    ``mark_negation`` with its default arguments, and the returned tokens are
    re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is modified in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the marked text (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for use with ``DataFrame.pipe``.
    """
    token_lists = df[field_IN].str.split()
    marked = token_lists.apply(mark_negation)
    df[field_OUT] = marked.str.join(' ')
    return df

User manual filter

In [82]:
# df = df.loc[
#     (~(df['user'] == 'lost_dog') &
#     ~(df['user'] == 'webwoke') &
#     ~(df['user'] == 'tweetpet') &
#     ~(df['user'].str.contains('tweeteradder')) &
#     ~(df['user'].str.contains('tweetfollow')) &
#     ~(df['user'] == 'divxdownloads')) |
#     df['sentiment'].isna()
# ]
# df[df['sentiment'].isna()].shape

## Model creation

In [83]:
from sklearn.model_selection import ParameterGrid

# Candidate values for each LinearSVC hyper-parameter
linearSVC_params = dict(
    penalty=['l1', 'l2'],
    dual=[False],
    tol=[1e-2, 1e-3],
    fit_intercept=[False],
    class_weight=['balanced'],
    max_iter=[50, 100],
    random_state=[42],
    C=[1, 10],
)

# Candidate values for each TfidfVectorizer hyper-parameter
TfidfVectorizer_params = dict(
    stop_words=[None],
    ngram_range=[(1, 3)],
    max_features=[None],
    max_df=[1.0, 0.001],
    min_df=[1, 0.000001],
    binary=[True, False],
    norm=['l1', 'l2'],
    use_idf=[True],
    smooth_idf=[True, False],
    sublinear_tf=[False],
)

# Size of the joint grid = (vectorizer combinations) x (classifier combinations)
number_different_configurations = (
    len(ParameterGrid(TfidfVectorizer_params)) * len(ParameterGrid(linearSVC_params))
)

# Second value: presumably a run-time estimate in hours, assuming ~30 s per
# configuration -- TODO confirm what the 30 stands for.
print(number_different_configurations, number_different_configurations*30/60/60)

512 4.266666666666667


# Grid search

In [84]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import LinearSVC
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import f1_score, classification_report, confusion_matrix

# df_final = df
# # df_final['text_final'] = df_final['text_noPunct'].apply(lambda x : ' '.join(x))
# df_final['text_final'] = df_final['text_neg'].apply(lambda x : ' '.join(x))

# mask_train_test = df_final['sentiment'].notna()

# X_train_valid = df_final.loc[mask_train_test,:]['text_final'].values
# y_train_valid = df_final.loc[mask_train_test,:]['sentiment'].values
# X_test = df_final.loc[~mask_train_test,:]['text_final'].values

# X_train, X_valid, y_train, y_valid = train_test_split(
#     X_train_valid, 
#     y_train_valid, 
#     shuffle=True, 
#     # stratify=y_train_valid, 
#     train_size=0.9, 
#     random_state=42
# )

# vectorizer = 'tfidf'
# model = 'linearSVC'

# pipe = Pipeline([
#     (vectorizer, TfidfVectorizer()),
#     (model, LinearSVC())
# ])

# def params_for_GridSearchCV(params_IN,step_name):
#     return {f'{step_name}__{key}':value for (key,value) in params_IN.items()}

# params_all = params_for_GridSearchCV(TfidfVectorizer_params,vectorizer)
# params_all.update(params_for_GridSearchCV(linearSVC_params,model))
# print(params_all)

# gscv = GridSearchCV(pipe, params_all, cv = 3, verbose = 4, scoring='f1_macro', n_jobs=6)

# gscv.fit(X_train, y_train)

# f1 = f1_score(y_valid, gscv.predict(X_valid),average='macro')
# report = classification_report(y_valid, gscv.predict(X_valid))
# confusion = confusion_matrix(y_valid, gscv.predict(X_valid))

# print(f1)
# print(report)
# print(confusion)

# from pathlib import Path
# from datetime import datetime

# results_name = Path.cwd()/'gscv_results'/'linearSVC_final_results.csv'

# pd.DataFrame(gscv.cv_results_).to_csv(results_name)

In [85]:
# results = pd.read_csv('gscv_results\\linearSVC_final_results2.csv')
# print(results.loc[results['rank_test_score']==1]['params'].iloc[0])
# print(results.loc[results['rank_test_score']==1]['params'].iloc[1])

# Best models test

In [86]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import LinearSVC
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import f1_score, classification_report, confusion_matrix

# df_final = df
# # df_final['text_final'] = df_final['text_noPunct'].apply(lambda x : ' '.join(x))
# df_final['text_final'] = df_final['text_neg'].apply(lambda x : ' '.join(x))

# mask_train_test = df_final['sentiment'].notna()

# X_train_valid = df_final.loc[mask_train_test,:]['text_final'].values
# y_train_valid = df_final.loc[mask_train_test,:]['sentiment'].values
# X_test = df_final.loc[~mask_train_test,:]['text_final'].values

# X_train, X_valid, y_train, y_valid = train_test_split(
#     X_train_valid, 
#     y_train_valid, 
#     shuffle=True, 
#     # stratify=y_train_valid, 
#     train_size=0.9, 
#     random_state=42
# )

# vectorizer = TfidfVectorizer(
#     binary = True, 
#     max_df = 1.0, 
#     min_df = 1, 
#     max_features = None, 
#     ngram_range = (1,1), 
#     norm = 'l1', 
#     smooth_idf = True, 
#     stop_words = None, 
#     sublinear_tf = False, 
#     use_idf = True
# )

# model = LinearSVC(
#     C = 1, 
#     class_weight = 'balanced', 
#     dual = False, 
#     fit_intercept = False, 
#     max_iter = 100, 
#     penalty = 'l1', 
#     random_state = 42, 
#     tol = 0.001
# )

# pipe = Pipeline([
#     ('tfidf', vectorizer),
#     ('linearSVC', model)
# ])

# pipe.fit(X_train, y_train)

# f1 = f1_score(y_valid, pipe.predict(X_valid),average='macro')
# report = classification_report(y_valid, pipe.predict(X_valid))
# confusion = confusion_matrix(y_valid, pipe.predict(X_valid))

# print(f1)
# print(report)
# print(confusion)

# MODIFIED: norm and ngram_range

In [104]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer

# text_noAmpEnt
# text_noQuotEnt
# text_noAt
# text_noAmp
# text_noHttp
# text_noDuplLetters
# text_noPunctuation
# text_stem
# text_lemm
# text_neg

# Preprocess a deep copy so the raw `df` is left untouched: every text_* step
# writes its result into the frame it receives.
# NOTE(review): text_neg runs after text_noPunctuation, so mark_negation
# presumably sees no punctuation that could end a negation scope -- confirm
# this ordering is intended.
df_final = (
    df.copy(deep=True)
    .pipe(text_noAmpEnt, field_IN='text', field_OUT='text')
    .pipe(text_noQuotEnt, field_IN='text', field_OUT='text')
    .pipe(text_noAt, field_IN='text', field_OUT='text')
    .pipe(text_noHttp, field_IN='text', field_OUT='text')
    .pipe(text_noPunctuation, field_IN='text', field_OUT='text')
    .pipe(text_stem, field_IN='text', field_OUT='text', stemmer=SnowballStemmer)
    .pipe(text_neg, field_IN='text', field_OUT='text')
    # Alternatives tried in place of the Snowball stemmer:
    # .pipe(text_lemm, field_IN='text', field_OUT='text', lemmatizer=WordNetLemmatizer)
    # .pipe(text_stem, field_IN='text', field_OUT='text', stemmer=LancasterStemmer)
)

# Column consumed by the vectorizer
df_final['text_final'] = df_final['text']

# Rows with a sentiment label form the train/validation pool; rows without
# one are the test set to predict.
mask_train_test = df_final['sentiment'].notna()

# Select rows and column in a single .loc call instead of chaining
# .loc[mask, :][col], which first copies every column of the selected rows.
X_train_valid = df_final.loc[mask_train_test, 'text_final'].values
y_train_valid = df_final.loc[mask_train_test, 'sentiment'].values
X_test = df_final.loc[~mask_train_test, 'text_final'].values

# 90/10 shuffled hold-out split; fixed seed for reproducibility
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    shuffle=True,
    # stratify=y_train_valid,
    train_size=0.9,
    random_state=42
)

# Linear SVM classifier
model = LinearSVC(
    C=1,
    class_weight='balanced',
    dual=False,
    fit_intercept=False,
    max_iter=100,
    penalty='l2',
    random_state=42,
    tol=0.001,
)

# TF-IDF features over word 1- to 3-grams
vectorizer = TfidfVectorizer(
    binary=True,
    max_df=1.0,
    max_features=None,
    min_df=1,
    ngram_range=(1, 3),
    norm='l2',
    smooth_idf=False,
    stop_words=None,
    sublinear_tf=False,
    use_idf=True,
)

# Vectorizer feeds the classifier
pipeline_steps = [
    ('tfidf', vectorizer),
    ('linearSVC', model),
]
ml_pipe = Pipeline(pipeline_steps)

ml_pipe.fit(X_train, y_train)

# Predict the validation split once and reuse it for all three metrics
# instead of re-running the whole pipeline for each of them.
y_valid_pred = ml_pipe.predict(X_valid)

f1 = f1_score(y_valid, y_valid_pred, average='macro')
report = classification_report(y_valid, y_valid_pred)
confusion = confusion_matrix(y_valid, y_valid_pred)

print(f1)
print(report)
print(confusion)

0.806582931090972
              precision    recall  f1-score   support

         0.0       0.78      0.78      0.78      9507
         1.0       0.84      0.84      0.84     12993

    accuracy                           0.81     22500
   macro avg       0.81      0.81      0.81     22500
weighted avg       0.81      0.81      0.81     22500

[[ 7380  2127]
 [ 2120 10873]]


Manually removing users: 0.8019786529108555  
Without manually removing users: 0.8007152526552623

None : 0.7973008380157585  
text_noAmpEnt : 0.806213112745712  
text_noQuotEnt : 0.806582931090972  
text_noAt : 0.7998564194923328 -- skipped  
text_noAmp : 0.7994236544375262 -- skipped  
text_noHttp : 0.7997304233939981 -- skipped  
text_noDuplLetters : 0.8017853157058441 -- skipped  
text_noPunctuation : 0.8020861827824957 -- skipped  
text_stem :   
    Porter: 0.8014799266049696  
    Lancaster : 0.7974966469180449  
text_lemm : 0.797157501260997  
text_neg : 0.7973008380157585  

[  
    text_noAmpEnt  
    text_noQuotEnt  
] : 0.806582931090972  
  
[  
    text_noAmpEnt  
    text_noQuotEnt  
    text_noAt  
    text_noHttp  
    text_noPunctuation  
    text_lemm  
    text_neg  
] : 0.7982539044206275  
  
[  
    text_noAmpEnt  
    text_noQuotEnt  
    text_noAt  
    text_noHttp  
    text_noPunctuation  
    text_stem(Porter)  
    text_neg  
] : 0.8000375978408897  
  
[  
    text_noAmpEnt  
    text_noQuotEnt  
    text_noAt  
    text_noHttp  
    text_noPunctuation  
    text_stem(Lancaster)  
    text_neg  
] : 0.7961716118953209  
  
[  
    text_noAmpEnt  
    text_noQuotEnt  
    text_noAt  
    text_noHttp  
    text_noPunctuation  
    text_stem(Snowball)  
    text_neg  
] : 0.8006420407066549  

Best configuration

In [205]:
# Linear SVM classifier of the selected configuration
model = LinearSVC(
    C = 1, 
    class_weight = 'balanced', 
    dual = False, 
    fit_intercept = False, 
    max_iter = 50, 
    penalty = 'l2', 
    random_state = 42, 
    tol = 0.001
)

# TF-IDF features over word 1- to 3-grams
vectorizer = TfidfVectorizer(
    binary = True, 
    max_df = 1.0, 
    max_features = None, 
    min_df = 1, 
    ngram_range = (1,3), 
    norm = 'l2', 
    smooth_idf = False, 
    stop_words = None, 
    sublinear_tf = False, 
    use_idf = True
)

# Rebuild the pipeline from the objects defined above. Rebinding `model` and
# `vectorizer` alone leaves `ml_pipe` pointing at the previously built
# estimators, so the results-generation cell would not use this configuration.
ml_pipe = Pipeline([
    ('tfidf', vectorizer),
    ('linearSVC', model)
])

+ NO stemming: 0.7953890537516292
+ Snowball: 0.7994796670641988
+ Lancaster: 0.7934227929849921
+ Porter: 0.7982780155296589
+ Wordnet: 0.7942085265529459

Results generation

In [101]:
# Refit on every labelled row (train + validation) before predicting the test set
ml_pipe.fit(X_train_valid, y_train_valid)
y_pred = ml_pipe.predict(X_test)

# Write the submission file: an "Id" index column plus the integer label
pd.DataFrame(
    {'Predicted': y_pred.astype(int)}
).to_csv(
    "output_gram13_final.csv",
    index_label="Id",
    header=["Predicted"],
)