In [42]:
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Data import

In [43]:
# Both CSV files live in the same dataset folder under the working directory.
dataset_dir = Path.cwd()/'data'/'DSL2122_january_dataset'
df_dev = pd.read_csv(dataset_dir/'development.csv')
df_eval = pd.read_csv(dataset_dir/'evaluation.csv')

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat stacks the two frames the same way: the original row labels are
# kept and columns missing from one frame are filled with NaN.
df = pd.concat([df_dev, df_eval])

# Preprocessing functions

Removal of the `&amp;` HTML entity

In [44]:
def text_noAmpEnt(df, field_IN, field_OUT):
    """Delete every '&amp;' entity (matched case-insensitively) from a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. The output column is written into this frame.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column receiving the cleaned text (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be chained with ``.pipe``.
    """
    source_text = df[field_IN]
    without_entity = source_text.str.replace('&amp;', '', case=False)
    df[field_OUT] = without_entity
    return df

Removal of the `&quot;` HTML entity

In [45]:
def text_noQuotEnt(df, field_IN, field_OUT):
    """Delete every '&quot;' entity (matched case-insensitively) from a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. The output column is written into this frame.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column receiving the cleaned text (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be chained with ``.pipe``.
    """
    source_text = df[field_IN]
    without_entity = source_text.str.replace('&quot;', '', case=False)
    df[field_OUT] = without_entity
    return df

Removal of '@words'

In [46]:
def text_noAt(df, field_IN, field_OUT):
    """Drop every whitespace-separated token that starts with '@'.

    The text is split on whitespace, the '@'-prefixed tokens are discarded and
    the remaining tokens are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. The output column is written into this frame.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column receiving the cleaned text (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be chained with ``.pipe``.
    """
    def _without_mentions(tokens):
        return [token for token in tokens if not token.startswith('@')]

    token_lists = df[field_IN].str.split()
    df[field_OUT] = token_lists.apply(_without_mentions).str.join(' ')
    return df

Removal of URLs (tokens starting with 'http')

In [47]:
def text_noHttp(df, field_IN, field_OUT):
    """Drop every whitespace-separated token that starts with 'http'.

    The prefix check is case-sensitive. Surviving tokens are re-joined with
    single spaces, so any run of whitespace in the input collapses to one space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. The output column is written into this frame.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column receiving the cleaned text (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be chained with ``.pipe``.
    """
    def _without_links(tokens):
        return [token for token in tokens if not token.startswith('http')]

    token_lists = df[field_IN].str.split()
    df[field_OUT] = token_lists.apply(_without_links).str.join(' ')
    return df

Removal of repeated letters

In [48]:
import re 
def text_noDuplLetters(df, field_IN, field_OUT):
    """Collapse every run of one repeated character into a single occurrence.

    The substitution is applied to each whitespace-separated token and affects
    any character, not only letters (e.g. 'goood' -> 'god', '!!!' -> '!').
    Tokens are re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. The output column is written into this frame.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column receiving the cleaned text (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be chained with ``.pipe``.
    """
    repeated_char = re.compile(r'(.)\1+')

    def _squeeze(tokens):
        return [repeated_char.sub(r'\1', token) for token in tokens]

    token_lists = df[field_IN].str.split()
    df[field_OUT] = token_lists.apply(_squeeze).str.join(' ')
    return df


Removal of punctuation

In [49]:
import string as py_string
def text_noPunctuation(df, field_IN, field_OUT):
    """Delete every character listed in ``string.punctuation`` from a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. The output column is written into this frame.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column receiving the cleaned text (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be chained with ``.pipe``.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(py_string.punctuation))
    stripped = df[field_IN].str.translate(deletion_table)
    df[field_OUT] = stripped
    return df

Item stemming

In [50]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

def text_stem(df, field_IN, field_OUT, stemmer):
    """Stem every whitespace-separated token of a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. The output column is written into this frame.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column receiving the stemmed text (may equal ``field_IN``).
    stemmer : class, callable or stemmer instance
        Either a stemmer class / zero-argument factory, which is instantiated
        here (``SnowballStemmer`` is built for 'english', anything else is
        called with no arguments), or an already-built object exposing a
        ``stem(word)`` method, which is used as is.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be chained with ``.pipe``.
    """
    if not isinstance(stemmer, type) and hasattr(stemmer, 'stem'):
        # A ready-made stemmer instance: nothing to construct.
        stemmer_to_use = stemmer
    elif stemmer == SnowballStemmer:
        # SnowballStemmer needs a language, unlike the other stemmer classes.
        stemmer_to_use = SnowballStemmer('english')
    else:
        stemmer_to_use = stemmer()

    def _stem_tokens(tokens):
        return [stemmer_to_use.stem(word) for word in tokens]

    df[field_OUT] = df[field_IN]\
        .str.split()\
        .apply(_stem_tokens)\
        .str.join(' ')
    return df

Word negations

In [51]:
from nltk.sentiment.util import mark_negation
def text_neg(df, field_IN, field_OUT):
    """Pass each row's token list through NLTK's ``mark_negation``.

    The text is split on whitespace, the resulting token list is handed to
    ``mark_negation`` and whatever tokens it returns are re-joined with single
    spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. The output column is written into this frame.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column receiving the marked text (may equal ``field_IN``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be chained with ``.pipe``.
    """
    token_lists = df[field_IN].str.split()
    marked_lists = token_lists.apply(mark_negation)
    df[field_OUT] = marked_lists.str.join(' ')
    return df

User suspiciousness

In [52]:
from sklearn.preprocessing import minmax_scale
# Mean sentiment label of each user over the development tweets
# (a Series indexed by user and named 'sentiment').
average_user_sentiment = df_dev.groupby('user')['sentiment'].mean()

# Distance of that mean from 0.5, rescaled to [0, 1] across all users.
extreme_sentiment = (0.5 - average_user_sentiment).abs()
extreme_sentiment = pd.Series(
    minmax_scale(extreme_sentiment.to_numpy()),
    index=extreme_sentiment.index,
)
print(f'extreme_sentiment: {extreme_sentiment.max(), extreme_sentiment.min()}')

extreme_sentiment: (1.0, 0.0)


In [53]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts for every tweet of the combined frame.
countvect = CountVectorizer()
df_test = df.copy(deep=True)
X_count = countvect.fit_transform(df_test['text'])

from sklearn.metrics.pairwise import cosine_similarity

# Row positions of each user's tweets, collected in one pass over the frame.
# Building a full-length boolean mask per user would scan every row once per
# user, i.e. O(users x rows). Rows with a missing user are skipped by groupby.
rows_by_user = df_test.groupby('user', sort=False).indices

# For each user: mean cosine similarity between all pairs of distinct tweets.
user_similarity = {}
for user, row_positions in rows_by_user.items():
    if len(row_positions) < 2:
        # A single tweet has no other tweet to compare with. The score is NaN,
        # which is what nanmean over the all-NaN 1x1 matrix would return, but
        # without the "Mean of empty slice" RuntimeWarning.
        user_similarity[user] = np.nan
        continue
    X_similarity_user = cosine_similarity(X_count[row_positions, :])
    # Mask the self-similarities on the diagonal so they do not inflate the mean.
    np.fill_diagonal(X_similarity_user, np.nan)
    user_similarity[user] = np.nanmean(X_similarity_user)

# Rescale the per-user scores to [0, 1].
user_similarity = pd.Series(user_similarity)
user_similarity = pd.Series(minmax_scale(user_similarity.values),index=user_similarity.index)
print(f'user_similarity: {user_similarity.max(), user_similarity.min()}')

user_similarity: (1.0, 0.0)


In [54]:
# Product of the two rescaled per-user scores, aligned on the user index.
# A user present in only one of the two Series gets NaN.
user_suspiciousness = extreme_sentiment.mul(user_similarity)
print(f'user_suspiciousness: {user_suspiciousness.max(), user_suspiciousness.min()}')

user_suspiciousness: (1.0, 0.0)


# Hyperparameter tuning

## Configurations

In [57]:
from sklearn.model_selection import ParameterGrid

# Candidate values per hyperparameter. Each dict maps a constructor argument
# of the corresponding estimator to the list of values explored by the grid
# search. A single-element list pins that argument to one value.

# MultinomialNB
multinomialNB_params = dict(
    alpha=[0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0],
)

# LinearSVC
linearSVC_params = dict(
    penalty=['l1', 'l2'],
    dual=[False],
    tol=[1e-2, 1e-3],
    fit_intercept=[False],
    class_weight=['balanced'],
    max_iter=[100],
    random_state=[42],
    C=[1, 10],
)

# TfidfVectorizer
TfidfVectorizer_params = dict(
    stop_words=[None, 'english'],
    ngram_range=[(1, 1), (1, 2), (1, 3)],
    max_features=[None, 20000],
    max_df=[1.0],
    min_df=[1],
    binary=[True, False],
    norm=['l1', 'l2'],
    use_idf=[True],
    smooth_idf=[True, False],
    sublinear_tf=[False],
)

Helper that prefixes every parameter name with the name of its pipeline step (`<step>__<parameter>`), so that a parameter grid can be used in a grid search over a pipeline

In [None]:
def params_for_GridSearchCV(params_IN,step_name):
    """Prefix every key of a parameter grid with ``'<step_name>__'``.

    Parameters
    ----------
    params_IN : dict
        Maps parameter names to their candidate values.
    step_name : str
        Name of the pipeline step the parameters belong to.

    Returns
    -------
    dict
        New dict with keys of the form ``f'{step_name}__{key}'`` and the same
        value objects as ``params_IN``, in the same order.
    """
    prefixed = {}
    for param_name, candidates in params_IN.items():
        prefixed[f'{step_name}__{param_name}'] = candidates
    return prefixed

## Grid search

### Linear SVC

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

df_final = df.copy(deep=True)

# ---- REMOVAL OF RECORDS WITH DUPLICATE IDS ---- #
# Ids that occur more than once in the development set. Every row carrying
# such an id is dropped, unless its sentiment is missing (those rows are kept).
duplicated_ids = df_dev['ids'].value_counts()
duplicated_ids = duplicated_ids[duplicated_ids > 1]
df_final = df_final.loc[
    ~(df_final['ids'].isin(list(duplicated_ids.index))) |
    df_final['sentiment'].isna()
]

# ---- REMOVAL OF BOTS ---- #
# Rows of users whose suspiciousness exceeds 0.9 are dropped, again keeping
# every row with a missing sentiment. The explicit copy makes df_final an
# independent frame, so the in-place column assignments done by the text_*
# helpers below do not act on a slice of another frame.
df_final = df_final.loc[
    ~(df_final['user'].isin(user_suspiciousness[user_suspiciousness>.9].index)) |
    df_final['sentiment'].isna()
].copy()

# ---- OTHER PREPROCESSING STEPS ---- #
# Each step reads and overwrites the 'text' column. The order matters.
df_final = (
    df_final
    .pipe(text_noAmpEnt, field_IN='text', field_OUT='text')
    .pipe(text_noQuotEnt, field_IN='text', field_OUT='text')
    .pipe(text_noAt, field_IN='text', field_OUT='text')
    .pipe(text_noHttp, field_IN='text', field_OUT='text')
    .pipe(text_noDuplLetters, field_IN='text', field_OUT='text')
    .pipe(text_noPunctuation, field_IN='text', field_OUT='text')
    .pipe(text_stem, field_IN='text', field_OUT='text', stemmer=SnowballStemmer)
    .pipe(text_neg, field_IN='text', field_OUT='text')
)

df_final['text_final'] = df_final['text']

# ---- TRAIN - VALIDATION - TEST SPLIT ---- #
# Rows with a sentiment form the train/validation pool; rows without one form
# the test set.
mask_train_test = df_final['sentiment'].notna()

# Row mask and column are selected in a single .loc call (no chained indexing).
X_train_valid = df_final.loc[mask_train_test, 'text_final'].values
y_train_valid = df_final.loc[mask_train_test, 'sentiment'].values
X_test = df_final.loc[~mask_train_test, 'text_final'].values

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    shuffle=True,
    train_size=0.9,
    random_state=42
)

# ---- PIPELINE DEFINITION ---- #
# The step names double as prefixes of the grid-search parameter names.
vectorizer = 'tfidf'
model = 'linearSVC'

pipe = Pipeline([
    (vectorizer, TfidfVectorizer()),
    (model, LinearSVC())
])

# ---- GRID SEARCH ---- #
params_all = params_for_GridSearchCV(TfidfVectorizer_params,vectorizer)
params_all.update(params_for_GridSearchCV(linearSVC_params,model))
gscv = GridSearchCV(pipe, params_all, cv = 3, verbose = 4, scoring='f1_macro', n_jobs=4)

gscv.fit(X_train, y_train)

# ---- BEST MODEL EVALUATION ---- #
# Predict the validation set once and reuse the predictions for all metrics.
y_valid_pred = gscv.predict(X_valid)
f1 = f1_score(y_valid, y_valid_pred, average='macro')
report = classification_report(y_valid, y_valid_pred)
confusion = confusion_matrix(y_valid, y_valid_pred)

print(f1)
print(report)
print(confusion)

# ---- GRID SEARCH RESULTS TO CSV ---- #
results_name = Path.cwd()/'linearSVC_results.csv'
pd.DataFrame(gscv.cv_results_).to_csv(results_name)

### Multinomial NB 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

df_final = df.copy(deep=True)

# ---- REMOVAL OF RECORDS WITH DUPLICATE IDS ---- #
# Ids that occur more than once in the development set. Every row carrying
# such an id is dropped, unless its sentiment is missing (those rows are kept).
duplicated_ids = df_dev['ids'].value_counts()
duplicated_ids = duplicated_ids[duplicated_ids > 1]
df_final = df_final.loc[
    ~(df_final['ids'].isin(list(duplicated_ids.index))) |
    df_final['sentiment'].isna()
]

# ---- REMOVAL OF BOTS ---- #
# Rows of users whose suspiciousness exceeds 0.9 are dropped, again keeping
# every row with a missing sentiment. The explicit copy makes df_final an
# independent frame, so the in-place column assignments done by the text_*
# helpers below do not act on a slice of another frame.
df_final = df_final.loc[
    ~(df_final['user'].isin(user_suspiciousness[user_suspiciousness>.9].index)) |
    df_final['sentiment'].isna()
].copy()

# ---- OTHER PREPROCESSING STEPS ---- #
# Each step reads and overwrites the 'text' column. The order matters.
df_final = (
    df_final
    .pipe(text_noAmpEnt, field_IN='text', field_OUT='text')
    .pipe(text_noQuotEnt, field_IN='text', field_OUT='text')
    .pipe(text_noAt, field_IN='text', field_OUT='text')
    .pipe(text_noHttp, field_IN='text', field_OUT='text')
    .pipe(text_noDuplLetters, field_IN='text', field_OUT='text')
    .pipe(text_noPunctuation, field_IN='text', field_OUT='text')
    .pipe(text_stem, field_IN='text', field_OUT='text', stemmer=SnowballStemmer)
    .pipe(text_neg, field_IN='text', field_OUT='text')
)

df_final['text_final'] = df_final['text']

# ---- TRAIN - VALIDATION - TEST SPLIT ---- #
# Rows with a sentiment form the train/validation pool; rows without one form
# the test set.
mask_train_test = df_final['sentiment'].notna()

# Row mask and column are selected in a single .loc call (no chained indexing).
X_train_valid = df_final.loc[mask_train_test, 'text_final'].values
y_train_valid = df_final.loc[mask_train_test, 'sentiment'].values
X_test = df_final.loc[~mask_train_test, 'text_final'].values

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    shuffle=True,
    train_size=0.9,
    random_state=42
)

# ---- PIPELINE DEFINITION ---- #
# The step names double as prefixes of the grid-search parameter names.
vectorizer = 'tfidf'
model = 'multinomialNB'

pipe = Pipeline([
    (vectorizer, TfidfVectorizer()),
    (model, MultinomialNB())
])

# ---- GRID SEARCH ---- #
params_all = params_for_GridSearchCV(TfidfVectorizer_params,vectorizer)
params_all.update(params_for_GridSearchCV(multinomialNB_params,model))
gscv = GridSearchCV(pipe, params_all, cv = 3, verbose = 4, scoring='f1_macro', n_jobs=4)

gscv.fit(X_train, y_train)

# ---- BEST MODEL EVALUATION ---- #
# Predict the validation set once and reuse the predictions for all metrics.
y_valid_pred = gscv.predict(X_valid)
f1 = f1_score(y_valid, y_valid_pred, average='macro')
report = classification_report(y_valid, y_valid_pred)
confusion = confusion_matrix(y_valid, y_valid_pred)

print(f1)
print(report)
print(confusion)

# ---- GRID SEARCH RESULTS TO CSV ---- #
results_name = Path.cwd()/'multinomialNB.csv'
pd.DataFrame(gscv.cv_results_).to_csv(results_name)

# Best models test

In [21]:
from pathlib import Path
# Reload the grid-search results saved for LinearSVC and display the parameter
# set of the first configuration ranked 1 on the cross-validation score.
results = pd.read_csv(Path.cwd()/'linearSVC_results.csv')
results.loc[results['rank_test_score'].eq(1), 'params'].iloc[0]

"{'linearSVC__C': 1, 'linearSVC__class_weight': 'balanced', 'linearSVC__dual': False, 'linearSVC__fit_intercept': False, 'linearSVC__max_iter': 100, 'linearSVC__penalty': 'l2', 'linearSVC__random_state': 42, 'linearSVC__tol': 0.001, 'tfidf__binary': True, 'tfidf__max_df': 1.0, 'tfidf__max_features': None, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 3), 'tfidf__norm': 'l2', 'tfidf__smooth_idf': False, 'tfidf__stop_words': None, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True}"

In [56]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

df_final = df.copy(deep=True)

# ---- REMOVAL OF RECORDS WITH DUPLICATE IDS ---- #
# Ids that occur more than once in the development set. Every row carrying
# such an id is dropped, unless its sentiment is missing (those rows are kept).
duplicated_ids = df_dev['ids'].value_counts()
duplicated_ids = duplicated_ids[duplicated_ids > 1]
df_final = df_final.loc[
    ~(df_final['ids'].isin(list(duplicated_ids.index))) |
    df_final['sentiment'].isna()
]

# ---- REMOVAL OF BOTS ---- #
# Rows of users whose suspiciousness exceeds 0.9 are dropped, again keeping
# every row with a missing sentiment. The explicit copy makes df_final an
# independent frame, so the in-place column assignments done by the text_*
# helpers below do not act on a slice of another frame.
df_final = df_final.loc[
    ~(df_final['user'].isin(user_suspiciousness[user_suspiciousness>.9].index)) |
    df_final['sentiment'].isna()
].copy()

# ---- OTHER PREPROCESSING STEPS ---- #
# Each step reads and overwrites the 'text' column. The order matters.
df_final = (
    df_final
    .pipe(text_noAmpEnt, field_IN='text', field_OUT='text')
    .pipe(text_noQuotEnt, field_IN='text', field_OUT='text')
    .pipe(text_noAt, field_IN='text', field_OUT='text')
    .pipe(text_noHttp, field_IN='text', field_OUT='text')
    .pipe(text_noDuplLetters, field_IN='text', field_OUT='text')
    .pipe(text_noPunctuation, field_IN='text', field_OUT='text')
    .pipe(text_stem, field_IN='text', field_OUT='text', stemmer=SnowballStemmer)
    .pipe(text_neg, field_IN='text', field_OUT='text')
)

df_final['text_final'] = df_final['text']

# ---- TRAIN - VALIDATION - TEST SPLIT ---- #
# Rows with a sentiment form the train/validation pool; rows without one form
# the test set.
mask_train_test = df_final['sentiment'].notna()

# Row mask and column are selected in a single .loc call (no chained indexing).
X_train_valid = df_final.loc[mask_train_test, 'text_final'].values
y_train_valid = df_final.loc[mask_train_test, 'sentiment'].values
X_test = df_final.loc[~mask_train_test, 'text_final'].values

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    shuffle=True,
    # stratify=y_train_valid,  (stratification is disabled)
    train_size=0.9,
    random_state=42
)

# ---- PREPROCESS AND MODEL CONFIGURATION ---- #
# Settings copied from the best-ranked row of the LinearSVC grid search.
model = LinearSVC(
    C = 1,
    class_weight = 'balanced',
    dual = False,
    fit_intercept = False,
    max_iter = 100,
    penalty = 'l2',
    random_state = 42,
    tol = 0.001
)

vectorizer = TfidfVectorizer(
    binary = True,
    max_df = 1.0,
    max_features = None,
    min_df = 1,
    ngram_range = (1,3),
    norm = 'l2',
    smooth_idf = False,
    stop_words = None,
    sublinear_tf = False,
    use_idf = True
)

# ---- PIPELINE DEFINITION ---- #
ml_pipe = Pipeline([
    ('tfidf', vectorizer),
    ('linearSVC', model)
])

ml_pipe.fit(X_train, y_train)

# Predict the validation set once and reuse the predictions for all metrics.
y_valid_pred = ml_pipe.predict(X_valid)
f1 = f1_score(y_valid, y_valid_pred, average='macro')
report = classification_report(y_valid, y_valid_pred)
confusion = confusion_matrix(y_valid, y_valid_pred)

print(f1)
print(report)
print(confusion)

0.8038680794656297
              precision    recall  f1-score   support

         0.0       0.78      0.78      0.78      9559
         1.0       0.83      0.83      0.83     12727

    accuracy                           0.81     22286
   macro avg       0.80      0.80      0.80     22286
weighted avg       0.81      0.81      0.81     22286

[[ 7412  2147]
 [ 2135 10592]]


# Results csv export

In [68]:
# Refit the pipeline on the whole train+validation pool, then predict the
# rows that have no sentiment.
ml_pipe.fit(X_train_valid, y_train_valid)
y_pred = ml_pipe.predict(X_test)

# One integer prediction per row. The default 0..n-1 row index is written
# as the 'Id' column, giving the header line "Id,Predicted".
submission = pd.DataFrame({'Predicted': y_pred.astype(int)})
submission.to_csv("output.csv", index_label="Id")