In [45]:
from zipfile import ZipFile
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [46]:
# import cupy 
# import cudf

In [47]:
# Development and evaluation splits live side by side in the dataset folder.
DATASET_DIR = Path.cwd()/'data'/'DSL2122_january_dataset'
df_dev = pd.read_csv(DATASET_DIR/'development.csv')
df_eval = pd.read_csv(DATASET_DIR/'evaluation.csv')
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the rows the same
# way: original row labels are kept and columns missing from one frame become NaN
# (later cells treat rows whose 'sentiment' is NaN as the test set).
df = pd.concat([df_dev, df_eval])

## Preprocessing

Removal of the '&amp;' HTML entity

In [48]:
def text_noAmpEnt(df, field_IN, field_OUT):
    """Strip every '&amp;' entity (in any letter case) from a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; the output column is written into it in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the cleaned text (may equal field_IN).

    Returns
    -------
    pandas.DataFrame
        The same frame object, so the call can be chained through ``.pipe``.
    """
    source_text = df[field_IN]
    cleaned_text = source_text.str.replace('&amp;', '', case=False)
    df[field_OUT] = cleaned_text
    return df

Removal of the '&quot;' HTML entity

In [49]:
def text_noQuotEnt(df, field_IN, field_OUT):
    """Delete all '&quot;' entities, matched case-insensitively, from a text column.

    The cleaned strings are stored in ``df[field_OUT]`` (which may be the same
    column as ``field_IN``) and the very same DataFrame is returned so that the
    function can be used as a ``.pipe`` step.
    """
    entity = '&quot;'
    without_entity = df[field_IN].str.replace(entity, '', case=False)
    df[field_OUT] = without_entity
    return df

Removal of '@words'

In [50]:
def text_noAt(df, field_IN, field_OUT):
    """Drop every whitespace-separated token that begins with '@'.

    The text in ``df[field_IN]`` is split on whitespace, tokens starting with
    '@' are discarded, and the survivors are re-joined with single spaces into
    ``df[field_OUT]``. The frame is modified in place and returned.
    """
    def _is_kept(token):
        return not token.startswith('@')

    token_lists = df[field_IN].str.split()
    filtered_lists = token_lists.apply(lambda tokens: list(filter(_is_kept, tokens)))
    df[field_OUT] = filtered_lists.str.join(' ')
    return df

Removal of '&words'

In [51]:
def text_noAmp(df, field_IN, field_OUT):
    """Remove the whitespace-separated tokens that start with '&'.

    Reads ``df[field_IN]``, writes the filtered text (tokens re-joined with a
    single space) to ``df[field_OUT]`` and returns the same, mutated, frame.
    """
    def _strip_amp_tokens(tokens):
        kept = []
        for token in tokens:
            if token.startswith('&'):
                continue
            kept.append(token)
        return kept

    tokenized = df[field_IN].str.split()
    df[field_OUT] = tokenized.apply(_strip_amp_tokens).str.join(' ')
    return df

Removal of 'http:words'

In [52]:
def text_noHttp(df, field_IN, field_OUT):
    """Discard whitespace-separated tokens whose first four characters are 'http'.

    The remaining tokens of each ``df[field_IN]`` entry are joined back with
    single spaces and stored in ``df[field_OUT]``; the frame itself is returned.
    """
    prefix = 'http'
    words_per_row = df[field_IN].str.split()
    kept_per_row = words_per_row.apply(
        lambda words: [word for word in words if word.startswith(prefix) is False]
    )
    df[field_OUT] = kept_per_row.str.join(' ')
    return df

Removal of repeated letters

In [53]:
import re
def text_noDuplLetters(df, field_IN, field_OUT, max_repeat=1):
    """Shorten runs of a repeated character inside every whitespace-separated word.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; the output column is written into it in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the result (may equal field_IN).
    max_repeat : int, default 1
        Longest run of one character that is left untouched; longer runs are cut
        down to this length. The default reproduces the original behaviour, in
        which every run collapses to a single character ('goood' -> 'god', but
        also 'good' -> 'god'). Pass 2 to keep legitimate double letters
        ('goood' -> 'good', 'good' stays 'good').

    Returns
    -------
    pandas.DataFrame
        The same frame object, so the call can be chained through ``.pipe``.

    Raises
    ------
    ValueError
        If ``max_repeat`` is smaller than 1.
    """
    if max_repeat < 1:
        raise ValueError('max_repeat must be >= 1')

    # Compiled once, outside the per-word loop: a character followed by at least
    # `max_repeat` further copies of itself, i.e. a run longer than `max_repeat`.
    run_pattern = re.compile(r'(.)\1{%d,}' % max_repeat)

    def _shorten(match):
        return match.group(1) * max_repeat

    df[field_OUT] = df[field_IN]\
        .str.split()\
        .apply(lambda words : [run_pattern.sub(_shorten, word) for word in words])\
        .str.join(' ')
    return df


Remove punctuation

In [54]:
import string as py_string
def text_noPunctuation(df, field_IN, field_OUT):
    """Delete every character listed in ``string.punctuation`` from a text column.

    The cleaned strings go to ``df[field_OUT]`` (which may be ``field_IN``
    itself); the frame is modified in place and returned for ``.pipe`` chaining.
    """
    # Translation table that maps each punctuation character to "delete".
    deletion_table = str.maketrans('', '', py_string.punctuation)
    raw_text = df[field_IN]
    df[field_OUT] = raw_text.str.translate(deletion_table)
    return df

Stemming and lemmatization

In [55]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk import download as nltk_download
# Fetch the 'wordnet' and 'omw-1.4' NLTK packages (a no-op when they are already
# up to date). Presumably these are the corpora WordNetLemmatizer relies on when
# it is passed to text_lemm below -- TODO confirm.
nltk_download('wordnet')
nltk_download('omw-1.4')

def text_stem(df, field_IN, field_OUT, stemmer):
    """Stem each whitespace-separated word of a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; the output column is written into it in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the stemmed text.
    stemmer : class
        Stemmer class (not an instance) exposing ``stem(word)``.
        ``SnowballStemmer`` is constructed with the 'english' language; any
        other class is instantiated without arguments.

    Returns
    -------
    pandas.DataFrame
        The same frame object, so the call can be chained through ``.pipe``.
    """
    stemmer_to_use = SnowballStemmer('english') if stemmer == SnowballStemmer else stemmer()

    def _stem_words(words):
        return list(map(stemmer_to_use.stem, words))

    token_lists = df[field_IN].str.split()
    df[field_OUT] = token_lists.apply(_stem_words).str.join(' ')
    return df

def text_lemm(df, field_IN, field_OUT, lemmatizer):
    """Lemmatize each whitespace-separated word of a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; the output column is written into it in place.
    field_IN : str
        Name of the column to read.
    field_OUT : str
        Name of the column that receives the lemmatized text.
    lemmatizer : class
        Lemmatizer class (not an instance), instantiated without arguments and
        exposing ``lemmatize(word)`` -- e.g. ``WordNetLemmatizer``.

    Returns
    -------
    pandas.DataFrame
        The same frame object, so the call can be chained through ``.pipe``.
    """
    # Build the lemmatizer a single time; previously a fresh instance was
    # created for every word of every row.
    lemmatizer_to_use = lemmatizer()
    df[field_OUT] = df[field_IN]\
        .str.split()\
        .apply(lambda x : [lemmatizer_to_use.lemmatize(word) for word in x])\
        .str.join(' ')
    return df

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\edo_c\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\edo_c\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Negation

In [56]:
from nltk.sentiment.util import mark_negation
def text_neg(df, field_IN, field_OUT):
    """Apply NLTK's ``mark_negation`` to the whitespace tokens of a text column.

    Each entry of ``df[field_IN]`` is split on whitespace, the token list is
    passed to ``mark_negation`` and the returned tokens are re-joined with
    single spaces into ``df[field_OUT]``. The frame is modified in place and
    returned.

    NOTE(review): ``mark_negation`` presumably tags the words that follow a
    negation (NLTK documents a ``_NEG`` suffix) -- confirm against the NLTK docs.
    """
    token_lists = df[field_IN].str.split()
    marked_lists = token_lists.apply(mark_negation)
    df[field_OUT] = marked_lists.str.join(' ')
    return df

User manual filter

In [57]:
# df = df.loc[
#     (~(df['user'] == 'lost_dog') &
#     ~(df['user'] == 'webwoke') &
#     ~(df['user'] == 'tweetpet') &
#     ~(df['user'].str.contains('tweeteradder')) &
#     ~(df['user'].str.contains('tweetfollow')) &
#     ~(df['user'] == 'divxdownloads')) |
#     df['sentiment'].isna()
# ]
# df[df['sentiment'].isna()].shape

## Model creation

In [64]:
from sklearn.model_selection import ParameterGrid

# Smoothing values explored for the MultinomialNB classifier.
multinomialNB_params = dict(
    alpha=[.01, .02, .05, .1, .2, .5, 1.0, 2.0],
)

# Candidate settings for the TF-IDF vectorizer; single-element lists pin a value.
TfidfVectorizer_params = dict(
    stop_words=[None, 'english'],
    ngram_range=[(1, 1), (1, 2), (1, 3)],
    max_features=[None, 20000],
    max_df=[1.0],
    min_df=[1],
    binary=[True, False],
    norm=['l1', 'l2'],
    use_idf=[True],
    smooth_idf=[True, False],
    sublinear_tf=[False],
)

# Size of the full search space: every vectorizer setting paired with every
# classifier setting.
vectorizer_grid_size = len(list(ParameterGrid(TfidfVectorizer_params)))
classifier_grid_size = len(list(ParameterGrid(multinomialNB_params)))
number_different_configurations = vectorizer_grid_size*classifier_grid_size

# Second figure: presumably a rough runtime estimate in hours, assuming about
# 30 seconds per configuration -- TODO confirm.
print(number_different_configurations, number_different_configurations*30/60/60)

768 6.4


# User suspiciousness

average_user_sentiment and extreme_sentiment

In [59]:
from sklearn.preprocessing import minmax_scale
# Mean sentiment of each user's tweets in the development set.
average_user_sentiment = df_dev.groupby('user')['sentiment'].mean()
print(average_user_sentiment.loc[average_user_sentiment.index == 'lost_dog'])

# Distance of each user's mean from 0.5, min-max rescaled to [0, 1]: users whose
# tweets all carry the same sentiment end up closest to 1.
distance_from_half = (average_user_sentiment - 0.5).abs()
extreme_sentiment = pd.Series(
    minmax_scale(distance_from_half.values),
    index=distance_from_half.index,
)
print(extreme_sentiment.loc[extreme_sentiment.index == 'lost_dog'])

extreme_sentiment.max(), extreme_sentiment.min()

user
lost_dog    0.0
Name: sentiment, dtype: float64
user
lost_dog    1.0
dtype: float64


(1.0, 0.0)

tweet_per_user

In [60]:
from sklearn.preprocessing import minmax_scale
# Number of development-set tweets written by each user, min-max rescaled to [0, 1].
# (Alternative that was tried: sklearn's `normalize(..., 'max')` on the raw counts.)
tweet_counts = df_dev['user'].value_counts()
tweet_per_user = pd.Series(
    minmax_scale(tweet_counts.values),
    index=tweet_counts.index,
)

tweet_per_user.max(), tweet_per_user.min()

(1.0, 0.0)

user_similarity

In [61]:
from sklearn.preprocessing import minmax_scale
# Per-user 'similarity' values read from a precomputed CSV, min-max rescaled to [0, 1].
similarity_table = pd.read_csv('internal_similarity_users.csv').set_index('user')
raw_similarity = similarity_table['similarity']
user_similarity = pd.Series(
    minmax_scale(raw_similarity.values),
    index=raw_similarity.index,
)

user_similarity.max(), user_similarity.min()

(1.0, 0.0)

user_suspiciousness

In [62]:
# Suspiciousness of a user = extreme-sentiment score x similarity score
# (both already scaled to [0, 1]; the product is aligned on the user index).
user_suspiciousness = extreme_sentiment.mul(user_similarity)

# Sanity check on one known account: show both factors and their product.
for score in (extreme_sentiment, user_similarity, user_suspiciousness):
    print(score[score.index == 'lost_dog'])


user
lost_dog    1.0
dtype: float64
user
lost_dog    0.888044
dtype: float64
user
lost_dog    0.888044
dtype: float64


# Grid search

In [63]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer

df_final = df.copy(deep=True)

# ---- REMOVAL OF RECORDS WITH DUPLICATE IDS ---- #
# Rows whose 'ids' value occurs more than once in the development set are dropped;
# rows without a sentiment label are always kept.
duplicated_ids = df_dev['ids'].value_counts()
duplicated_ids = duplicated_ids[duplicated_ids > 1] 
df_final = df_final.loc[
    ~(df_final['ids'].isin(list(duplicated_ids.index))) | 
    df_final['sentiment'].isna()
]

# ---- REMOVAL OF BOTS ---- #
# Labelled rows written by users with a suspiciousness score above 0.9 are dropped.
# user_similarity = pd.Series(pd.read_csv('internal_similarity_users.csv').set_index('user')['similarity'])
df_final = df_final.loc[
    ~(df_final['user'].isin(user_suspiciousness[user_suspiciousness>.9].index)) | 
    df_final['sentiment'].isna()
]

# ---- Preprocessing ---- #
df_final = df_final\
    .pipe(text_noAmpEnt, field_IN='text', field_OUT='text')\
    .pipe(text_noQuotEnt, field_IN='text', field_OUT='text')\
    .pipe(text_noAt, field_IN='text', field_OUT='text')\
    .pipe(text_noHttp, field_IN='text', field_OUT='text')\
    .pipe(text_noDuplLetters, field_IN='text', field_OUT='text')\
    .pipe(text_noPunctuation, field_IN='text', field_OUT='text')\
    .pipe(text_stem, field_IN='text', field_OUT='text', stemmer=SnowballStemmer)\
    .pipe(text_neg, field_IN='text', field_OUT='text')
    # .pipe(text_lemm, field_IN='text', field_OUT='text', lemmatizer=WordNetLemmatizer)
    # .pipe(text_stem, field_IN='text', field_OUT='text', stemmer=LancasterStemmer)

df_final['text_final'] = df_final['text']

# Rows with a sentiment form the train/validation pool; the others are the test set.
mask_train_test = df_final['sentiment'].notna()

# Single .loc call with (rows, column) instead of chained indexing.
X_train_valid = df_final.loc[mask_train_test, 'text_final'].values
y_train_valid = df_final.loc[mask_train_test, 'sentiment'].values
X_test = df_final.loc[~mask_train_test, 'text_final'].values

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, 
    y_train_valid, 
    shuffle=True, 
    train_size=0.9, 
    random_state=42
)

# Step names; they double as the prefixes of the grid-search parameter keys.
vectorizer = 'tfidf'
model = 'multinomialNB'

pipe = Pipeline([
    (vectorizer, TfidfVectorizer()),
    (model, MultinomialNB())
])

def params_for_GridSearchCV(params_IN,step_name):
    """Return a copy of ``params_IN`` whose keys are prefixed with '<step_name>__'.

    This is the key format GridSearchCV uses to address a parameter of a
    named pipeline step.
    """
    return {f'{step_name}__{key}':value for (key,value) in params_IN.items()}

params_all = params_for_GridSearchCV(TfidfVectorizer_params,vectorizer)
params_all.update(params_for_GridSearchCV(multinomialNB_params,model))
print(params_all)

gscv = GridSearchCV(pipe, params_all, cv = 3, verbose = 4, scoring='f1_macro', n_jobs=5)

gscv.fit(X_train, y_train)

# Predict the hold-out split once and reuse the result for all three metrics
# (previously the prediction was recomputed for each of them).
y_valid_pred = gscv.predict(X_valid)

f1 = f1_score(y_valid, y_valid_pred,average='macro')
report = classification_report(y_valid, y_valid_pred)
confusion = confusion_matrix(y_valid, y_valid_pred)

print(f1)
print(report)
print(confusion)

from pathlib import Path
from datetime import datetime

results_name = Path.cwd()/'gscv_results'/'multinomialNB_final_results.csv'
# Create the output folder if needed, so that a long search is not lost to a
# missing-directory error at write time.
results_name.parent.mkdir(parents=True, exist_ok=True)

pd.DataFrame(gscv.cv_results_).to_csv(results_name)

KeyboardInterrupt: 

In [20]:
# results = pd.read_csv('gscv_results\\linearSVC_final_results2.csv')
# print(results.loc[results['rank_test_score']==1]['params'].iloc[0])
# print(results.loc[results['rank_test_score']==1]['params'].iloc[1])

# MODIFIED: norm and ngram_range

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer

# text_noAmpEnt
# text_noQuotEnt
# text_noAt
# text_noAmp
# text_noHttp
# text_noDuplLetters
# text_noPunctuation
# text_stem
# text_lemm
# text_neg

df_final = df.copy(deep=True)

# ---- REMOVAL OF RECORDS WITH DUPLICATE IDS ---- #
# Rows whose 'ids' value occurs more than once in the development set are dropped;
# rows without a sentiment label are always kept.
duplicated_ids = df_dev['ids'].value_counts()
duplicated_ids = duplicated_ids[duplicated_ids > 1] 
df_final = df_final.loc[
    ~(df_final['ids'].isin(list(duplicated_ids.index))) | 
    df_final['sentiment'].isna()
]

# ---- REMOVAL OF BOTS ---- #
# Labelled rows written by users with a suspiciousness score above 0.9 are dropped.
# user_similarity = pd.Series(pd.read_csv('internal_similarity_users.csv').set_index('user')['similarity'])
df_final = df_final.loc[
    ~(df_final['user'].isin(user_suspiciousness[user_suspiciousness>.9].index)) | 
    df_final['sentiment'].isna()
]

# ---- Preprocessing ---- #
df_final = df_final\
    .pipe(text_noAmpEnt, field_IN='text', field_OUT='text')\
    .pipe(text_noQuotEnt, field_IN='text', field_OUT='text')\
    .pipe(text_noAt, field_IN='text', field_OUT='text')\
    .pipe(text_noHttp, field_IN='text', field_OUT='text')\
    .pipe(text_noDuplLetters, field_IN='text', field_OUT='text')\
    .pipe(text_noPunctuation, field_IN='text', field_OUT='text')\
    .pipe(text_stem, field_IN='text', field_OUT='text', stemmer=SnowballStemmer)\
    .pipe(text_neg, field_IN='text', field_OUT='text')
    # .pipe(text_lemm, field_IN='text', field_OUT='text', lemmatizer=WordNetLemmatizer)
    # .pipe(text_stem, field_IN='text', field_OUT='text', stemmer=LancasterStemmer)

df_final['text_final'] = df_final['text']

# Rows with a sentiment form the train/validation pool; the others are the test set.
mask_train_test = df_final['sentiment'].notna()

# Single .loc call with (rows, column) instead of chained indexing.
X_train_valid = df_final.loc[mask_train_test, 'text_final'].values
y_train_valid = df_final.loc[mask_train_test, 'sentiment'].values
X_test = df_final.loc[~mask_train_test, 'text_final'].values

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, 
    y_train_valid, 
    shuffle=True, 
    # stratify=y_train_valid, 
    train_size=0.9, 
    random_state=42
)

# Fixed configuration evaluated in this cell (no search).
model = MultinomialNB(alpha=0.05)

vectorizer = TfidfVectorizer(
    binary = True, 
    max_df = 1.0, 
    max_features = None, 
    min_df = 1, 
    ngram_range = (1,3), 
    norm = 'l2', 
    smooth_idf = False, 
    stop_words = None, 
    sublinear_tf = False, 
    use_idf = True
)

ml_pipe = Pipeline([
    ('tfidf', vectorizer),
    ('MultinomialNB', model)
])

ml_pipe.fit(X_train, y_train)

# Predict the hold-out split once and reuse the result for all three metrics
# (previously the prediction was recomputed for each of them).
y_valid_pred = ml_pipe.predict(X_valid)

f1 = f1_score(y_valid, y_valid_pred,average='macro')
report = classification_report(y_valid, y_valid_pred)
confusion = confusion_matrix(y_valid, y_valid_pred)

print(f1)
print(report)
print(confusion)

0.7603126409779999
              precision    recall  f1-score   support

         0.0       0.75      0.69      0.72      9559
         1.0       0.78      0.82      0.80     12727

    accuracy                           0.77     22286
   macro avg       0.76      0.76      0.76     22286
weighted avg       0.77      0.77      0.77     22286

[[ 6631  2928]
 [ 2255 10472]]


Manually removing users: 0.8019786529108555  
Without manually removing users: 0.8007152526552623

removing duplicate rows : 0.8047867104222173
removing duplicate rows + removal of users with highest similarity : 0.8047867104222173

None : 0.7973008380157585  
text_noAmpEnt : 0.806213112745712  
text_noQuotEnt : 0.806582931090972  
text_noAt : 0.7998564194923328 -- skipped  
text_noAmp : 0.7994236544375262 -- skipped  
text_noHttp : 0.7997304233939981 -- skipped  
text_noDuplLetters : 0.8017853157058441 -- skipped  
text_noPunctuation : 0.8020861827824957 -- skipped  
text_stem :   
    Porter: 0.8014799266049696  
    Lancaster : 0.7974966469180449  
text_lemm : 0.797157501260997  
text_neg : 0.7973008380157585  

[  
    text_noAmpEnt  
    text_noQuotEnt  
] : 0.806582931090972  
  
[  
    text_noAmpEnt  
    text_noQuotEnt  
    text_noAt  
    text_noHttp  
    text_noPunctuation  
    text_lemm  
    text_neg  
] : 0.7982539044206275  
  
[  
    text_noAmpEnt  
    text_noQuotEnt  
    text_noAt  
    text_noHttp  
    text_noPunctuation  
    text_stem(Porter)  
    text_neg  
] : 0.8000375978408897  
  
[  
    text_noAmpEnt  
    text_noQuotEnt  
    text_noAt  
    text_noHttp  
    text_noPunctuation  
    text_stem(Lancaster)  
    text_neg  
] : 0.7961716118953209  
  
[  
    text_noAmpEnt  
    text_noQuotEnt  
    text_noAt  
    text_noHttp  
    text_noPunctuation  
    text_stem(Snowball)  
    text_neg  
] : 0.8006420407066549  

[  
    text_noAmpEnt  
    text_noQuotEnt  
    text_noAt  
    text_noHttp  
    text_noPunctuation    
    text_neg  
] : 0.8006420407066549  

[
    text_noAmpEnt  
    text_noQuotEnt  
    text_noAt  
    text_noHttp  
    text_noDuplLetters  
    text_noPunctuation  
    text_stem  
    text_neg  
] : 0.8016248133460189

Best configuration

In [205]:
# LinearSVC is not imported by any other visible cell, so this cell raised a
# NameError on a fresh kernel; import it where it is used.
from sklearn.svm import LinearSVC

# NOTE(review): `ml_pipe` was assembled earlier from the previous `model` and
# `vectorizer` objects; rebinding the two names here does not change that
# pipeline -- confirm which pipeline the submission is meant to use.
model = LinearSVC(
    C = 1, 
    class_weight = 'balanced', 
    dual = False, 
    fit_intercept = False, 
    max_iter = 50, 
    penalty = 'l2', 
    random_state = 42, 
    tol = 0.001
)

vectorizer = TfidfVectorizer(
    binary = True, 
    max_df = 1.0, 
    max_features = None, 
    min_df = 1, 
    ngram_range = (1,3), 
    norm = 'l2', 
    smooth_idf = False, 
    stop_words = None, 
    sublinear_tf = False, 
    use_idf = True
)

+ NO stemming: 0.7953890537516292
+ Snowball: 0.7994796670641988
+ Lancaster: 0.7934227929849921
+ Porter: 0.7982780155296589
+ Wordnet: 0.7942085265529459

Results generation

In [134]:
# Refit the pipeline on every labelled row, then label the rows without a sentiment.
ml_pipe.fit(X_train_valid, y_train_valid)
y_pred = ml_pipe.predict(X_test)

# Submission file: the row position goes in the 'Id' column, the integer class in 'Predicted'.
submission = pd.DataFrame({'Predicted': y_pred.astype(int)})
submission.to_csv("output_gram13_final2.csv", index_label="Id", header=["Predicted"])