In [1]:
import string
import re

import spacy
import nltk
from nltk.corpus import stopwords

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

2023-06-10 17:33:13.484526: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-10 17:33:13.553889: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-10 17:33:13.555193: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Proprocessing Pipeline

In [2]:
# English spacy model
nlp = spacy.load("en_core_web_sm")
# Stop word list from NLTK
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
stopword_list.append('@user')
stopword_list.append('url')
# Punctuation list from string
puncts = string.punctuation


In [3]:
# Kontraktionen exandieren -> Negation
def decontract_phrase(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"shan\'t", "shall not", phrase)
    # general
    phrase = re.sub(r"n\'t", " not ", phrase)
    phrase = re.sub(r"\'re ", " are ", phrase)
    phrase = re.sub(r"\'s ", " is ", phrase)
    phrase = re.sub(r"\'d ", " would ", phrase)
    phrase = re.sub(r"\'ll ", " will ", phrase)
    phrase = re.sub(r"\'t ", " not ", phrase)
    phrase = re.sub(r"\'ve ", " have ", phrase)
    phrase = re.sub(r"\'m ", " am ", phrase)
    
    phrase = re.sub(r'\s+', ' ', phrase)

    return phrase

In [4]:
def get_lemma(review):
    doc = nlp(review)
    lemma_text = ' '.join([token.lemma_ for token in doc])
    
    return lemma_text


In [5]:
def preprocess_pipeline(review):
    # Kontraktionen expandieren
    review = decontract_phrase(review)
    # Tokenisierung und Lemmatisierung
    review = get_lemma(review)    
    # Stopwords
    review = ' '.join([token for token in review.split() if token.lower() not in stopword_list])
    # Satzzeichen
    review = ''.join([character for character in review if character not in puncts])
    # Remove multiple whitespaces
    review = re.sub(r'^\s+', '', review)
    review = re.sub(r' +', ' ', review)
    review = re.sub(r'\s+$', '', review)
    
    return review

test = ".@USER @USER and @USER MP @USER praises the 'innovative #publicsector thinking' of @USER this week in @USER  If Britain is to prosper in the 21st century, it is through embracing the #digital economy URL URL"
preprocess_pipeline(test)

'USER MP praise innovative publicsector think week Britain prosper 21st century embrace digital economy'

In [6]:
dataset = pd.read_csv('./Praktikum_OLID_dev.csv')
dataset.head()

Unnamed: 0,Text,Task
0,. @USER @USER @USER @USER Fake conservatives a...,NOT
1,@USER @USER You are sick in the head. This man...,OFF
2,🛑 Truthfeed News 🛑 👉 'Schumer and Feinstein Go...,NOT
3,@USER @USER best lead @USER &amp; @USER most u...,NOT
4,@USER @USER The liberals can never handle the ...,OFF


In [7]:
# Are all columns defined?
# Non-Null Count shows how many rows for each column are well-defined
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2620 entries, 0 to 2619
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    2620 non-null   object
 1   Task    2620 non-null   object
dtypes: object(2)
memory usage: 41.1+ KB


In [8]:
dataset['Preprocessed'] = dataset['Text'].apply(preprocess_pipeline)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2620 entries, 0 to 2619
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Text          2620 non-null   object
 1   Task          2620 non-null   object
 2   Preprocessed  2620 non-null   object
dtypes: object(3)
memory usage: 61.5+ KB


In [9]:
dataset.head()

Unnamed: 0,Text,Task,Preprocessed
0,. @USER @USER @USER @USER Fake conservatives a...,NOT,fake conservative fake outrage budget betrayal
1,@USER @USER You are sick in the head. This man...,OFF,sick head man lose daughter
2,🛑 Truthfeed News 🛑 👉 'Schumer and Feinstein Go...,NOT,🛑 Truthfeed News 🛑 👉 Schumer Feinstein Go HYST...
3,@USER @USER best lead @USER &amp; @USER most u...,NOT,good lead amp underrate
4,@USER @USER The liberals can never handle the ...,OFF,liberal never handle truth truth make head 💥 💥...


In [10]:
cnt_vec = CountVectorizer()
X_cnt = cnt_vec.fit_transform(dataset['Preprocessed'])
print(X_cnt.shape)

(2620, 6598)


In [11]:
dataset['char_count'] = dataset['Text'].apply(len)
dataset['word_count'] = dataset['Text'].apply(lambda x: len(x.split()))
dataset['Density'] = dataset['char_count'] / (dataset['word_count']+1)
dataset['punctuation_count'] = dataset['Text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
dataset['Punct_Count_Ratio'] = dataset['punctuation_count'] / dataset['word_count']

In [12]:
dataset.head()

Unnamed: 0,Text,Task,Preprocessed,char_count,word_count,Density,punctuation_count,Punct_Count_Ratio
0,. @USER @USER @USER @USER Fake conservatives a...,NOT,fake conservative fake outrage budget betrayal,100,17,5.555556,5,0.294118
1,@USER @USER You are sick in the head. This man...,OFF,sick head man lose daughter,65,13,4.642857,4,0.307692
2,🛑 Truthfeed News 🛑 👉 'Schumer and Feinstein Go...,NOT,🛑 Truthfeed News 🛑 👉 Schumer Feinstein Go HYST...,134,21,6.090909,5,0.238095
3,@USER @USER best lead @USER &amp; @USER most u...,NOT,good lead amp underrate,55,9,5.5,6,0.666667
4,@USER @USER The liberals can never handle the ...,OFF,liberal never handle truth truth make head 💥 💥...,90,15,5.625,5,0.333333


In [13]:
X_dense = X_cnt.toarray()

X_ling = np.vstack((dataset['Density'], 
                    dataset['Punct_Count_Ratio'])).T

scaler = MinMaxScaler()
X_ling_scale = scaler.fit_transform(X_ling)
X = np.hstack((X_dense, X_ling_scale))

print(X.shape)

(2620, 6600)


In [14]:
# Encode Labels
encoder = LabelEncoder()
y = encoder.fit_transform(dataset['Task'])

print(y.shape)
print(y[1:100])

(2620,)
[1 0 0 1 1 1 1 0 1 1 1 1 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1
 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 1 1
 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0]


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

In [16]:
# C_range = np.logspace(-2, 10, 13)
# gamma_range = np.logspace(-9, 3, 13)
# param_grid = dict(gamma=gamma_range, C=C_range)
# "average": [True, False],

param_grid = {
    "l1_ratio": np.linspace(0, 1, num=10),
    "alpha": np.logspace(-9, 3, 13),
}

In [17]:
# now create a GridSearchCV object and fit it to the data
model = linear_model.SGDClassifier()
search = GridSearchCV(estimator=model, cv=10,
                      param_grid=param_grid,
                      verbose=1)

search.fit(X_train, y_train)

Fitting 10 folds for each of 130 candidates, totalling 1300 fits


In [20]:
# print('The best parameters are {0} with a score of {1:0.2f}'.format(search.best_params_, search.best_score_))
# print(search.best_params_.get('C'))


In [21]:
# Define model with optimized hyper-parameter
# average=search.best_params_.get('average'), 
opt_model = linear_model.SGDClassifier(l1_ratio=search.best_params_.get('l1_ratio'),
                                       alpha=search.best_params_.get('alpha'), penalty='elasticnet', fit_intercept=False)
# Fit model on training data
opt_model.fit(X_train, y_train)
# Predict test set with optimized model
y_opt = opt_model.predict(X_test)
# Evaluate model
accuracy = round(accuracy_score(y_test, y_opt), 4)
precision = round(precision_score(y_test, y_opt), 4)
recall = round(recall_score(y_test, y_opt), 4)
f1 = round(f1_score(y_test, y_opt), 4)
print('Accuracy {}\nPrecision {}\nRecall {}\nF1 {}'.format(accuracy, precision, recall, f1))


Accuracy 0.7481
Precision 0.646
Recall 0.4424
F1 0.5252
