In [1]:
import string
import re

import spacy
import nltk
from nltk.corpus import stopwords

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

2023-05-24 18:21:36.444010: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-24 18:21:36.707300: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-24 18:21:36.708720: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-24 18:21:39.430210: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-05-24 18:21:39.430921: W tensorflow/core/common_runtime/gpu/gpu_device.

Proprocessing Pipeline

In [2]:
# English spacy model
nlp = spacy.load("en_core_web_sm")
# Stop word list from NLTK
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
stopword_list.append('@user')
stopword_list.append('url')
# Punctuation list from string
puncts = string.punctuation


In [3]:
# Kontraktionen exandieren -> Negation
def decontract_phrase(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"shan\'t", "shall not", phrase)
    # general
    phrase = re.sub(r"n\'t", " not ", phrase)
    phrase = re.sub(r"\'re ", " are ", phrase)
    phrase = re.sub(r"\'s ", " is ", phrase)
    phrase = re.sub(r"\'d ", " would ", phrase)
    phrase = re.sub(r"\'ll ", " will ", phrase)
    phrase = re.sub(r"\'t ", " not ", phrase)
    phrase = re.sub(r"\'ve ", " have ", phrase)
    phrase = re.sub(r"\'m ", " am ", phrase)
    
    phrase = re.sub(r'\s+', ' ', phrase)

    return phrase

In [4]:
def get_lemma(review):
    doc = nlp(review)
    lemma_text = ' '.join([token.lemma_ for token in doc])
    
    return lemma_text


In [5]:
def preprocess_pipeline(review):
    # Kontraktionen expandieren
    review = decontract_phrase(review)
    # Tokenisierung und Lemmatisierung
    review = get_lemma(review)    
    # Stopwords
    review = ' '.join([token for token in review.split() if token.lower() not in stopword_list])
    # Satzzeichen
    review = ''.join([character for character in review if character not in puncts])
    # Remove multiple whitespaces
    review = re.sub(r'^\s+', '', review)
    review = re.sub(r' +', ' ', review)
    review = re.sub(r'\s+$', '', review)
    
    return review

test = ".@USER @USER and @USER MP @USER praises the 'innovative #publicsector thinking' of @USER this week in @USER  If Britain is to prosper in the 21st century, it is through embracing the #digital economy URL URL"
preprocess_pipeline(test)

'USER MP praise innovative publicsector think week Britain prosper 21st century embrace digital economy'

In [6]:
dataset_train = pd.read_csv('./TrainAndDev-OLID/Praktikum_OLID_train.csv')
dataset_dev = pd.read_csv('./TrainAndDev-OLID/Praktikum_OLID_dev.csv')
dataset_train.head()

Unnamed: 0,Text,Task
0,@USER bakkt is doing what an ETF would have do...,OFF
1,@USER I can't 😭😭 he is already 26,NOT
2,@USER he is a psychic ain’t he,NOT
3,.@USER @USER and @USER MP @USER praises the 'i...,NOT
4,@USER @USER [Eric opens the door and runs to t...,OFF


In [7]:
# Are all columns defined?
# Non-Null Count shows how many rows for each column are well-defined
dataset_train.info()
dataset_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10481 entries, 0 to 10480
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    10481 non-null  object
 1   Task    10481 non-null  object
dtypes: object(2)
memory usage: 163.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2620 entries, 0 to 2619
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    2620 non-null   object
 1   Task    2620 non-null   object
dtypes: object(2)
memory usage: 41.1+ KB


In [8]:
dataset_train['Preprocessed'] = dataset_train['Text'].apply(preprocess_pipeline)
dataset_dev['Preprocessed'] = dataset_dev['Text'].apply(preprocess_pipeline)
dataset_train.info()
dataset_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10481 entries, 0 to 10480
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Text          10481 non-null  object
 1   Task          10481 non-null  object
 2   Preprocessed  10481 non-null  object
dtypes: object(3)
memory usage: 245.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2620 entries, 0 to 2619
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Text          2620 non-null   object
 1   Task          2620 non-null   object
 2   Preprocessed  2620 non-null   object
dtypes: object(3)
memory usage: 61.5+ KB


In [9]:
dataset_train.head()
dataset_dev.head()

Unnamed: 0,Text,Task,Preprocessed
0,. @USER @USER @USER @USER Fake conservatives a...,NOT,fake conservative fake outrage budget betrayal
1,@USER @USER You are sick in the head. This man...,OFF,sick head man lose daughter
2,🛑 Truthfeed News 🛑 👉 'Schumer and Feinstein Go...,NOT,🛑 Truthfeed News 🛑 👉 Schumer Feinstein Go HYST...
3,@USER @USER best lead @USER &amp; @USER most u...,NOT,good lead amp underrate
4,@USER @USER The liberals can never handle the ...,OFF,liberal never handle truth truth make head 💥 💥...


In [10]:
cnt_vec = CountVectorizer()
X_cnt_train = cnt_vec.fit_transform(dataset_train['Preprocessed'])
X_cnt_dev = cnt_vec.fit_transform(dataset_dev['Preprocessed'])
print(X_cnt_train.shape)
print(X_cnt_dev.shape)

(10481, 14992)
(2620, 6598)


In [11]:
dataset_train['char_count'] = dataset_train['Text'].apply(len)
dataset_train['word_count'] = dataset_train['Text'].apply(lambda x: len(x.split()))
dataset_train['Density'] = dataset_train['char_count'] / (dataset_train['word_count']+1)
dataset_train['punctuation_count'] = dataset_train['Text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
dataset_train['Punct_Count_Ratio'] = dataset_train['punctuation_count'] / dataset_train['word_count']

dataset_dev['char_count'] = dataset_dev['Text'].apply(len)
dataset_dev['word_count'] = dataset_dev['Text'].apply(lambda x: len(x.split()))
dataset_dev['Density'] = dataset_dev['char_count'] / (dataset_dev['word_count']+1)
dataset_dev['punctuation_count'] = dataset_dev['Text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
dataset_dev['Punct_Count_Ratio'] = dataset_dev['punctuation_count'] / dataset_dev['word_count']

In [12]:
dataset_train.head()
dataset_dev.head()

Unnamed: 0,Text,Task,Preprocessed,char_count,word_count,Density,punctuation_count,Punct_Count_Ratio
0,. @USER @USER @USER @USER Fake conservatives a...,NOT,fake conservative fake outrage budget betrayal,100,17,5.555556,5,0.294118
1,@USER @USER You are sick in the head. This man...,OFF,sick head man lose daughter,65,13,4.642857,4,0.307692
2,🛑 Truthfeed News 🛑 👉 'Schumer and Feinstein Go...,NOT,🛑 Truthfeed News 🛑 👉 Schumer Feinstein Go HYST...,134,21,6.090909,5,0.238095
3,@USER @USER best lead @USER &amp; @USER most u...,NOT,good lead amp underrate,55,9,5.5,6,0.666667
4,@USER @USER The liberals can never handle the ...,OFF,liberal never handle truth truth make head 💥 💥...,90,15,5.625,5,0.333333


In [13]:
X_dense_train = X_cnt_train.toarray()
X_dense_dev = X_cnt_dev.toarray()

X_ling_train = np.vstack((dataset_train['Density'], 
                    dataset_train['Punct_Count_Ratio'])).T
X_ling_dev = np.vstack((dataset_dev['Density'], 
                    dataset_dev['Punct_Count_Ratio'])).T

scaler = MinMaxScaler()
X_ling_scale_train = scaler.fit_transform(X_ling_train)
X_ling_scale_dev = scaler.fit_transform(X_ling_dev)
X_train = np.hstack((X_dense_train, X_ling_scale_train))
X_dev = np.hstack((X_dense_dev, X_ling_scale_dev))

print(X_train.shape)
print(X_dev.shape)

(10481, 14994)
(2620, 6600)


In [14]:
# Encode Labels
encoder = LabelEncoder()
y_train = encoder.fit_transform(dataset_train['Task'])
y_dev = encoder.fit_transform(dataset_dev['Task'])

print(y_train.shape)
print(y_dev.shape)
print(y_train[1:100])
print(y_dev[1:100])

(10481,)
(2620,)
[0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 1 1 1 1 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 1 0 0 1 0 0 1
 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0]
[1 0 0 1 1 1 1 0 1 1 1 1 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1
 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 1 1
 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0]


In [16]:
random_state = 42
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
evaluation_list = list()

for fold, [train, test] in enumerate(kfold.split(X_train, y_train)):
    # Get test data
    X_test = X_train[test]
    y_test = y_train[test]
    # Define model
    model = svm.LinearSVC()
    # Fit model
    model.fit(X_train[train], y_train[train])
    # Predict test set with model
    y_pred = model.predict(X_test)
    # Evaluate model
    accuracy = round(accuracy_score(y_test, y_pred), 4)
    precision = round(precision_score(y_test, y_pred), 4)
    recall = round(recall_score(y_test, y_pred), 4)
    f1 = round(f1_score(y_test, y_pred), 4)
    # Append results
    evaluation_list.append([fold+1, accuracy, precision, recall, f1])
    print('Fold', fold+1, 'done')


Fold 1 done
Fold 2 done
Fold 3 done
Fold 4 done
Fold 5 done


In [17]:
# Export Results
acc_avg = [i[1] for i in evaluation_list]
prec_avg = [i[2] for i in evaluation_list]
recall_avg = [i[3] for i in evaluation_list]
f1_avg = [i[4] for i in evaluation_list]

evaluation_list.append(['AVG',
                        round(np.mean(acc_avg), 4),
                        round(np.mean(prec_avg), 4),
                        round(np.mean(recall_avg), 4),
                        round(np.mean(f1_avg), 4)])

evaluation_df = pd.DataFrame(evaluation_list, columns=['Fold', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])
#evaluation_df.to_csv('./Wine_SVM_evaluation.csv', index=False, sep=';')
print(evaluation_df)

  Fold  Accuracy  Precision  Recall  F1-Score
0    1    0.7258     0.5921  0.5398    0.5647
1    2    0.7300     0.6050  0.5210    0.5599
2    3    0.7290     0.5987  0.5398    0.5677
3    4    0.7195     0.5825  0.5217    0.5505
4    5    0.7400     0.6167  0.5551    0.5843
5  AVG    0.7289     0.5990  0.5355    0.5654


In [18]:
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)

In [19]:
# now create a GridSearchCV object and fit it to the data
search = GridSearchCV(estimator=svm.SVC(),
                      param_grid=param_grid,
                      verbose=1)

search.fit(X_train, y_train)

Fitting 5 folds for each of 169 candidates, totalling 845 fits
