In [44]:
import string
import re

import spacy
import nltk
from nltk.corpus import stopwords

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from cleantext import clean

Proprocessing Pipeline

In [45]:
# English spacy model
nlp = spacy.load("en_core_web_sm")
# Stop word list from NLTK
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
stopword_list.append('@user')
stopword_list.append('url')
# Punctuation list from string
puncts = string.punctuation


In [46]:
# Kontraktionen exandieren -> Negation
def decontract_phrase(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"shan\'t", "shall not", phrase)
    # general
    phrase = re.sub(r"n\'t", " not ", phrase)
    phrase = re.sub(r"\'re ", " are ", phrase)
    phrase = re.sub(r"\'s ", " is ", phrase)
    phrase = re.sub(r"\'d ", " would ", phrase)
    phrase = re.sub(r"\'ll ", " will ", phrase)
    phrase = re.sub(r"\'t ", " not ", phrase)
    phrase = re.sub(r"\'ve ", " have ", phrase)
    phrase = re.sub(r"\'m ", " am ", phrase)
    
    phrase = re.sub(r'\s+', ' ', phrase)

    return phrase

In [47]:
def get_lemma(review):
    doc = nlp(review)
    lemma_text = ' '.join([token.lemma_ for token in doc])
    
    return lemma_text


In [48]:
def preprocess_pipeline(review):
    # Kontraktionen expandieren
    review = decontract_phrase(review)
    # Tokenisierung und Lemmatisierung
    review = get_lemma(review)    
    # Stopwords
    review = ' '.join([token for token in review.split() if token.lower() not in stopword_list])
    # Satzzeichen
    review = ''.join([character for character in review if character not in puncts])
    # Remove multiple whitespaces
    review = re.sub(r'^\s+', '', review)
    review = re.sub(r' +', ' ', review)
    review = re.sub(r'\s+$', '', review)
    # remove Emojis
    # review = clean(review, no_emoji=True)
    
    return review

test = ".@USER @USER and @USER MP @USER praises the 'innovative #publicsector thinking' of @USER this week in @USER  If Britain is to prosper in the 21st century, it is through embracing the #digital economy URL URL"
test2 = "@USER She is beyond famous. She is Lalisa Manoban 😎 URL"
preprocess_pipeline(test)

'USER MP praise innovative publicsector think week Britain prosper 21st century embrace digital economy'

In [49]:
dataset = pd.read_csv('./Praktikum_OLID_train.csv')
dataset.head()

Unnamed: 0,Text,Task
0,@USER bakkt is doing what an ETF would have do...,OFF
1,@USER I can't 😭😭 he is already 26,NOT
2,@USER he is a psychic ain’t he,NOT
3,.@USER @USER and @USER MP @USER praises the 'i...,NOT
4,@USER @USER [Eric opens the door and runs to t...,OFF


In [50]:
# Are all columns defined?
# Non-Null Count shows how many rows for each column are well-defined
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10481 entries, 0 to 10480
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    10481 non-null  object
 1   Task    10481 non-null  object
dtypes: object(2)
memory usage: 163.9+ KB


In [51]:
dataset['Preprocessed'] = dataset['Text'].apply(preprocess_pipeline)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10481 entries, 0 to 10480
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Text          10481 non-null  object
 1   Task          10481 non-null  object
 2   Preprocessed  10481 non-null  object
dtypes: object(3)
memory usage: 245.8+ KB


In [52]:
dataset.head()

Unnamed: 0,Text,Task,Preprocessed
0,@USER bakkt is doing what an ETF would have do...,OFF,bakkt etf would not call etf FUKEN CUUUUUKS
1,@USER I can't 😭😭 he is already 26,NOT,not 😭 😭 already 26
2,@USER he is a psychic ain’t he,NOT,psychic ai not
3,.@USER @USER and @USER MP @USER praises the 'i...,NOT,USER MP praise innovative publicsector think w...
4,@USER @USER [Eric opens the door and runs to t...,OFF,Eric open door run couch fuck yeah


In [53]:
cnt_vec = CountVectorizer()
X_cnt = cnt_vec.fit_transform(dataset['Preprocessed'])
print(X_cnt.shape)

(10481, 14992)


In [54]:
dataset['char_count'] = dataset['Text'].apply(len)
dataset['word_count'] = dataset['Text'].apply(lambda x: len(x.split()))
dataset['Density'] = dataset['char_count'] / (dataset['word_count']+1)
dataset['punctuation_count'] = dataset['Text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
dataset['Punct_Count_Ratio'] = dataset['punctuation_count'] / dataset['word_count']

In [55]:
dataset.head()

Unnamed: 0,Text,Task,Preprocessed,char_count,word_count,Density,punctuation_count,Punct_Count_Ratio
0,@USER bakkt is doing what an ETF would have do...,OFF,bakkt etf would not call etf FUKEN CUUUUUKS,103,20,4.904762,5,0.25
1,@USER I can't 😭😭 he is already 26,NOT,not 😭 😭 already 26,33,8,3.666667,2,0.25
2,@USER he is a psychic ain’t he,NOT,psychic ai not,30,7,3.75,1,0.142857
3,.@USER @USER and @USER MP @USER praises the 'i...,NOT,USER MP praise innovative publicsector think w...,211,36,5.702703,13,0.361111
4,@USER @USER [Eric opens the door and runs to t...,OFF,Eric open door run couch fuck yeah,67,13,4.785714,6,0.461538


In [56]:
X_dense = X_cnt.toarray()

X_ling = np.vstack((dataset['Density'], 
                    dataset['Punct_Count_Ratio'])).T

scaler = MinMaxScaler()
X_ling_scale = scaler.fit_transform(X_ling)
X = np.hstack((X_dense, X_ling_scale))

print(X.shape)

(10481, 14994)


In [57]:
# Encode Labels
encoder = LabelEncoder()
y = encoder.fit_transform(dataset['Task'])

print(y.shape)
print(y[1:100])

(10481,)
[0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 1 1 1 1 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 1 0 0 1 0 0 1
 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0]


In [58]:
random_state = 42
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
evaluation_list = list()

for fold, [train, test] in enumerate(kfold.split(X, y)):
    # Get test data
    X_test = X[test]
    y_test = y[test]
    # Define model
    model = linear_model.SGDClassifier()
    # model = svm.LinearSVC()
    # model = linear_model.SGDClassifier(loss='perceptron')
    # Fit model
    model.fit(X[train], y[train])
    # Predict test set with model
    y_pred = model.predict(X_test)
    # Evaluate model
    accuracy = round(accuracy_score(y_test, y_pred), 4)
    precision = round(precision_score(y_test, y_pred), 4)
    recall = round(recall_score(y_test, y_pred), 4)
    f1 = round(f1_score(y_test, y_pred), 4)
    # Append results
    evaluation_list.append([fold+1, accuracy, precision, recall, f1])
    print('Fold', fold+1, 'done')
    print(y_pred)

Fold 1 done
[1 1 1 ... 0 1 1]
Fold 2 done
[0 1 0 ... 1 1 0]
Fold 3 done
[1 1 0 ... 1 1 1]
Fold 4 done
[0 0 0 ... 1 0 0]
Fold 5 done
[0 0 0 ... 0 1 0]
Fold 6 done
[1 1 0 ... 0 0 0]
Fold 7 done
[0 0 0 ... 1 1 0]
Fold 8 done
[0 0 0 ... 0 0 0]
Fold 9 done
[0 0 1 ... 0 0 0]
Fold 10 done
[0 0 0 ... 0 0 1]


In [59]:
# Export Results
acc_avg = [i[1] for i in evaluation_list]
prec_avg = [i[2] for i in evaluation_list]
recall_avg = [i[3] for i in evaluation_list]
f1_avg = [i[4] for i in evaluation_list]

evaluation_list.append(['AVG',
                        round(np.mean(acc_avg), 4),
                        round(np.mean(prec_avg), 4),
                        round(np.mean(recall_avg), 4),
                        round(np.mean(f1_avg), 4)])

evaluation_df = pd.DataFrame(evaluation_list, columns=['Fold', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])
#evaluation_df.to_csv('./Wine_SVM_evaluation.csv', index=False, sep=';')
print(evaluation_df)

   Fold  Accuracy  Precision  Recall  F1-Score
0     1    0.7407     0.6149  0.5723    0.5928
1     2    0.7395     0.6174  0.5549    0.5845
2     3    0.7338     0.6044  0.5607    0.5817
3     4    0.7500     0.6498  0.5217    0.5788
4     5    0.7281     0.5932  0.5536    0.5727
5     6    0.7385     0.6282  0.5043    0.5595
6     7    0.7214     0.5826  0.5420    0.5616
7     8    0.7099     0.5623  0.5362    0.5490
8     9    0.7433     0.6226  0.5594    0.5893
9    10    0.7519     0.6471  0.5420    0.5899
10  AVG    0.7357     0.6123  0.5447    0.5760
