In [1]:
# !pip install datasets

In [2]:
# !pip install demoji


In [3]:
"""
# Just in case the file structure does not make the preprocessing available
import string
from xmlrpc.client import Boolean
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np
from datasets import load_dataset
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.model_selection as ms
from sklearn.utils import resample
import demoji
import re



demoji.download_codes()

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


def setup(rem_stop=True, do_stem=True, do_lem=False, split=True, split_on='preprocessed', upsample=True, do_emojis=True):
    df = load_data();
    df['preprocessed'] = preprocess(
        df['tweet'], rem_stop=rem_stop, do_stem=do_stem, do_lem=do_lem, do_emojis=do_emojis)

    tfidf = train_tfidf(df['preprocessed'])

    if split is True:
        df_train, df_test = split_data(df, split_on)
        if upsample is True:
            df_train = upsampling(df_train)
        return tfidf, df_train, df_test
    else:
        return tfidf, df


def load_data():
    dataset = load_dataset("tweets_hate_speech_detection")
    df = pd.DataFrame.from_dict(dataset['train'])
    return df


def preprocess(data, rem_stop=True, do_stem=True, do_lem=False, do_emojis=True):

    preprocessed = []
    for tweet in data:
        if do_emojis is True:
            tweet = convert_emoji(tweet)
        tokens = tokenization(remove_punctuation(tweet))
        if rem_stop is True:
            tokens = remove_stopwords(tokens)
        if do_stem is True and do_lem is False:
            tokens = stemming(tokens)
        if do_lem is True and do_stem is False:
            tokens = lemmatization(tokens)
        preprocessed.append(np.array(tokens))

    return preprocessed


def train_tfidf(data):
    def dummy(text):
        return text

    tf = TfidfVectorizer(
        analyzer='word',
        tokenizer=dummy,
        preprocessor=dummy,
        token_pattern=None)

    return tf.fit(data)


def split_data(df: pd.DataFrame, split_on='tweet', test_size=0.2, random_state=17):
    y = df['label']
    X = df[split_on]
    (X_train, X_test, y_train, y_test) = ms.train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)

    df_train = pd.concat([y_train, X_train], axis=1)
    df_test = pd.concat([y_test, X_test], axis=1)

    return df_train, df_test


def upsampling(df: pd.DataFrame, replace=True, n_samples=23775, random_state=55):
    data_minority = df[df.label == 1]
    data_majority = df[df.label == 0]
    data_minority = resample(
        data_minority, replace=replace, n_samples=n_samples, random_state=random_state)

    return pd.concat([data_majority, data_minority])


def tokenization(text: str):
    return pd.Series(nltk.word_tokenize(text.lower()))


def remove_punctuation(tokens: pd.Series):
    return "".join([i for i in tokens if i not in punctuation])


def remove_stopwords(tokens: pd.Series):
    stopwords_list = stopwords.words("english")
    return tokens.apply(lambda token: token if token not in stopwords_list and token != '' else None).dropna()


def stemming(tokens: pd.Series):
    stemmer = PorterStemmer()

    return tokens.apply(lambda token: stemmer.stem(token))


def lemmatization(tokens: pd.Series):
    lemmatizer = WordNetLemmatizer()

    return tokens.apply(lambda token: lemmatizer.lemmatize(token))


def convert_emoji(text: str) -> str:
    # convert string to binary representation
    binary = ' '.join(format(ord(x), 'b') for x in text)

    # convert binary representation to utf8 representation
    listRes = list(binary.split(" "))
    try:
        text_with_emoji = bytes([int(x, 2) for x in listRes]).decode('utf-8')
    except UnicodeDecodeError:
        return text

    # get all emojis
    dictionary = demoji.findall(text_with_emoji)

    # replace emojis with text representation
    for key in dictionary.keys():
        text_with_emoji = text_with_emoji.replace(key, dictionary[key] + " ")

    return text_with_emoji


def get_features(df: pd.DataFrame):
    df["n_mentions"] = df["tweet"].apply(lambda x: count_user_mentions(x))
    df["hashtags"] = df["tweet"].apply(lambda x: identify_hashtags(x))

    return df

def count_user_mentions(text:str) ->int:
    return text.count("@user")

def identify_hashtags(text:str) -> list:
    pattern = re.compile(r"#(\w+)")
    return pattern.findall(text)
"""

'\n# Just in case the file structure does not make the preprocessing available\nimport string\nfrom xmlrpc.client import Boolean\nimport nltk\nfrom nltk.corpus import stopwords\nfrom nltk.stem import WordNetLemmatizer\nfrom nltk.stem.porter import PorterStemmer\nimport pandas as pd\nimport numpy as np\nfrom datasets import load_dataset\nfrom string import punctuation\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nimport sklearn.model_selection as ms\nfrom sklearn.utils import resample\nimport demoji\nimport re\n\n\n\ndemoji.download_codes()\n\nnltk.download(\'punkt\')\nnltk.download(\'stopwords\')\nnltk.download(\'wordnet\')\n\n\ndef setup(rem_stop=True, do_stem=True, do_lem=False, split=True, split_on=\'preprocessed\', upsample=True, do_emojis=True):\n    df = load_data();\n    df[\'preprocessed\'] = preprocess(\n        df[\'tweet\'], rem_stop=rem_stop, do_stem=do_stem, do_lem=do_lem, do_emojis=do_emojis)\n\n    tfidf = train_tfidf(df[\'preprocessed\'])\n\n    if split

In [4]:
import os
import sys
sys.path.append(os.path.dirname((os.path.abspath(''))))
from src.data.preprocessing import load_data, preprocess, train_tfidf, split_data, upsampling, get_features, setup
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


  demoji.download_codes()
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mayte\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mayte\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mayte\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Preprocessing (work in progress): run setup() from src.data.preprocessing with
# its default arguments. The three values it returns are used further down as a
# TF-IDF vectorizer (tfidf.transform) and as train/test DataFrames with a
# 'label' column.
tfidf, df_train, df_test = setup()

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Report how many rows each split has and what percentage of them carry label 1.
n_train = df_train['label'].count()
n_test = df_test['label'].count()
pct_train = round(df_train['label'].sum() / n_train * 100, 2)
pct_test = round(df_test['label'].sum() / n_test * 100, 2)
print('There is {} training data, of which {}% is hate speech '.format(n_train, pct_train))
print('There is {} test data, of which {}% is hate speech '.format(n_test, pct_test))

There is 47550 training data, of which 50.0% is hate speech 
There is 6393 test data, of which 7.01% is hate speech 


# Support Vector Machine

In [7]:
import os
import sys
import pandas as pd
sys.path.append(os.path.dirname((os.path.abspath(''))))
from sklearn import svm
from sklearn.metrics import classification_report, f1_score
from src.data.preprocessing import load_data, preprocess, train_tfidf, split_data, upsampling, get_features, setup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [8]:
# Seed the results table with a single column (labelled 0) that holds the row
# labels; every experiment later adds one column with five matching entries.
metric_rows = ["param", 'precision', 'recall', 'accuracy', 'F1']
results_svm_cv = pd.DataFrame(data=metric_rows)

In [9]:
# Baseline configuration: keep stopwords, no emoji conversion, no stemming,
# no upsampling. do_stem is False so the call matches this description and the
# identical 'Only Tokenization' experiment that is run later in the notebook.
tfidf, df_train, df_test = setup(rem_stop=False, do_stem=False, do_lem=False, split=True, upsample=False, do_emojis=False)


Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Idea for the final fine-tuning run: a much wider hyper-parameter grid.
# It is deliberately kept as an inert string literal (nothing is assigned), so
# the grid search does not use it; the cell merely echoes the text as its output.
"""
param_grid={'C': [x for x in range (1, 202, 25)],  
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ["linear", "rbf", "poly", "sigmoid"],
           'class_weight': ["balanced", None]
           }
"""


'\nparam_grid={\'C\': [x for x in range (1, 202, 25)],  \n            \'gamma\': [1, 0.1, 0.01, 0.001, 0.0001],\n            \'kernel\': ["linear", "rbf", "poly", "sigmoid"],\n           \'class_weight\': ["balanced", None]\n           }\n'

In [11]:
# Reduced search space (exactly one candidate) so the grid search stays cheap
# on slow machines.
param_grid = dict(
    C=[1],
    kernel=["linear"],
    class_weight=[None],
)


In [12]:
def train_svm(df_train: pd.DataFrame, tfidf: TfidfVectorizer, grid=None, cv=5, scoring='f1'):
    """Grid-search a C-support SVM on TF-IDF features and return the best estimator.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training data; the 'preprocessed' column is vectorised with ``tfidf``
        and the 'label' column is used as the target.
    tfidf : TfidfVectorizer
        Already fitted vectorizer; only ``transform`` is called on it.
    grid : dict, optional
        Hyper-parameter grid handed to ``GridSearchCV``. When omitted, the
        notebook-level ``param_grid`` is used, so existing two-argument calls
        behave exactly as before.
    cv : int, default 5
        Number of cross-validation folds.
    scoring : str, default 'f1'
        Metric used to rank the candidates.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.
    """
    # Make the search space an explicit input instead of a hidden global,
    # while keeping the global as the default for backward compatibility.
    search_space = param_grid if grid is None else grid

    X_train = tfidf.transform(df_train['preprocessed'])
    y_train = df_train['label']

    # random_state is pinned on the estimator; n_jobs=-1 uses all cores.
    svm_grid = GridSearchCV(
        svm.SVC(random_state=55),
        param_grid=search_space,
        verbose=10,
        n_jobs=-1,
        scoring=scoring,
        cv=cv,
    )
    svm_grid.fit(X_train, y_train)

    return svm_grid.best_estimator_

The parameter `class_weight='balanced'` has proven most useful; however, it has the same or a very similar effect as upsampling and is therefore left out of the analysis for now.

In [13]:
def test_model(model, df_test: pd.DataFrame, tfidf: TfidfVectorizer):
    """Score a fitted model on the test split.

    The 'preprocessed' column of ``df_test`` is vectorised with ``tfidf`` and
    the model's predictions are compared against the 'label' column.

    Returns a five-element list, in this order: the model's parameter dict
    (``model.get_params()``), precision, recall, accuracy and F1 score.
    """
    features = tfidf.transform(df_test['preprocessed'])
    truth = df_test['label']
    predicted = model.predict(features)

    # Order matters: it lines up with the row labels of the results table.
    return [
        model.get_params(),
        precision_score(truth, predicted),
        recall_score(truth, predicted),
        accuracy_score(truth, predicted),
        f1_score(truth, predicted),
    ]

In [14]:
# Experiment 'Only Tokenization': stopwords kept, no emoji conversion,
# no stemming or lemmatization, no upsampling.
baseline_options = dict(
    rem_stop=False,
    do_stem=False,
    do_lem=False,
    split=True,
    upsample=False,
    do_emojis=False,
)
tfidf, df_train, df_test = setup(**baseline_options)
svm_model = train_svm(df_train, tfidf)
results_svm_cv['Only Tokenization'] = test_model(svm_model, df_test, tfidf)

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [15]:
# Snapshot the results table. mode='w' overwrites the file, so re-running the
# cell replaces the snapshot instead of appending a second header and table.
results_svm_cv.to_csv(r'result_svm_1.csv', header=True, index=False, sep=';', mode='w')
print(results_svm_cv['Only Tokenization'])

0    {'C': 1, 'break_ties': False, 'cache_size': 20...
1                                             0.915254
2                                             0.482143
3                                             0.960582
4                                             0.631579
Name: Only Tokenization, dtype: object


In [16]:
# Experiment 'Remove Stopwords': stopwords removed, no emoji conversion,
# no stemming, no upsampling.
tfidf, df_train, df_test = setup(rem_stop=True, do_stem=False, do_lem=False, split=True, upsample=False, do_emojis=False)
svm_cv = train_svm(df_train, tfidf)
results_svm_cv['Remove Stopwords'] = test_model(svm_cv, df_test, tfidf)
# Overwrite (mode='w') so a re-run does not append a duplicate header and table.
results_svm_cv.to_csv(r'result_svm_2.csv', header=True, index=False, sep=';', mode='w')

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [17]:
# Experiment 'Emojis': stopwords removed, emoji conversion enabled,
# no stemming, no upsampling.
tfidf, df_train, df_test = setup(rem_stop=True, do_stem=False, do_lem=False, split=True, upsample=False, do_emojis=True)
svm_cv = train_svm(df_train, tfidf)
results_svm_cv['Emojis'] = test_model(svm_cv, df_test, tfidf)
# Overwrite (mode='w') so a re-run does not append a duplicate header and table.
results_svm_cv.to_csv(r'result_svm_3.csv', header=True, index=False, sep=';', mode='w')

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [18]:
# Experiment 'Stemming': stopwords removed, emoji conversion enabled,
# stemming enabled, no upsampling.
tfidf, df_train, df_test = setup(rem_stop=True, do_stem=True, do_lem=False, split=True, upsample=False, do_emojis=True)
svm_cv = train_svm(df_train, tfidf)
results_svm_cv['Stemming'] = test_model(svm_cv, df_test, tfidf)
# Overwrite (mode='w') so a re-run does not append a duplicate header and table.
results_svm_cv.to_csv(r'result_svm_4.csv', header=True, index=False, sep=';', mode='w')

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [19]:
# Experiment 'Upsampling': stopwords removed, emoji conversion enabled,
# stemming enabled, training split upsampled.
tfidf, df_train, df_test = setup(rem_stop=True, do_stem=True, do_lem=False, split=True, upsample=True, do_emojis=True)
svm_cv = train_svm(df_train, tfidf)
results_svm_cv['Upsampling'] = test_model(svm_cv, df_test, tfidf)
# Overwrite (mode='w') so a re-run does not append a duplicate header and table.
results_svm_cv.to_csv(r'result_svm_5.csv', header=True, index=False, sep=';', mode='w')

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [20]:
# Experiment 'All_but_stemming': stopwords removed, emoji conversion enabled,
# no stemming, training split upsampled.
tfidf, df_train, df_test = setup(rem_stop=True, do_stem=False, do_lem=False, split=True, upsample=True, do_emojis=True)
svm_cv = train_svm(df_train, tfidf)
results_svm_cv['All_but_stemming'] = test_model(svm_cv, df_test, tfidf)
# Overwrite (mode='w') so a re-run does not append a duplicate header and table.
results_svm_cv.to_csv(r'result_svm_6.csv', header=True, index=False, sep=';', mode='w')

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [21]:
# Persist the complete results table; mode='w' overwrites the file so a re-run
# does not append a duplicate header and table.
results_svm_cv.to_csv(r'result_svm.csv', header=True, index=False, sep=';', mode='w')
# Keep the frame as the cell's last expression so the notebook actually renders it.
results_svm_cv
