In [22]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.2.1-py3-none-any.whl (342 kB)
Collecting pyarrow>=6.0.0
  Downloading pyarrow-8.0.0-cp39-cp39-win_amd64.whl (17.9 MB)
Collecting dill
  Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.12.2-py39-none-any.whl (128 kB)
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp39-cp39-win_amd64.whl (29 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
Installing collected packages: dill, xxhash, responses, pyarrow, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.2.1 dill-0.3.4 huggingface-hub-0.6.0 multiprocess-0.70.12.2 pyarrow-8.0.0 responses-0.18.0 xxhash-3.0.0


In [26]:
!pip install demoji


Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
Installing collected packages: demoji
Successfully installed demoji-1.1.0


In [27]:
import string
from xmlrpc.client import Boolean
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np
from datasets import load_dataset
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.model_selection as ms
from sklearn.utils import resample
import demoji
import re



demoji.download_codes()

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


def setup(rem_stop=True, do_stem=True, do_lem=False, split=True, split_on='preprocessed', upsample=True, do_emojis=True):
    df = load_data();
    df['preprocessed'] = preprocess(
        df['tweet'], rem_stop=rem_stop, do_stem=do_stem, do_lem=do_lem, do_emojis=do_emojis)

    tfidf = train_tfidf(df['preprocessed'])

    if split is True:
        df_train, df_test = split_data(df, split_on)
        if upsample is True:
            df_train = upsampling(df_train)
        return tfidf, df_train, df_test
    else:
        return tfidf, df


def load_data():
    dataset = load_dataset("tweets_hate_speech_detection")
    df = pd.DataFrame.from_dict(dataset['train'])
    return df


def preprocess(data, rem_stop=True, do_stem=True, do_lem=False, do_emojis=True):

    preprocessed = []
    for tweet in data:
        if do_emojis is True:
            tweet = convert_emoji(tweet)
        tokens = tokenization(remove_punctuation(tweet))
        if rem_stop is True:
            tokens = remove_stopwords(tokens)
        if do_stem is True and do_lem is False:
            tokens = stemming(tokens)
        if do_lem is True and do_stem is False:
            tokens = lemmatization(tokens)
        preprocessed.append(np.array(tokens))

    return preprocessed


def train_tfidf(data):
    def dummy(text):
        return text

    tf = TfidfVectorizer(
        analyzer='word',
        tokenizer=dummy,
        preprocessor=dummy,
        token_pattern=None)

    return tf.fit(data)


def split_data(df: pd.DataFrame, split_on='tweet', test_size=0.2, random_state=17):
    y = df['label']
    X = df[split_on]
    (X_train, X_test, y_train, y_test) = ms.train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)

    df_train = pd.concat([y_train, X_train], axis=1)
    df_test = pd.concat([y_test, X_test], axis=1)

    return df_train, df_test


def upsampling(df: pd.DataFrame, replace=True, n_samples=23775, random_state=55):
    data_minority = df[df.label == 1]
    data_majority = df[df.label == 0]
    data_minority = resample(
        data_minority, replace=replace, n_samples=n_samples, random_state=random_state)

    return pd.concat([data_majority, data_minority])


def tokenization(text: str):
    return pd.Series(nltk.word_tokenize(text.lower()))


def remove_punctuation(tokens: pd.Series):
    return "".join([i for i in tokens if i not in punctuation])


def remove_stopwords(tokens: pd.Series):
    stopwords_list = stopwords.words("english")
    return tokens.apply(lambda token: token if token not in stopwords_list and token != '' else None).dropna()


def stemming(tokens: pd.Series):
    stemmer = PorterStemmer()

    return tokens.apply(lambda token: stemmer.stem(token))


def lemmatization(tokens: pd.Series):
    lemmatizer = WordNetLemmatizer()

    return tokens.apply(lambda token: lemmatizer.lemmatize(token))


def convert_emoji(text: str) -> str:
    # convert string to binary representation
    binary = ' '.join(format(ord(x), 'b') for x in text)

    # convert binary representation to utf8 representation
    listRes = list(binary.split(" "))
    try:
        text_with_emoji = bytes([int(x, 2) for x in listRes]).decode('utf-8')
    except UnicodeDecodeError:
        return text

    # get all emojis
    dictionary = demoji.findall(text_with_emoji)

    # replace emojis with text representation
    for key in dictionary.keys():
        text_with_emoji = text_with_emoji.replace(key, dictionary[key] + " ")

    return text_with_emoji


def get_features(df: pd.DataFrame):
    df["n_mentions"] = df["tweet"].apply(lambda x: count_user_mentions(x))
    df["hashtags"] = df["tweet"].apply(lambda x: identify_hashtags(x))

    return df

def count_user_mentions(text:str) ->int:
    return text.count("@user")

def identify_hashtags(text:str) -> list:
    pattern = re.compile(r"#(\w+)")
    return pattern.findall(text)


  demoji.download_codes()
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mayte\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mayte\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mayte\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
import os
import sys
sys.path.append(os.path.dirname((os.path.abspath(''))))
from src.data.preprocessing import load_data, preprocess, train_tfidf, split_data, upsampling, get_features, setup
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


  demoji.download_codes()
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mayte\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mayte\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mayte\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
# requires datasets library (use pip)
#df = load_data()
df.head()

Unnamed: 0,id,label,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags,tfidf_stemmed_tokens,tfidf_stemmed_hashtags,tfidf_lemmatized_tokens,tfidf_lemmatized_hashtags
0,24090,0,best #lawofattraction #resources for #healing!...,0,"['lawofattraction', 'resources', 'healing', 'a...",best lawofattraction resources for healing ...,best lawofattraction resources for healing ...,"['lawofattraction', 'for', 'altwaystoheal', 'is']","['lawofattraction', 'altwaystoheal']","['lawofattraction', 'resources', 'healing', 'a...","['lawofattract', 'altwaystoh']","['lawofattract', 'resourc', 'heal', 'altwaysto...","['lawofattraction', 'altwaystoheal']","['lawofattraction', 'resource', 'healing', 'al...","[-54.797821044921875, 19.326093673706055]","[50.279502868652344, -0.368119478225708]","[30.39127540588379, -49.107627868652344]","[8.619098663330078, -19.02434539794922]"
1,15264,0,remembering to focus on the simplest happy mom...,0,"['blogger', 'blog', 'life']",remembering to focus on the simplest happy mom...,remembering to focus on the simplest happy mom...,"['to', 'on', 'simplest', 'moments', 'life', 'b...","['simplest', 'moments', 'life', 'blogger', 'li...","['blogger', 'blog', 'life']","['simplest', 'moment', 'life', 'blogger', 'life']","['blogger', 'blog', 'life']","['simplest', 'moment', 'life', 'blogger', 'life']","['blogger', 'blog', 'life']","[12.769611358642578, 38.40631866455078]","[-0.20247356593608856, -12.449955940246582]","[-25.304161071777344, -11.912176132202148]","[-5.459761619567871, -11.88219928741455]"
2,19310,0,when you get as happy as your boyfriend to be ...,0,['silvia'],when you get as happy as your boyfriend to be ...,when you get as happy as your boyfriend to be ...,"['you', 'as', 'as', 'boyfriend', 'be', 'with',...","['boyfriend', 'car']",['silvia'],"['boyfriend', 'car']",['silvia'],"['boyfriend', 'car']",['silvia'],"[15.18979263305664, -9.596672058105469]","[10.67345905303955, -4.716570854187012]","[80.47900390625, 3.222534656524658]","[-2.534292459487915, -6.132740020751953]"
3,27244,0,why do you always try to make me happy? i don...,0,"['love', 'devotion']",why do you always try to make me happy i dont...,why do you always try to make me happy i dont...,"['do', 'always', 'to', 'me', 'i', 'know', 'to'...","['always', 'know', 'love']","['love', 'devotion']","['alway', 'know', 'love']","['love', 'devot']","['always', 'know', 'love']","['love', 'devotion']","[-46.13848876953125, -18.032955169677734]","[11.31847858428955, -16.98657989501953]","[40.34811019897461, -24.527305603027344]","[8.227179527282715, -13.818502426147461]"
4,6633,0,omg is finally here!!! #ps4 #farcry4 #gtav #un...,0,"['ps4', 'farcry4', 'gtav', 'unchaed4']",omg is finally here ps4 farcry4 gtav unchaed4,omg is finally here ps4 farcry4 gtav unchaed4,"['is', 'here', 'farcry4', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","[3.866750478744507, -5.09706449508667]","[21.51004409790039, 15.360187530517578]","[8.554491996765137, -20.749971389770508]","[-6.128843307495117, -11.832077980041504]"


In [30]:
# Create preprossesed data
df['preprocessed'] = preprocess(df['tweet'])
df.head()

Unnamed: 0,id,label,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags,tfidf_stemmed_tokens,tfidf_stemmed_hashtags,tfidf_lemmatized_tokens,tfidf_lemmatized_hashtags,preprocessed
0,24090,0,best #lawofattraction #resources for #healing!...,0,"['lawofattraction', 'resources', 'healing', 'a...",best lawofattraction resources for healing ...,best lawofattraction resources for healing ...,"['lawofattraction', 'for', 'altwaystoheal', 'is']","['lawofattraction', 'altwaystoheal']","['lawofattraction', 'resources', 'healing', 'a...","['lawofattract', 'altwaystoh']","['lawofattract', 'resourc', 'heal', 'altwaysto...","['lawofattraction', 'altwaystoheal']","['lawofattraction', 'resource', 'healing', 'al...","[-54.797821044921875, 19.326093673706055]","[50.279502868652344, -0.368119478225708]","[30.39127540588379, -49.107627868652344]","[8.619098663330078, -19.02434539794922]","[best, lawofattract, resourc, heal, altwaystoh..."
1,15264,0,remembering to focus on the simplest happy mom...,0,"['blogger', 'blog', 'life']",remembering to focus on the simplest happy mom...,remembering to focus on the simplest happy mom...,"['to', 'on', 'simplest', 'moments', 'life', 'b...","['simplest', 'moments', 'life', 'blogger', 'li...","['blogger', 'blog', 'life']","['simplest', 'moment', 'life', 'blogger', 'life']","['blogger', 'blog', 'life']","['simplest', 'moment', 'life', 'blogger', 'life']","['blogger', 'blog', 'life']","[12.769611358642578, 38.40631866455078]","[-0.20247356593608856, -12.449955940246582]","[-25.304161071777344, -11.912176132202148]","[-5.459761619567871, -11.88219928741455]","[rememb, focu, simplest, happi, moment, life, ..."
2,19310,0,when you get as happy as your boyfriend to be ...,0,['silvia'],when you get as happy as your boyfriend to be ...,when you get as happy as your boyfriend to be ...,"['you', 'as', 'as', 'boyfriend', 'be', 'with',...","['boyfriend', 'car']",['silvia'],"['boyfriend', 'car']",['silvia'],"['boyfriend', 'car']",['silvia'],"[15.18979263305664, -9.596672058105469]","[10.67345905303955, -4.716570854187012]","[80.47900390625, 3.222534656524658]","[-2.534292459487915, -6.132740020751953]","[get, happi, boyfriend, reunit, car, silvia]"
3,27244,0,why do you always try to make me happy? i don...,0,"['love', 'devotion']",why do you always try to make me happy i dont...,why do you always try to make me happy i dont...,"['do', 'always', 'to', 'me', 'i', 'know', 'to'...","['always', 'know', 'love']","['love', 'devotion']","['alway', 'know', 'love']","['love', 'devot']","['always', 'know', 'love']","['love', 'devotion']","[-46.13848876953125, -18.032955169677734]","[11.31847858428955, -16.98657989501953]","[40.34811019897461, -24.527305603027344]","[8.227179527282715, -13.818502426147461]","[alway, tri, make, happi, dont, know, make, sa..."
4,6633,0,omg is finally here!!! #ps4 #farcry4 #gtav #un...,0,"['ps4', 'farcry4', 'gtav', 'unchaed4']",omg is finally here ps4 farcry4 gtav unchaed4,omg is finally here ps4 farcry4 gtav unchaed4,"['is', 'here', 'farcry4', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","[3.866750478744507, -5.09706449508667]","[21.51004409790039, 15.360187530517578]","[8.554491996765137, -20.749971389770508]","[-6.128843307495117, -11.832077980041504]","[omg, final, ps4, farcry4, gtav, unchaed4]"


In [31]:
# Get n_usermentions and get seperate hashtags
df = get_features(df)
df.head()

Unnamed: 0,id,label,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags,tfidf_stemmed_tokens,tfidf_stemmed_hashtags,tfidf_lemmatized_tokens,tfidf_lemmatized_hashtags,preprocessed
0,24090,0,best #lawofattraction #resources for #healing!...,0,"[lawofattraction, resources, healing, altwayst...",best lawofattraction resources for healing ...,best lawofattraction resources for healing ...,"['lawofattraction', 'for', 'altwaystoheal', 'is']","['lawofattraction', 'altwaystoheal']","['lawofattraction', 'resources', 'healing', 'a...","['lawofattract', 'altwaystoh']","['lawofattract', 'resourc', 'heal', 'altwaysto...","['lawofattraction', 'altwaystoheal']","['lawofattraction', 'resource', 'healing', 'al...","[-54.797821044921875, 19.326093673706055]","[50.279502868652344, -0.368119478225708]","[30.39127540588379, -49.107627868652344]","[8.619098663330078, -19.02434539794922]","[best, lawofattract, resourc, heal, altwaystoh..."
1,15264,0,remembering to focus on the simplest happy mom...,0,"[blogger, blog, life]",remembering to focus on the simplest happy mom...,remembering to focus on the simplest happy mom...,"['to', 'on', 'simplest', 'moments', 'life', 'b...","['simplest', 'moments', 'life', 'blogger', 'li...","['blogger', 'blog', 'life']","['simplest', 'moment', 'life', 'blogger', 'life']","['blogger', 'blog', 'life']","['simplest', 'moment', 'life', 'blogger', 'life']","['blogger', 'blog', 'life']","[12.769611358642578, 38.40631866455078]","[-0.20247356593608856, -12.449955940246582]","[-25.304161071777344, -11.912176132202148]","[-5.459761619567871, -11.88219928741455]","[rememb, focu, simplest, happi, moment, life, ..."
2,19310,0,when you get as happy as your boyfriend to be ...,0,[silvia],when you get as happy as your boyfriend to be ...,when you get as happy as your boyfriend to be ...,"['you', 'as', 'as', 'boyfriend', 'be', 'with',...","['boyfriend', 'car']",['silvia'],"['boyfriend', 'car']",['silvia'],"['boyfriend', 'car']",['silvia'],"[15.18979263305664, -9.596672058105469]","[10.67345905303955, -4.716570854187012]","[80.47900390625, 3.222534656524658]","[-2.534292459487915, -6.132740020751953]","[get, happi, boyfriend, reunit, car, silvia]"
3,27244,0,why do you always try to make me happy? i don...,0,"[love, devotion]",why do you always try to make me happy i dont...,why do you always try to make me happy i dont...,"['do', 'always', 'to', 'me', 'i', 'know', 'to'...","['always', 'know', 'love']","['love', 'devotion']","['alway', 'know', 'love']","['love', 'devot']","['always', 'know', 'love']","['love', 'devotion']","[-46.13848876953125, -18.032955169677734]","[11.31847858428955, -16.98657989501953]","[40.34811019897461, -24.527305603027344]","[8.227179527282715, -13.818502426147461]","[alway, tri, make, happi, dont, know, make, sa..."
4,6633,0,omg is finally here!!! #ps4 #farcry4 #gtav #un...,0,"[ps4, farcry4, gtav, unchaed4]",omg is finally here ps4 farcry4 gtav unchaed4,omg is finally here ps4 farcry4 gtav unchaed4,"['is', 'here', 'farcry4', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","[3.866750478744507, -5.09706449508667]","[21.51004409790039, 15.360187530517578]","[8.554491996765137, -20.749971389770508]","[-6.128843307495117, -11.832077980041504]","[omg, final, ps4, farcry4, gtav, unchaed4]"


In [32]:
# Create trained tfidf vectorizer
tfidf = train_tfidf(df['preprocessed'])

In [33]:
# Split data on specified column
df_train , df_test = split_data(df, 'preprocessed')

In [34]:
# Upsample data
df_train_up = upsampling(df_train)
df_train_up.head()

Unnamed: 0,label,preprocessed
1823,0,"[cant, nice, everyon]"
2723,0,"[recycl, make, green, natur, wastewarrior]"
3024,0,"[10, premium, staer, kit, today, happi, oiler,..."
3793,0,"[tag, someon, quot, beauti, beach, lifestyl, b..."
2283,0,"[anyonebuttrump, go, backfir, user, lot, us, m..."


In [35]:
# Does all of the above (wip)
tfidf, df_train, df_test = setup()

Downloading builder script:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/881 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset tweets_hate_speech_detection/default (download: 2.96 MiB, generated: 3.04 MiB, post-processed: Unknown size, total: 6.00 MiB) to C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2...


Downloading data:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31962 [00:00<?, ? examples/s]

Dataset tweets_hate_speech_detection downloaded and prepared to C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [38]:
#df_test.train()

In [37]:
print('There is {} training data, of which {}% is hate speech '.format(df_train['label'].count(), round(df_train['label'].sum()/df_train['label'].count()*100,2)))
print('There is {} test data, of which {}% is hate speech '.format(df_test['label'].count(), round(df_test['label'].sum()/df_test['label'].count()*100,2)))

There is 47550 training data, of which 50.0% is hate speech 
There is 6393 test data, of which 7.01% is hate speech 


# Support Vector Machine

In [40]:
import os
import sys
import pandas as pd
sys.path.append(os.path.dirname((os.path.abspath(''))))
from sklearn import svm
from sklearn.metrics import classification_report, f1_score
from src.data.preprocessing import load_data, preprocess, train_tfidf, split_data, upsampling, get_features, setup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [119]:
results_svm_cv = pd.DataFrame(['precision', 'recall', 'accuracy', 'F1'])

In [42]:
kernels = ["linear", "rbf", "poly", "sigmoid"]

In [57]:
# Don't Remove Stopwords, No Emojis, No Stemming, No Upsampling 
tfidf, df_train, df_test = setup(rem_stop=False, do_stem=False, do_lem=False, split=True, split_on='preprocessed', upsample=False, do_emojis=False)


Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [106]:
def train_svm(df_train: pd.DataFrame, tfidf: TfidfVectorizer):

    X_train = tfidf.transform(df_train['preprocessed'])
    y_train = df_train['label']

    # C-Support  Support Vector Machine
    svm_grid = ms.GridSearchCV(svm.SVC(), param_grid={'C': [1], 'kernel': ["linear"]})
    svm_grid.fit(X_train, y_train);
    
    return svm_grid

In [79]:
"""
#idea for final finetuning
param_grid={'C': [0.001,0.01,0.1,1,10,100,1000],  
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ["linear", "rbf", "poly", "sigmoid"],
            'class_weight': [None,"balanced"]}
"""

'\n#idea for final finetuning\nparam_grid={\'C\': [0.001,0.01,0.1,1,10,100,1000],  \n            \'gamma\': [1, 0.1, 0.01, 0.001, 0.0001],\n            \'kernel\': ["linear", "rbf", "poly", "sigmoid"],\n            \'class_weight\': [None,"balanced"]}\n'

In [107]:
def test_model(model, df_test: pd.DataFrame, tfidf: TfidfVectorizer):
    
    X_test = tfidf.transform(df_test['preprocessed'])
    y_test = df_test['label']
    y_pred = model.predict(X_test)

    predictions = []

    predictions.append(precision_score(y_test, y_pred))
    predictions.append(recall_score(y_test, y_pred))
    predictions.append(accuracy_score(y_test, y_pred))
    predictions.append(f1_score(y_test, y_pred))

    return predictions

In [120]:
# Don't Remove Stopwords, No Emojis, No Stemming, No Upsampling 
tfidf, df_train, df_test = setup(rem_stop=False, do_stem=False, do_lem=False, split=True, split_on='preprocessed', upsample=False, do_emojis=False)
svm_model = train_svm(df_train, tfidf)
results_svm_cv['Only Tokenization'] = test_model(svm_model, df_test, tfidf)


Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [122]:
#print(results_svm_cv['Only Tokenization'])

0    0.915254
1    0.482143
2    0.960582
3    0.631579
Name: Only Tokenization, dtype: float64


In [123]:
# Remove Stopwords, No Emojis, No Stemming, No Upsampling 
tfidf, df_train, df_test = setup(rem_stop=True, do_stem=False, do_lem=False, split=True, split_on='preprocessed', upsample=False, do_emojis=False)
svm_cv = train_svm(df_train, tfidf)
results_svm_cv['Remove Stopwords'] = test_model(svm_cv, df_test, tfidf)


Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [124]:
# Remove Stopwords, Emojis, No Stemming, No Upsampling 
tfidf, df_train, df_test = setup(rem_stop=True, do_stem=False, do_lem=False, split=True, split_on='preprocessed', upsample=False, do_emojis=True)
svm_cv = train_svm(df_train, tfidf)
results_svm_cv['Emojis'] = test_model(svm_cv, df_test, tfidf)

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [125]:
# Remove Stopwords, Emojis, Stemming, Upsampling 
tfidf, df_train, df_test = setup(rem_stop=True, do_stem=True, do_lem=False, split=True, split_on='preprocessed', upsample=True, do_emojis=True)
svm_cv = train_svm(df_train, tfidf)
results_svm_cv['Upsampling'] = test_model(svm_cv, df_test, tfidf)

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [126]:
# Remove Stopwords, Emojis, No Stemming, Upsampling 
tfidf, df_train, df_test = setup(rem_stop=True, do_stem=False, do_lem=False, split=True, split_on='preprocessed', upsample=True, do_emojis=True)
svm_cv = train_svm(df_train, tfidf)
results_svm_cv['All_but_stemming'] = test_model(svm_cv, df_test, tfidf)

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [127]:
results_svm_cv

Unnamed: 0,0,Only Tokenization,Remove Stopwords,Emojis,Upsampling,All_but_stemming
0,precision,0.915254,0.917012,0.921811,0.720358,0.740099
1,recall,0.482143,0.493304,0.5,0.71875,0.667411
2,accuracy,0.960582,0.961364,0.96199,0.960738,0.960269
3,F1,0.631579,0.641509,0.648336,0.719553,0.701878
