In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import string
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
# Store the word list under its own name. Rebinding `stopwords` to a list would
# hide the NLTK corpus reader, whose `.words()` method `cleaning` calls later, and
# would only keep working as long as a later cell re-imports the name.
ENGLISH_STOPWORDS = stopwords.words('english')

In [None]:
# Fetch the "hate-speech" configuration of the mediabiasgroup/mbib-base dataset.
dataset = load_dataset(path="mediabiasgroup/mbib-base", name="hate-speech")

# Convert the 'train' split to a DataFrame and keep a pickled copy next to the
# notebook (the pickle is written here; nothing in this cell reads it back).
train_split = dataset['train']
df = train_split.to_pandas()
df.to_pickle('data.pkl')

In [None]:
import string
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def cleaning(series):
    """Clean every text of a pandas Series for bag-of-words vectorisation.

    Each value is stripped, lower-cased, has its digits and ASCII punctuation
    removed, is tokenised with NLTK's ``word_tokenize``, filtered against the
    NLTK English stop-word list and lemmatised as a verb. The remaining tokens
    are joined back together with single spaces.

    Parameters
    ----------
    series : pandas.Series
        Series whose values are strings.

    Returns
    -------
    pandas.Series
        The result of ``series.apply``: one cleaned string per input value.

    Notes
    -----
    Punctuation is stripped *before* stop-word filtering, so a stop word that
    contains an apostrophe can never match the (already apostrophe-free) tokens.
    """
    # These three objects are identical for every sentence, so they are built
    # once per call instead of once per row (reading the stop-word corpus and
    # creating a lemmatizer for each sentence is pure overhead).
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    punctuation_table = str.maketrans('', '', string.punctuation)

    def cleaning_sentence(sentence):
        """Return `sentence` (a string) fully cleaned, as described above."""
        # Basic cleaning: surrounding whitespace, case, digits.
        sentence = sentence.strip().lower()
        sentence = ''.join(char for char in sentence if not char.isdigit())

        # Advanced cleaning: drop punctuation, then split into tokens.
        sentence = sentence.translate(punctuation_table)
        tokens = word_tokenize(sentence)

        # Remove stop words and lemmatise what is left, treating it as a verb.
        lemmas = [
            lemmatizer.lemmatize(token, pos='v')
            for token in tokens
            if token not in stop_words
        ]
        return ' '.join(lemmas)

    return series.apply(cleaning_sentence)

In [None]:
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# One seed for the data splits and every randomised estimator, so that a
# Restart & Run All reproduces the same scores.
SEED = 42

# Unigram TF-IDF; terms found in fewer than 30 documents or in more than 80% of
# the documents are dropped from the vocabulary.
vectorizer = TfidfVectorizer(min_df=30, max_df=0.8, ngram_range=(1, 1))
# Wraps `cleaning` so it can be used as the first step of a scikit-learn pipeline.
cleaner = FunctionTransformer(cleaning)

# Candidate classifiers compared in the evaluation loop, keyed by class name.
models = {
    'LogisticRegression': LogisticRegression(max_iter=2000),
    'RandomForestClassifier': RandomForestClassifier(n_jobs=-1, random_state=SEED),
    'SVC': SVC(max_iter=2000),
    'KNeighborsClassifier': KNeighborsClassifier(n_jobs=-1),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=SEED),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=SEED),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=SEED),
    }

X = df['text']
y = df['label']
# 80/20 train/test split, then 80/20 again on the training part:
# roughly 64% train, 16% validation, 20% test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=SEED)

In [None]:
# Text cleaning and TF-IDF do not depend on the classifier, so they are fitted
# once on the training split and reused for every candidate, instead of being
# recomputed inside a new pipeline for each model.
preprocess = make_pipeline(cleaner, vectorizer)
X_train_features = preprocess.fit_transform(X_train)
X_val_features = preprocess.transform(X_val)

for model in models.values():
    print('Building model: ', model)
    model.fit(X_train_features, y_train)
    # Scores are computed on the validation split; the test split is not touched here.
    y_pred = model.predict(X_val_features)
    print(model)
    print(classification_report(y_val, y_pred))
    print(f'{model} has accuracy equal to: {accuracy_score(y_val, y_pred)}')
    print('---------------------------')

In [None]:
# Assemble the cleaning -> TF-IDF -> logistic-regression pipeline around a new
# LogisticRegression instance. Nothing is fitted in this cell.
pipe = make_pipeline(cleaner, vectorizer, LogisticRegression(max_iter=2000))
# Keep a direct handle on the classifier, which is the pipeline's last step.
model = pipe.steps[-1][1]