In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string

# 1. Data

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(
    "../raw_data/hate-speech.csv",
)

In [3]:
from sklearn.model_selection import train_test_split

# Feature column (raw text) and target column (label) of the full dataset.
X_full, y_full = data['text'], data['label']

# Work on a subsample: keep the 25% "train" side of this split as X / y and
# throw the remaining 75% away (bound to `_`). The seed makes it reproducible.
X, _, y, _ = train_test_split(
    X_full,
    y_full,
    test_size=0.75,
    random_state=42,
)

# Train / test split

In [5]:
# Hold out 25% of the subsample for the final evaluation; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [14]:
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def cleaning(series):
    """Clean every text entry of ``series`` and return the cleaned texts.

    Each entry goes through, in order: whitespace stripping, lowercasing,
    digit removal, punctuation removal, tokenization, English stopword
    removal and lemmatization (as verb, then noun, adjective and adverb).
    The surviving tokens are re-joined with single spaces.

    Parameters
    ----------
    series : pandas.Series or other iterable of str
        Raw texts. Anything that is not already a Series is wrapped in one,
        so plain lists also work.

    Returns
    -------
    pandas.Series
        One cleaned string per input entry. Non-string entries (for example
        NaN) are returned as the empty string.
    """
    # These helpers do not depend on the sentence being cleaned, so they are
    # built once per call instead of once per sentence (stopword set) or once
    # per word and per part of speech (lemmatizer).
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    drop_punctuation = str.maketrans('', '', string.punctuation)

    def lemmatize_word(word):
        """Lemmatize ``word`` as verb, noun, adjective and adverb, in that order."""
        for pos in ("v", "n", "a", "r"):
            word = lemmatizer.lemmatize(word, pos=pos)
        return word

    def cleaning_sentence(sentence):
        """Take a sentence (string) as input and return the same string
        fully cleaned."""
        if not isinstance(sentence, str):
            # Missing values have no text to clean and no .strip() method.
            return ''

        # Basic cleaning
        sentence = sentence.strip().lower()  ## remove whitespaces, lowercase
        sentence = ''.join(char for char in sentence if not char.isdigit())  ## remove numbers

        # Advanced cleaning
        sentence = sentence.translate(drop_punctuation)  ## remove punctuation

        tokenized_sentence = word_tokenize(sentence)  ## tokenize

        # Remove stopwords, then lemmatize what is left.
        lemmatized = [
            lemmatize_word(w) for w in tokenized_sentence if w not in stop_words
        ]

        return ' '.join(lemmatized)

    if not isinstance(series, pd.Series):
        # Allows calling with a plain list / array of strings.
        series = pd.Series(series)

    return series.apply(cleaning_sentence)

In [15]:
from sklearn.preprocessing import FunctionTransformer

# Wrap the cleaning function in a transformer object so it can be used as a
# pipeline step.
cleaner = FunctionTransformer(func=cleaning)

In [20]:
# Apply the cleaning step to every text of the subsample.
X_cleaned = cleaner.fit_transform(X=X)

KeyboardInterrupt: 

# Grid search

In [29]:
# Clean the subsample and check that exactly one cleaned text came back per
# input row before displaying the resulting shape. (A bare `X.shape` on the
# first line of a cell is never displayed, so the comparison is made explicit.)
X_cleaned = cleaner.fit_transform(X)
assert X_cleaned.shape == X.shape, "cleaning changed the number of rows"
X_cleaned.shape

(84752,)

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features followed by an L1-penalised logistic regression. There is no
# cleaning step here because the search is run on already-cleaned text.
pipeline_logreg = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,1), min_df=30, max_df=0.8)),
    ('logreg', LogisticRegression(solver='liblinear', penalty='l1')),
])

# Set parameters to search. Only the regularisation strength C is active; the
# commented-out entries are search dimensions that are currently disabled.
parameters = {
    #'tfidf__ngram_range': ((1,1),),
    #'tfidf__min_df': (30,30),
    #'tfidf__max_df': (0.75,0.8),
    #'logreg__penalty': ('l1', 'l2'),
    'logreg__C': (0.5, 1, 5)
    #'logreg__solver': ('liblinear', 'newton-cg', 'lbfgs')
}

# Perform grid search on pipeline: 4-fold cross-validation scored on accuracy,
# using all available cores.
grid_search = GridSearchCV(
    pipeline_logreg, parameters, n_jobs=-1, 
    verbose=1, scoring = "accuracy", 
    cv=4
)

# Tune on the training split only. X_test / y_test are used for the final
# evaluation, so fitting the search on all of X_cleaned / y would leak test
# rows into the choice of C. X_cleaned carries the same index as X, so the
# cleaned training rows are selected by label rather than by re-running the
# slow cleaning step (assumes the index labels are unique).
grid_search.fit(X_cleaned.loc[X_train.index], y_train)



Fitting 4 folds for each of 3 candidates, totalling 12 fits


In [42]:
# Pull the winning pipeline, its complete parameter dictionary and its
# cross-validation score out of the fitted grid search, then show the last two.
best_score = grid_search.best_score_
best_model = grid_search.best_estimator_
best_params = best_model.get_params()

print(best_params, best_score, sep="\n")

{'memory': None, 'steps': [('tfidf', TfidfVectorizer(max_df=0.8, min_df=30)), ('logreg', LogisticRegression(C=1, penalty='l1', solver='liblinear'))], 'verbose': False, 'tfidf': TfidfVectorizer(max_df=0.8, min_df=30), 'logreg': LogisticRegression(C=1, penalty='l1', solver='liblinear'), 'tfidf__analyzer': 'word', 'tfidf__binary': False, 'tfidf__decode_error': 'strict', 'tfidf__dtype': <class 'numpy.float64'>, 'tfidf__encoding': 'utf-8', 'tfidf__input': 'content', 'tfidf__lowercase': True, 'tfidf__max_df': 0.8, 'tfidf__max_features': None, 'tfidf__min_df': 30, 'tfidf__ngram_range': (1, 1), 'tfidf__norm': 'l2', 'tfidf__preprocessor': None, 'tfidf__smooth_idf': True, 'tfidf__stop_words': None, 'tfidf__strip_accents': None, 'tfidf__sublinear_tf': False, 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__tokenizer': None, 'tfidf__use_idf': True, 'tfidf__vocabulary': None, 'logreg__C': 1, 'logreg__class_weight': None, 'logreg__dual': False, 'logreg__fit_intercept': True, 'logreg__intercept_s

In [43]:
# Final end-to-end pipeline: raw text -> cleaning -> TF-IDF -> logistic
# regression, with C fixed to 1.
pipeline_logreg = Pipeline(
    steps=[
        ('cleaner', cleaner),
        ('tfidf', TfidfVectorizer(min_df=30, max_df=0.8, ngram_range=(1, 1))),
        ('logreg', LogisticRegression(C=1, penalty='l1', solver='liblinear')),
    ]
)

In [44]:
# Fit every step of the pipeline on the raw training texts and their labels.
pipeline_logreg.fit(X=X_train, y=y_train)

In [45]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Predict the held-out test texts with the fitted pipeline.
predictions = pipeline_logreg.predict(X_test)

# Evaluate each metric against the true test labels, in the order in which
# the results are reported below.
accuracy, f1, recall, precision = (
    metric(y_test, predictions)
    for metric in (accuracy_score, f1_score, recall_score, precision_score)
)

print("Accuracy:", accuracy)
print("F1:", f1)
print("recall", recall)
print("precision", precision)

Accuracy: 0.8528412308854069
F1: 0.8463282405125677
recall 0.811531190926276
precision 0.8842430484037075
