In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string

In [54]:
data = pd.read_csv("../raw_data/hate-speech.csv")

In [55]:
from sklearn.model_selection import train_test_split

X_full = data['text']
y_full = data['label']

X, _, y, _ = train_test_split(X_full, y_full, test_size=0.75, random_state=42)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [57]:
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def cleaning(series):
    def cleaning_sentence(sentence):
        """ takes a sentence (string) as input and returns
        same string but fully cleaned """
        
        # Basic cleaning
        sentence = sentence.strip() ## remove whitespaces
        sentence = sentence.lower() ## lowercase 
        sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers
        
        # Advanced cleaning
        for punctuation in string.punctuation:
            sentence = sentence.replace(punctuation, '') ## remove punctuation
        
        tokenized_sentence = word_tokenize(sentence) ## tokenize 
        
        stop_words = set(stopwords.words('english')) ## define stopwords
        
        tokenized_sentence_cleaned = [ ## remove stopwords
            w for w in tokenized_sentence if not w in stop_words
        ]
        
        # Lemmatizing
        lemmatized_verbs = [WordNetLemmatizer().lemmatize(word, pos = "v") for word in tokenized_sentence_cleaned]
        lemmatized_nouns = [WordNetLemmatizer().lemmatize(word, pos = "n") for word in lemmatized_verbs]
        lemmatized_adj = [WordNetLemmatizer().lemmatize(word, pos = "a") for word in lemmatized_nouns]
        lemmatized_adv = [WordNetLemmatizer().lemmatize(word, pos = "r") for word in lemmatized_adj]
        
        cleaned_sentence = ' '.join(word for word in lemmatized_adv)
        
        return cleaned_sentence
    
    return series.apply(cleaning_sentence)

In [58]:
from sklearn.preprocessing import FunctionTransformer

cleaner = FunctionTransformer(cleaning)

In [61]:
X_cleaned = cleaner.fit_transform(X)

In [59]:
X_train_cleaned = cleaner.fit_transform(X_train)

In [79]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

pipeline_logreg = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,1), min_df=30, max_df=0.8)),
    ('random_forest', RandomForestClassifier(max_depth=10, min_samples_split=25))
])

# Set parameters to searchfrom sklearn.linear_mode
parameters = {
    'random_forest__n_estimators': (300, 200, 400)
    #'random_forest__max_depth': (15,20),
    #'random_forest__min_samples_split': (25,35)
}

# Perform grid search on pipeline
grid_search = GridSearchCV(
    pipeline_logreg, parameters, n_jobs=-1, 
    verbose=1, scoring = "accuracy", 
    cv=4
)

grid_search.fit(X_train_cleaned, y_train)


Fitting 4 folds for each of 3 candidates, totalling 12 fits


In [80]:
best_model = grid_search.best_estimator_
best_params = best_model.get_params()
best_score = grid_search.best_score_
#print(f"best ngram:{best_params['tfidf__ngram_range']} ; best mindf: {best_params['tfidf__min_df']}; best maxdf: {best_params['tfidf__max_df']} best npalpha: {best_params['nb__alpha']}")
#wiht an accuracy score of {best_score}")
print(best_params)
print(best_score)

{'memory': None, 'steps': [('tfidf', TfidfVectorizer(max_df=0.8, min_df=30)), ('random_forest', RandomForestClassifier(max_depth=10, min_samples_split=25, n_estimators=300))], 'verbose': False, 'tfidf': TfidfVectorizer(max_df=0.8, min_df=30), 'random_forest': RandomForestClassifier(max_depth=10, min_samples_split=25, n_estimators=300), 'tfidf__analyzer': 'word', 'tfidf__binary': False, 'tfidf__decode_error': 'strict', 'tfidf__dtype': <class 'numpy.float64'>, 'tfidf__encoding': 'utf-8', 'tfidf__input': 'content', 'tfidf__lowercase': True, 'tfidf__max_df': 0.8, 'tfidf__max_features': None, 'tfidf__min_df': 30, 'tfidf__ngram_range': (1, 1), 'tfidf__norm': 'l2', 'tfidf__preprocessor': None, 'tfidf__smooth_idf': True, 'tfidf__stop_words': None, 'tfidf__strip_accents': None, 'tfidf__sublinear_tf': False, 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__tokenizer': None, 'tfidf__use_idf': True, 'tfidf__vocabulary': None, 'random_forest__bootstrap': True, 'random_forest__ccp_alpha': 0.0, '