In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string

In [91]:
# Load the raw dataset; the path is relative to the directory the notebook runs from.
# Later cells read the 'text' and 'label' columns of this frame.
data = pd.read_csv("../raw_data/hate-speech.csv")

In [93]:
from sklearn.model_selection import train_test_split

# Raw texts are the features, the 'label' column is the target
X_full, y_full = data['text'], data['label']

# Work on a reproducible 25% subsample: keep the "train" side of the split
# and discard the remaining 75% (the two `_` slots).
X, _, y, _ = train_test_split(
    X_full,
    y_full,
    test_size=0.75,
    random_state=42,
)

In [94]:
# Sanity check: features and labels of the subsample must have the same length
print(X.shape, y.shape, sep="\n")

(84752,)
(84752,)


# 1. Cleaning

## 1.1 Key considerations

**Stop words**: only the standard English list is used for now; it can be customized by adding or removing words (see below).  
**Lemmatizing**: words can be lemmatized as different parts of speech (nouns, verbs, adjectives, adverbs). All four are applied here, but we might consider doing it differently.

## 1.2 Cleaning function

In [95]:
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def cleaning(series):
    """Clean every text of a pandas Series and return the cleaned Series.

    Each sentence is stripped, lowercased, purged of digits and of the
    characters in ``string.punctuation``, tokenized, filtered against the
    NLTK English stop-word list and lemmatized successively as verb, noun,
    adjective and adverb. The remaining tokens are joined with single spaces.

    Parameters
    ----------
    series : pandas.Series
        Series of raw strings (one sentence / document per element).

    Returns
    -------
    pandas.Series
        The result of ``series.apply`` with the per-sentence cleaning.
    """
    # Sentence-independent resources are built once per call instead of once
    # per sentence (stop words) or four times per word (lemmatizer).
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Translation table deleting every character of string.punctuation
    punctuation_table = str.maketrans('', '', string.punctuation)
    # WordNet part-of-speech tags, applied to each word in this order
    pos_tags = ("v", "n", "a", "r")

    def cleaning_sentence(sentence):
        """ takes a sentence (string) as input and returns
        same string but fully cleaned """

        # Basic cleaning: surrounding whitespace, case, digits
        sentence = sentence.strip().lower()
        sentence = ''.join(char for char in sentence if not char.isdigit())

        # Advanced cleaning: drop punctuation in a single pass
        sentence = sentence.translate(punctuation_table)

        # Tokenize and drop stop words
        tokens = [w for w in word_tokenize(sentence) if w not in stop_words]

        # Lemmatizing: each word goes through every part of speech in turn,
        # the output of one pass feeding the next
        lemmas = []
        for word in tokens:
            for pos in pos_tags:
                word = lemmatizer.lemmatize(word, pos=pos)
            lemmas.append(word)

        return ' '.join(lemmas)

    return series.apply(cleaning_sentence)

In [None]:
# Add custom stopwords
# NOTE: the snippet below is kept as a plain string, so nothing in it is executed.
# `stop_words` is a local variable inside `cleaning`, so it is not reachable from
# this cell; the update calls would have to be moved into that function to take effect.
"""
custom_stop_words = ['custom', 'stop', 'words']
stop_words.update(custom_stop_words)

# Remove specific stopwords
remove_stop_words = ['not', 'no']
stop_words.difference_update(remove_stop_words)
"""

In [102]:
# Smoke test: run the cleaner on a few hand-written sentences and display the result

example = [
    "mY Cats are so lovely in the skies with clouds funiest ever.",
    "i am doing this for the 400 time in row.",
    "cats dogs people are unbelievable",
]

example_series = pd.Series(example)
output = example_series.pipe(cleaning)
output

0    cat lovely sky cloud funiest ever
1                             time row
2          cat dog people unbelievable
dtype: object

# 2. Gridsearching the best vectorizer / model

Several configurations were tested: an n-gram range of (1,1) appears to perform better than (2,2) or (3,3).  
The best `min_df` seems to be 30.


In [113]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
#from sklearn import set_config; set_config("diagram")

# The grid search below is disabled: the whole snippet is wrapped in a string
# literal, so running this cell only echoes the string and creates neither
# `pipeline`, `parameters` nor `grid_search`.
# NOTE(review): inside the snippet `tfidf__min_df` lists 30 twice — if the
# search is re-enabled, that value would be tried twice; confirm the intended grid.
"""
# ngram ((1,1), (2,2), (3,3), (2,3)),
# min_df: 5, 10, 30, 50
# max_df: 0.75, 0.85, 0.95
# nalpha: 0.01, 0.1, 1, 10

# Create Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'tfidf__ngram_range': ((1,1),),
    'tfidf__min_df': (30,30),
    'tfidf__max_df': (0.75,0.8),
    'nb__alpha': (5,10)
}

# Perform grid search on pipeline
grid_search = GridSearchCV(
    pipeline, parameters, n_jobs=-1, 
    verbose=1, scoring = "accuracy", 
    cv=4
)

grid_search.fit(X,y)
""" 
# TODO: update the search so that the custom cleaning function runs before vectorizing

'\n# ngram ((1,1), (2,2), (3,3), (2,3)),\n# min_df: 5, 10, 30, 50\n# max_df: 0.75, 0.85, 0.95\n# nalpha: 0.01, 0.1, 1, 10\n\n# Create Pipeline\npipeline = Pipeline([\n    (\'tfidf\', TfidfVectorizer()),\n    (\'nb\', MultinomialNB()),\n])\n\n# Set parameters to search\nparameters = {\n    \'tfidf__ngram_range\': ((1,1),),\n    \'tfidf__min_df\': (30,30),\n    \'tfidf__max_df\': (0.75,0.8),\n    \'nb__alpha\': (5,10)\n}\n\n# Perform grid search on pipeline\ngrid_search = GridSearchCV(\n    pipeline, parameters, n_jobs=-1, \n    verbose=1, scoring = "accuracy", \n    cv=4\n)\n\ngrid_search.fit(X,y)\n'

In [45]:
# `grid_search` is only created by the grid-search cell above, which is
# currently disabled (its code is wrapped in a string literal). Without this
# guard the cell dies with a NameError on a fresh kernel (Restart & Run All).
if "grid_search" in globals():
    best_model = grid_search.best_estimator_
    best_params = best_model.get_params()
    # Kept for inspection; the score is not part of the printed summary
    best_score = grid_search.best_score_
    print(f"best ngram:{best_params['tfidf__ngram_range']} ; best mindf: {best_params['tfidf__min_df']}; best maxdf: {best_params['tfidf__max_df']} best npalpha: {best_params['nb__alpha']}")
else:
    print("grid_search is not defined: enable and run the grid-search cell above to recompute the best parameters")

best ngram:(1, 1) ; best mindf: 30; best maxdf: 0.75 best npalpha: 5


# 3. Pipeline creation

In [106]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the subsample (X, y) for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [107]:
# Sanity check: training features and labels must have the same length
print(X_train.shape, y_train.shape, sep="\n")

(63564,)
(63564,)


In [109]:
from sklearn.preprocessing import FunctionTransformer

# Wrap the `cleaning` function so it can be used as the first step of the sklearn pipeline
cleaner = FunctionTransformer(cleaning)

In [114]:
# min_df and ngram_range match the values printed by the grid-search summary cell.
# NOTE(review): that summary printed best max_df = 0.75, but 0.8 is used here — confirm which value is intended.
vectorizer = TfidfVectorizer(min_df=30, max_df=0.8, ngram_range=(1, 1))

In [115]:
# alpha=5 matches the best `nb__alpha` printed by the grid-search summary cell
model = MultinomialNB(alpha=5)

In [116]:
from sklearn.pipeline import make_pipeline

# Chain the three stages: text cleaning -> TF-IDF vectorizing -> Naive Bayes classifier
pipeline = make_pipeline(
    cleaner,
    vectorizer,
    model,
)

# Fit the whole chain on the training split
pipeline.fit(X_train, y_train)

In [117]:
from sklearn.metrics import accuracy_score

# Score the fitted pipeline on the held-out split
predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

Accuracy: 0.79493109307155


# 4. Random predictions

In [129]:
# Pick one fixed row of the full dataset and wrap text / label in one-element Series,
# which is the input type the pipeline was fitted on.
sample_row = 600
X_real = pd.Series(data.loc[sample_row, 'text'])
y_real = pd.Series(data.loc[sample_row, 'label'])
print(X_real, y_real, sep="\n")

0    Switched to a septic tank 
dtype: object
0    0
dtype: int64


In [130]:
# Run the fitted pipeline on the single-row Series built above and display the predicted label
predict_example = pipeline.predict(X_real)
predict_example

array([0])

# 5. Uploading

In [140]:
import pickle

# Persist the fitted pipeline next to the notebook
with open("pipeline.pkl", "wb") as pickle_file:
    pickle.dump(pipeline, pickle_file)

In [138]:
import bz2
# Load the bz2-compressed pickle from the shared raw_data directory.
# A relative path replaces the previous machine-specific absolute path so the
# cell also works on other checkouts.
# NOTE(review): assumes the notebook runs from a directory that sits next to
# raw_data/, as the CSV load at the top ("../raw_data/hate-speech.csv") does — confirm.
# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
with bz2.BZ2File("../raw_data/hate_speech", 'rb') as ifile:
    df = pickle.load(ifile)

In [139]:
# Display the unpickled object (the recorded output shows a table with 'text' and 'label' columns)
df

Unnamed: 0,text,label
0,comment like pelosi big part reason people tir...,0
1,caught meth say,0
2,byron york got exactly correct democrat want d...,0
3,wish goal animal right activist people strt re...,0
4,censored civil gestapo hate left shock else on...,0
...,...,...
339005,clearly view someone never served day branch u...,1
339006,never state project recall city ever asked sta...,1
339007,abernathy admire trump attack ag jefferson bea...,1
339008,inept governor puerto rico presidential appoin...,1
