# Getting the data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string

In [3]:
# Read the five labelled corpora from the raw-data folder.
data_hate = pd.read_csv("../raw_data/hate-speech.csv")
data_fake = pd.read_csv("../raw_data/fake-news.csv")
data_gender = pd.read_csv("../raw_data/gender-bias.csv")
data_racial = pd.read_csv("../raw_data/racial-bias.csv")

# The political corpus is stripped of rows with missing values as soon as it is loaded.
data_political = pd.read_csv("../raw_data/political-bias.csv").dropna()

# Cleaning

In [5]:
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def cleaning(series):
    """Clean every text of a pandas Series and return the cleaned Series.

    Each sentence is stripped, lower-cased, purged of digits and of the
    characters in ``string.punctuation``, tokenized, filtered against NLTK's
    English stop-word list and lemmatized successively as verb, noun,
    adjective and adverb.

    Parameters
    ----------
    series : pandas.Series
        Raw texts. Every entry must be a ``str``; other values (e.g. NaN)
        are not handled and will raise on ``.strip()``.

    Returns
    -------
    pandas.Series
        Result of applying the sentence cleaner element-wise; each value is
        the cleaned tokens joined by single spaces.
    """
    # Loop-invariant resources: build them once per call instead of once per
    # sentence (stop words) or once per word and part of speech (lemmatizer).
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    punctuation_table = str.maketrans('', '', string.punctuation)

    def cleaning_sentence(sentence):
        """ takes a sentence (string) as input and returns
        same string but fully cleaned """

        # Basic cleaning: surrounding whitespace, case, digits
        sentence = sentence.strip().lower()
        sentence = ''.join(char for char in sentence if not char.isdigit())

        # Advanced cleaning: drop all punctuation characters in one pass
        sentence = sentence.translate(punctuation_table)

        # Tokenize, then discard stop words
        tokens = [w for w in word_tokenize(sentence) if w not in stop_words]

        # Lemmatizing: each word goes through verb -> noun -> adjective -> adverb
        lemmas = []
        for word in tokens:
            for pos in ("v", "n", "a", "r"):
                word = lemmatizer.lemmatize(word, pos=pos)
            lemmas.append(word)

        return ' '.join(lemmas)

    return series.apply(cleaning_sentence)

In [6]:
from sklearn.preprocessing import FunctionTransformer

# Wrap `cleaning` in a FunctionTransformer so it can be used as a pipeline step.
cleaner = FunctionTransformer(cleaning)

# 1. Hate Speech

***WARNING: this notebook only uses a 25% subset of the hate speech data set***

## 1.1 defining X, y, X_train, y_train, X_test, y_test

In [10]:
from sklearn.model_selection import train_test_split

X_full, y_full = data_hate['text'], data_hate['label']

# Subsample: keep the 25% "train" side of this split as the working set,
# the other 75% is thrown away.
X, _, y, _ = train_test_split(
    X_full, y_full,
    test_size=0.75,
    random_state=42,
)

# Working set -> 75% train / 25% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
)



## 1.2 creating pipeline and fitting pipeline

In [11]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Pipeline: text cleaning -> TF-IDF unigrams -> L1-penalised logistic regression
cleaner = FunctionTransformer(cleaning)
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=30,
    max_df=0.8,
)
model = LogisticRegression(C=1, penalty='l1', solver='liblinear')
pipe_hate = make_pipeline(cleaner, vectorizer, model)

# Fit on the raw training texts (the cleaning happens inside the pipeline)
pipe_hate.fit(X_train, y_train)

## 1.3 defining custom predictions to improve recall

In [12]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_curve

# Cross-validated (3-fold) class probabilities on the training split;
# transposing gives one array per class column.
y_pred_probas_0, y_pred_probas_1 = cross_val_predict(
    pipe_hate, X_train, y_train, cv=3, method="predict_proba"
).T

# Precision / recall at every candidate threshold on the class-1 probability
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_probas_1)

# Drop the last recall value so that recall and thresholds have the same length
df_recall = pd.DataFrame({"recall": recall[:-1], "threshold": thresholds})

# Highest threshold whose recall is still at least 0.9
new_threshold = df_recall.loc[df_recall['recall'] >= 0.9, 'threshold'].max()

new_threshold

0.310774856127122

In [13]:
def custom_predict(X, custom_threshold):
    """Predict the positive class with `pipe_hate` using a custom threshold.

    Parameters
    ----------
    X : input accepted by `pipe_hate.predict_proba`
    custom_threshold : float
        Threshold applied to the class-1 probability.

    Returns
    -------
    Boolean array, True where the class-1 probability reaches the threshold.
    """
    probs = pipe_hate.predict_proba(X)  # one probability column per class
    positive_probs = probs[:, 1]  # only keep probabilities of class [1]
    # Inclusive comparison: precision_recall_curve reports recall for
    # `score >= threshold`; a strict `>` would deliver the next lower recall
    # step instead of the one the threshold was selected for.
    return (positive_probs >= custom_threshold)

## 1.4  Evaluating the model

In [14]:
# Thresholded predictions of the hate-speech pipeline on the held-out test texts.
custom_predictions = custom_predict(X_test, new_threshold)

In [15]:
# Score the thresholded predictions against the held-out labels.
# Note: `recall` and `precision` rebind the names that previously held the
# precision_recall_curve arrays; from here on they are scalar test scores.
accuracy = accuracy_score(y_test, custom_predictions)
f1 = f1_score(y_test, custom_predictions)
recall = recall_score(y_test, custom_predictions)
precision = precision_score(y_test, custom_predictions)

# Single-line summary of the four test metrics
print(f"Accuracy: {accuracy}", f"f1: {f1}", f"Recall: {recall}", f"Precision: {precision}")

Accuracy: 0.8557202189918822 f1: 0.8622786863089607 Recall: 0.9045368620037807 Precision: 0.8237927175690798


# 2. Fake News

### 2.1 cleaning X, y, X_train, y_train, X_test, y_test

In [117]:
# Features and target of the fake-news corpus
X, y = data_fake['text'], data_fake['label']

# 75% train / 25% test, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
)

In [118]:
# Clean both splits up front: this pipeline has no cleaning step of its own
X_train_cleaned, X_test_cleaned = cleaning(X_train), cleaning(X_test)

### 2.2 creating pipeline and fitting pipeline

In [119]:
# TF-IDF unigrams feeding a liblinear logistic regression (C=0.1)
vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=10, max_df=0.7)
model = LogisticRegression(C=0.1, solver='liblinear')
pipe_fake = make_pipeline(vectorizer, model)

# Train on the pre-cleaned training texts
pipe_fake.fit(X_train_cleaned, y_train)

### 2.3 defining custom predictions to improve recall

In [120]:
# Custom threshold selection

# Cross-validated (3-fold) class probabilities on the cleaned training split;
# transposing gives one array per class column.
y_pred_probas_0, y_pred_probas_1 = cross_val_predict(
    pipe_fake, X_train_cleaned, y_train, cv=3, method="predict_proba"
).T

# Precision / recall at every candidate threshold on the class-1 probability
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_probas_1)

# Drop the last recall value so that recall and thresholds have the same length
df_recall = pd.DataFrame({"recall": recall[:-1], "threshold": thresholds})

# Highest threshold whose recall is still at least 0.7
new_threshold = df_recall.loc[df_recall['recall'] >= 0.7, 'threshold'].max()

new_threshold

0.490270718481461

In [121]:
def custom_predict(X, custom_threshold):
    """Predict the positive class with `pipe_fake` using a custom threshold.

    Parameters
    ----------
    X : input accepted by `pipe_fake.predict_proba`
    custom_threshold : float
        Threshold applied to the class-1 probability.

    Returns
    -------
    Boolean array, True where the class-1 probability reaches the threshold.
    """
    probs = pipe_fake.predict_proba(X)  # one probability column per class
    positive_probs = probs[:, 1]  # only keep probabilities of class [1]
    # Inclusive comparison: precision_recall_curve reports recall for
    # `score >= threshold`; a strict `>` would deliver the next lower recall
    # step instead of the one the threshold was selected for.
    return (positive_probs >= custom_threshold)

### 2.4  Evaluating the pipe

In [122]:
# Thresholded predictions of the fake-news pipeline on the cleaned test texts.
custom_predictions = custom_predict(X_test_cleaned, new_threshold)

In [123]:
# Score the thresholded predictions against the held-out labels.
# Note: `recall` and `precision` rebind the names that previously held the
# precision_recall_curve arrays; from here on they are scalar test scores.
accuracy = accuracy_score(y_test, custom_predictions)
f1 = f1_score(y_test, custom_predictions)
recall = recall_score(y_test, custom_predictions)
precision = precision_score(y_test, custom_predictions)

# Single-line summary of the four test metrics
print(f"Accuracy: {accuracy}", f"f1: {f1}", f"Recall: {recall}", f"Precision: {precision}")

Accuracy: 0.6329588014981273 f1: 0.6611927398444253 Recall: 0.7176360225140713 Precision: 0.6129807692307693


### 2.5 pickle the pipe

In [124]:
import pickle

# Serialise the fitted fake-news pipeline next to the notebook
with open('pipe-fake.pkl', 'wb') as f:
    pickle.dump(obj=pipe_fake, file=f)

# 3. Racial

### 3.1 cleaning X, y, X_train, y_train, X_test, y_test

In [125]:
# Features and target of the racial-bias corpus
X, y = data_racial['text'], data_racial['label']

# 75% train / 25% test, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
)

# Clean both splits up front: this pipeline has no cleaning step of its own
X_train_cleaned, X_test_cleaned = cleaning(X_train), cleaning(X_test)


### 3.2 creating pipeline and fitting pipeline

In [126]:
# TF-IDF unigrams feeding a liblinear logistic regression (C=0.1)
vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=10, max_df=0.7)
model = LogisticRegression(C=0.1, solver='liblinear')
pipe_racial = make_pipeline(vectorizer, model)

# Train on the pre-cleaned training texts
pipe_racial.fit(X_train_cleaned, y_train)

### 3.3 defining custom predictions to improve recall

In [127]:
# Custom threshold selection

# Cross-validated (4-fold) class probabilities on the cleaned training split;
# transposing gives one array per class column.
y_pred_probas_0, y_pred_probas_1 = cross_val_predict(
    pipe_racial, X_train_cleaned, y_train, cv=4, method="predict_proba"
).T

# Precision / recall at every candidate threshold on the class-1 probability
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_probas_1)

# Drop the last recall value so that recall and thresholds have the same length
df_recall = pd.DataFrame({"recall": recall[:-1], "threshold": thresholds})

# Highest threshold whose recall is still at least 0.75
new_threshold = df_recall.loc[df_recall['recall'] >= 0.75, 'threshold'].max()

new_threshold

0.49265919130920033

In [128]:
def custom_predict(X, custom_threshold):
    """Predict the positive class with `pipe_racial` using a custom threshold.

    Parameters
    ----------
    X : input accepted by `pipe_racial.predict_proba`
    custom_threshold : float
        Threshold applied to the class-1 probability.

    Returns
    -------
    Boolean array, True where the class-1 probability reaches the threshold.
    """
    probs = pipe_racial.predict_proba(X)  # one probability column per class
    positive_probs = probs[:, 1]  # only keep probabilities of class [1]
    # Inclusive comparison: precision_recall_curve reports recall for
    # `score >= threshold`; a strict `>` would deliver the next lower recall
    # step instead of the one the threshold was selected for.
    return (positive_probs >= custom_threshold)

### 3.4  Evaluating the model

In [129]:
# Thresholded predictions of the racial-bias pipeline on the cleaned test texts.
custom_predictions = custom_predict(X_test_cleaned, new_threshold)

In [130]:
# Score the thresholded predictions against the held-out labels.
# Note: `recall` and `precision` rebind the names that previously held the
# precision_recall_curve arrays; from here on they are scalar test scores.
accuracy = accuracy_score(y_test, custom_predictions)
f1 = f1_score(y_test, custom_predictions)
recall = recall_score(y_test, custom_predictions)
precision = precision_score(y_test, custom_predictions)

# Single-line summary of the four test metrics
print(f"Accuracy: {accuracy}", f"f1: {f1}", f"Recall: {recall}", f"Precision: {precision}")

Accuracy: 0.7057621577441765 f1: 0.7169811320754716 Recall: 0.7689713322091062 Precision: 0.6715758468335787


### 3.5 pickle the pipe

In [131]:
# Serialise the fitted racial-bias pipeline next to the notebook
with open('pipe-racial.pkl', 'wb') as f_racial:
    pickle.dump(obj=pipe_racial, file=f_racial)

# 4. Gender

### 4.1 cleaning X, y, X_train, y_train, X_test, y_test

In [132]:
# Features and target of the gender-bias corpus
X, y = data_gender['text'], data_gender['label']

# 75% train / 25% test, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
)

# Clean both splits up front: this pipeline has no cleaning step of its own
X_train_cleaned, X_test_cleaned = cleaning(X_train), cleaning(X_test)

### 4.2 creating pipeline and fitting pipeline

In [133]:
# TF-IDF unigrams feeding a liblinear logistic regression (C=0.1)
vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=10, max_df=0.7)
model = LogisticRegression(C=0.1, solver='liblinear')
pipe_gender = make_pipeline(vectorizer, model)

# Train on the pre-cleaned training texts
pipe_gender.fit(X_train_cleaned, y_train)

### 4.3 defining custom predictions to improve recall

In [139]:
# Custom threshold selection

# Cross-validated (4-fold) class probabilities on the cleaned training split;
# transposing gives one array per class column.
y_pred_probas_0, y_pred_probas_1 = cross_val_predict(
    pipe_gender, X_train_cleaned, y_train, cv=4, method="predict_proba"
).T

# Precision / recall at every candidate threshold on the class-1 probability
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_probas_1)

# Drop the last recall value so that recall and thresholds have the same length
df_recall = pd.DataFrame({"recall": recall[:-1], "threshold": thresholds})

# Highest threshold whose recall is still at least 0.85
new_threshold = df_recall.loc[df_recall['recall'] >= 0.85, 'threshold'].max()

new_threshold

0.5226123758498951

In [141]:
def custom_predict(X, custom_threshold):
    """Predict the positive class with `pipe_gender` using a custom threshold.

    Parameters
    ----------
    X : input accepted by `pipe_gender.predict_proba`
    custom_threshold : float
        Threshold applied to the class-1 probability.

    Returns
    -------
    Boolean array, True where the class-1 probability reaches the threshold.
    """
    probs = pipe_gender.predict_proba(X)  # one probability column per class
    positive_probs = probs[:, 1]  # only keep probabilities of class [1]
    # Inclusive comparison: precision_recall_curve reports recall for
    # `score >= threshold`; a strict `>` would deliver the next lower recall
    # step instead of the one the threshold was selected for.
    return (positive_probs >= custom_threshold)

### 4.4  Evaluating the model

In [142]:
# Thresholded predictions of the gender-bias pipeline on the cleaned test texts.
custom_predictions = custom_predict(X_test_cleaned, new_threshold)

In [143]:
# Score the thresholded predictions against the held-out labels.
# Note: `recall` and `precision` rebind the names that previously held the
# precision_recall_curve arrays; from here on they are scalar test scores.
accuracy = accuracy_score(y_test, custom_predictions)
f1 = f1_score(y_test, custom_predictions)
recall = recall_score(y_test, custom_predictions)
precision = precision_score(y_test, custom_predictions)

# Single-line summary of the four test metrics
print(f"Accuracy: {accuracy}", f"f1: {f1}", f"Recall: {recall}", f"Precision: {precision}")

Accuracy: 0.8 f1: 0.8103193064072743 Recall: 0.849290780141844 Precision: 0.774767488879903


### 4.5 pickle the pipe

In [145]:
# Persist the fitted gender-bias pipeline. The object dumped must be
# `pipe_gender`: dumping `pipe_racial` here would silently ship the racial
# model under the gender file name.
with open('pipe-gender.pkl', 'wb') as f_gender:
    pickle.dump(pipe_gender, f_gender)

# 5. Political

### 5.1 cleaning X, y, X_train, y_train, X_test, y_test

In [146]:
# Drop rows with missing values (a no-op if they were already removed at load time)
data_political = data_political.dropna()

# Features and target of the political-bias corpus
X, y = data_political['text'], data_political['label']

# 75% train / 25% test, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
)

# Clean both splits up front: this pipeline has no cleaning step of its own
X_train_cleaned, X_test_cleaned = cleaning(X_train), cleaning(X_test)

### 5.2 creating pipeline and fitting pipeline

In [147]:
# TF-IDF unigrams feeding a liblinear logistic regression (C=0.1)
vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=10, max_df=0.7)
model = LogisticRegression(C=0.1, solver='liblinear')
pipe_political = make_pipeline(vectorizer, model)

# Train on the pre-cleaned training texts
pipe_political.fit(X_train_cleaned, y_train)

### 5.3 defining custom predictions to improve recall

In [148]:
# Custom threshold selection

# Cross-validated (4-fold) class probabilities on the cleaned training split;
# transposing gives one array per class column.
y_pred_probas_0, y_pred_probas_1 = cross_val_predict(
    pipe_political, X_train_cleaned, y_train, cv=4, method="predict_proba"
).T

# Precision / recall at every candidate threshold on the class-1 probability
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_probas_1)

# Drop the last recall value so that recall and thresholds have the same length
df_recall = pd.DataFrame({"recall": recall[:-1], "threshold": thresholds})

# Highest threshold whose recall is still at least 0.7
new_threshold = df_recall.loc[df_recall['recall'] >= 0.7, 'threshold'].max()

new_threshold

0.4959960254912499

In [149]:
def custom_predict(X, custom_threshold):
    """Predict the positive class with `pipe_political` using a custom threshold.

    Parameters
    ----------
    X : input accepted by `pipe_political.predict_proba`
    custom_threshold : float
        Threshold applied to the class-1 probability.

    Returns
    -------
    Boolean array, True where the class-1 probability reaches the threshold.
    """
    probs = pipe_political.predict_proba(X)  # one probability column per class
    positive_probs = probs[:, 1]  # only keep probabilities of class [1]
    # Inclusive comparison: precision_recall_curve reports recall for
    # `score >= threshold`; a strict `>` would deliver the next lower recall
    # step instead of the one the threshold was selected for.
    return (positive_probs >= custom_threshold)

### 5.4  Evaluating the model

In [150]:
# Thresholded predictions of the political-bias pipeline on the cleaned test texts.
custom_predictions = custom_predict(X_test_cleaned, new_threshold)

In [151]:
# Score the thresholded predictions against the held-out labels.
# Note: `recall` and `precision` rebind the names that previously held the
# precision_recall_curve arrays; from here on they are scalar test scores.
accuracy = accuracy_score(y_test, custom_predictions)
f1 = f1_score(y_test, custom_predictions)
recall = recall_score(y_test, custom_predictions)
precision = precision_score(y_test, custom_predictions)

# Single-line summary of the four test metrics
print(f"Accuracy: {accuracy}", f"f1: {f1}", f"Recall: {recall}", f"Precision: {precision}")

Accuracy: 0.6863985539990962 f1: 0.6897630755476084 Recall: 0.6897630755476084 Precision: 0.6897630755476084


### 5.5 pickle the pipe

In [152]:
# Serialise the fitted political-bias pipeline next to the notebook
with open('pipe-political.pkl', 'wb') as f_political:
    pickle.dump(obj=pipe_political, file=f_political)