In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string

In [67]:
# Load every raw CSV into its own DataFrame (paths are relative to the notebook).
data_hate = pd.read_csv("../raw_data/hate-speech.csv")
data_linguistic = pd.read_csv("../raw_data/linguistic-bias.csv")
data_cognitive = pd.read_csv("../raw_data/cognitive-bias.csv")
data_fake = pd.read_csv("../raw_data/fake-news.csv")
data_gender = pd.read_csv("../raw_data/gender-bias.csv")
data_racial = pd.read_csv("../raw_data/racial-bias.csv")
data_textlevel = pd.read_csv("../raw_data/text-level-bias.csv")

# The political-bias frame is the only one whose rows with missing values are dropped.
data_political = pd.read_csv("../raw_data/political-bias.csv").dropna()

In [68]:
# Registry of every loaded dataset, keyed by a short display name.
# Insertion order is the order in which later loops visit the datasets.
dict_ds = dict([
    ('ds-hate', data_hate),
    ('ds-cognitive', data_cognitive),
    ('ds-fake', data_fake),
    ('ds-political', data_political),
    ('ds-gender', data_gender),
    ('ds-linguistic', data_linguistic),
    ('ds-text', data_textlevel),
    ('ds-racial', data_racial),
])

In [69]:
# Report the number of rows of each registered dataset.
for ds_name, ds in dict_ds.items():
    n_rows = len(ds)
    print(f"{ds_name} : {n_rows} rows")

ds-hate : 339010 rows
ds-cognitive : 7092 rows
ds-fake : 8542 rows
ds-political : 17703 rows
ds-gender : 17940 rows
ds-linguistic : 401862 rows
ds-text : 9018 rows
ds-racial : 9788 rows


In [70]:
# Subset of the registry holding the six smaller datasets, in registry order.
small_ds_names = (
    'ds-cognitive',
    'ds-fake',
    'ds-political',
    'ds-gender',
    'ds-text',
    'ds-racial',
)
dict_ds_small = {name: dict_ds[name] for name in small_ds_names}

In [75]:
# Subset of the registry holding the two datasets with several hundred thousand rows.
large_ds_names = ('ds-hate', 'ds-linguistic')
dict_ds_large = {name: dict_ds[name] for name in large_ds_names}

# 1. Cleaning

In [71]:
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def cleaning(series):
    """Clean every sentence held in ``series``.

    Each element is stripped, lower-cased, purged of digit characters and of
    the characters in ``string.punctuation``, tokenized with NLTK's
    ``word_tokenize``, filtered against the English stop-word list and then
    lemmatized successively as verb, noun, adjective and adverb. The
    surviving lemmas are joined back with single spaces.

    Args:
        series: object exposing ``.apply`` whose elements are strings
            (the notebook passes the ``text`` column of each dataset).

    Returns:
        Whatever ``series.apply`` returns for the per-sentence cleaner, i.e.
        the same container with every sentence replaced by its cleaned form.

    Raises:
        AttributeError: if an element is not a string (e.g. a missing value),
            because ``strip`` is called on it directly.
    """
    # Loop-invariant resources are built once per call instead of once per
    # sentence (stop words) or once per word and part of speech (lemmatizer).
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    punctuation_table = str.maketrans('', '', string.punctuation)
    pos_order = ("v", "n", "a", "r")  # verb, noun, adjective, adverb

    def cleaning_sentence(sentence):
        """Return ``sentence`` (a string) fully cleaned, as a string."""
        # Basic cleaning: surrounding whitespace, case, digits
        sentence = sentence.strip().lower()
        sentence = ''.join(char for char in sentence if not char.isdigit())

        # Remove punctuation in a single pass
        sentence = sentence.translate(punctuation_table)

        # Tokenize and drop stop words
        tokens = [w for w in word_tokenize(sentence) if w not in stop_words]

        # Lemmatize each token under every part of speech, feeding the result
        # of one pass into the next (same chain as verb -> noun -> adj -> adv).
        lemmas = []
        for word in tokens:
            for pos in pos_order:
                word = lemmatizer.lemmatize(word, pos=pos)
            lemmas.append(word)

        return ' '.join(lemmas)

    return series.apply(cleaning_sentence)

# 2. Logistic Regression

## 2.1 Small datasets

In [98]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import FunctionTransformer

# Accuracy (in percent, one decimal) of the logistic regression per small dataset.
dict_perf = {}

for ds_name, ds in dict_ds_small.items():

    # features are the raw texts, targets are the labels
    X, y = ds['text'], ds['label']

    # hold out a quarter of the rows for evaluation, with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )

    # cleaning -> TF-IDF -> logistic regression, fitted on the training part
    cleaner = FunctionTransformer(cleaning)
    vectorizer = TfidfVectorizer(min_df=10, max_df=0.7, ngram_range=(1, 1))
    model = LogisticRegression(solver='liblinear', C=0.1)
    pipeline = make_pipeline(cleaner, vectorizer, model)
    pipeline.fit(X_train, y_train)

    # score the fitted pipeline on the held-out part
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    accuracy_pct = round(accuracy * 100, 1)

    dict_perf[ds_name] = accuracy_pct

    print(f"{ds_name}: accuracy of {accuracy_pct}%")

print(dict_perf)


ds-cognitive: accuracy of 58.4%
ds-fake: accuracy of 62.8%
ds-political: accuracy of 68.4%
ds-gender: accuracy of 80.2%
ds-text: accuracy of 69.0%
ds-racial: accuracy of 70.5%
{'ds-cognitive': 58.4, 'ds-fake': 62.8, 'ds-political': 68.4, 'ds-gender': 80.2, 'ds-text': 69.0, 'ds-racial': 70.5}


## 2.2 Large datasets

In [95]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Accuracy (in percent, one decimal) of the logistic regression per large dataset.
dict_perf_large = {}


for ds_name, ds in dict_ds_large.items():
    
    # defining X and y and taking a subset of the full dataset:
    # test_size=0.75 discards three quarters, so a quarter of the rows is kept
    X_full = ds['text']
    y_full = ds['label']
    X, _, y, _ = train_test_split(X_full, y_full, test_size=0.75, random_state=42)
    
    # train test split (a quarter of the kept rows is held out for evaluation)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    
    # make a pipeline and fit it: cleaning -> TF-IDF -> L1 logistic regression
    cleaner = FunctionTransformer(cleaning)
    vectorizer = TfidfVectorizer(ngram_range=(1,1), min_df=30, max_df=0.8)
    model = LogisticRegression(solver='liblinear', penalty='l1', C=1)
    pipeline = make_pipeline(cleaner, vectorizer, model)
    pipeline.fit(X_train, y_train)
    
    # evaluate the pipeline
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    
    # store in the dict created for this cell; writing to dict_perf would leave
    # dict_perf_large empty and mix large results into the small-dataset scores
    dict_perf_large[ds_name] = round(accuracy*100,1)
    
    print(f"{ds_name}: accuracy of {round(accuracy*100,1)}%")

ds-hate: accuracy of 85.3%
ds-linguistic: accuracy of 62.0%


# 3. Naive Bayes

## 3.1 small datasets

In [96]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import MultinomialNB

for ds_name, ds in dict_ds_small.items():

    # features are the raw texts, targets are the labels
    X, y = ds['text'], ds['label']

    # hold out a quarter of the rows for evaluation, with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )

    # cleaning -> TF-IDF -> multinomial naive Bayes, fitted on the training part
    cleaner = FunctionTransformer(cleaning)
    vectorizer = TfidfVectorizer(min_df=10, max_df=0.7, ngram_range=(1, 1))
    model = MultinomialNB(alpha=10)
    pipeline = make_pipeline(cleaner, vectorizer, model)
    pipeline.fit(X_train, y_train)

    # score the fitted pipeline on the held-out part
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    accuracy_pct = round(accuracy * 100, 1)

    print(f"{ds_name}: accuracy of {accuracy_pct}%")

ds-cognitive: accuracy of 58.0%
ds-fake: accuracy of 62.7%
ds-political: accuracy of 70.0%
ds-gender: accuracy of 78.1%
ds-text: accuracy of 66.8%
ds-racial: accuracy of 70.6%


## 3.2 large datasets

In [99]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

for ds_name, ds in dict_ds_large.items():

    # full texts and labels of the dataset
    X_full, y_full = ds['text'], ds['label']

    # keep a quarter of the rows: test_size=0.75 is the part that is discarded
    X, _, y, _ = train_test_split(
        X_full, y_full, test_size=0.75, random_state=42
    )

    # hold out a quarter of the kept rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )

    # cleaning -> TF-IDF -> multinomial naive Bayes, fitted on the training part
    cleaner = FunctionTransformer(cleaning)
    vectorizer = TfidfVectorizer(ngram_range=(1,1), min_df=30, max_df=0.8)
    model = MultinomialNB(alpha=10)
    pipeline = make_pipeline(cleaner, vectorizer, model)
    pipeline.fit(X_train, y_train)

    # score the fitted pipeline on the held-out part
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    accuracy_pct = round(accuracy * 100, 1)

    print(f"{ds_name}: accuracy of {accuracy_pct}%")

ds-hate: accuracy of 79.5%
ds-linguistic: accuracy of 58.4%
