In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string

In [9]:
data_racial = pd.read_csv("../raw_data/racial-bias.csv")

In [10]:
from sklearn.model_selection import train_test_split

X = data_racial['text']
y = data_racial['label']
    
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [11]:
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def cleaning(series):
    def cleaning_sentence(sentence):
        """ takes a sentence (string) as input and returns
        same string but fully cleaned """
        
        # Basic cleaning
        sentence = sentence.strip() ## remove whitespaces
        sentence = sentence.lower() ## lowercase 
        sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers
        
        # Advanced cleaning
        for punctuation in string.punctuation:
            sentence = sentence.replace(punctuation, '') ## remove punctuation
        
        tokenized_sentence = word_tokenize(sentence) ## tokenize 
        
        stop_words = set(stopwords.words('english')) ## define stopwords
        
        tokenized_sentence_cleaned = [ ## remove stopwords
            w for w in tokenized_sentence if not w in stop_words
        ]
        
        # Lemmatizing
        lemmatized_verbs = [WordNetLemmatizer().lemmatize(word, pos = "v") for word in tokenized_sentence_cleaned]
        lemmatized_nouns = [WordNetLemmatizer().lemmatize(word, pos = "n") for word in lemmatized_verbs]
        lemmatized_adj = [WordNetLemmatizer().lemmatize(word, pos = "a") for word in lemmatized_nouns]
        lemmatized_adv = [WordNetLemmatizer().lemmatize(word, pos = "r") for word in lemmatized_adj]
        
        cleaned_sentence = ' '.join(word for word in lemmatized_adv)
        
        return cleaned_sentence
    
    return series.apply(cleaning_sentence)

In [12]:
from sklearn.preprocessing import FunctionTransformer
cleaner = FunctionTransformer(cleaning)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=10, max_df=0.7, ngram_range=(1, 1))

In [15]:
from sklearn.ensemble import VotingClassifier

class CustomVotingClassifier(VotingClassifier):
    def predict(self, X):
        # Get the individual model predictions
        predictions = super().predict(X)

        # Check if any model predicts 1
        if any(prediction == 1 for prediction in predictions):
            # Set all predictions to 1
            predictions = [1] * len(predictions)

        return predictions

In [28]:
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Define the base classification models
gboost = GradientBoostingClassifier(n_estimators=100)
logreg = LogisticRegression(solver='liblinear', C=0.1)
adaboost = AdaBoostClassifier()

# Create the ensemble classifier
model = VotingClassifier(
    estimators=[("gboost", gboost), ("adaboost", adaboost), ("logreg", logreg)],
    voting='soft',  # Use soft voting for probabilistic classification models
    weights=[1, 1, 1],  # Equal weights for all models
    n_jobs=-1
)

# Create the pipeline
pipe_ensemble = make_pipeline(cleaner, vectorizer, model)

# Fit the pipeline to the training data
pipe_ensemble.fit(X_train, y_train)

# Make predictions on the test data
probabilities = pipe_ensemble.predict_proba(X_test)
threshold = 0.55  # Adjust the threshold as needed
class_predictions = (probabilities[:, 1] >= threshold).astype(int)

# Evaluate the performance of the ensemble model
accuracy = accuracy_score(y_test, class_predictions)
recall = recall_score(y_test, class_predictions)
precision = precision_score(y_test, class_predictions)


In [29]:
print(accuracy)
print(recall)
print(precision)

0.6885982836125868
0.5059021922428331
0.7731958762886598


In [22]:
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier

# Define the base classification models
gboost = GradientBoostingClassifier(n_estimators=100)
logreg = LogisticRegression(solver='liblinear',C=0.1)
adaboost = AdaBoostClassifier()
randomforest = RandomForestClassifier(n_estimators=100, random_state=42)

# Create the ensemble classifier
model = VotingClassifier(
    estimators=[("gboost", gboost), ("adaboost", adaboost),("logreg", logreg),("randomforest",randomforest)],
    voting='soft',  # Use soft voting for probabilistic classification models
    weights=[1, 1, 1],  # Equal weights for all models
    n_jobs=-1
)

# Create the pipeline
pipe_ensemble = make_pipeline(cleaner, vectorizer, model)

# Fit the pipeline to the training data
pipe_ensemble.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipe_ensemble.predict(X_test)

# Evaluate the accuracy of the ensemble model
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred )
precision= precision_score(y_test, y_pred)


In [23]:
print(accuracy)
print(recall)
print(precision)

0.4846751123825092
1.0
0.4846751123825092


In [None]:
print(accuracy)
print(recall)
print(precision)

In [41]:
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Define the base classification models
gboost = GradientBoostingClassifier(n_estimators=100)
logreg = LogisticRegression(solver='liblinear', C=0.1)
adaboost = AdaBoostClassifier()
randomforest = RandomForestClassifier(n_estimators=100, random_state=42)

# Create the ensemble classifier with customized voting
class CustomVotingClassifier(VotingClassifier):
    def predict(self, X):
        probabilities = self.predict_proba(X)
        class_predictions = []
        for probas in probabilities:
            if (probas >= 0.55).sum() >= 1:  # If at least two models predict 1
                class_predictions.append(1)
            else:
                class_predictions.append(0)
        return class_predictions

# Create the customized ensemble classifier
model = CustomVotingClassifier(
    estimators=[("gboost", gboost), ("adaboost", adaboost), ("logreg", logreg), ("randomforest", randomforest)],
    voting='soft',  # Use soft voting for probabilistic classification models
    weights=[1, 1, 1, 1],  # Equal weights for all models
    n_jobs=-1
)

# Create the pipeline
pipe_ensemble = make_pipeline(cleaner, vectorizer, model)

# Fit the pipeline to the training data
pipe_ensemble.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipe_ensemble.predict(X_test)

# Evaluate the accuracy of the ensemble model
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)


In [42]:
print(accuracy)
print(recall)
print(precision)

0.5026563138536984
0.7495784148397976
0.49143173023770037
