In [54]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE




In [49]:
# Load cleaned dataset
df = pd.read_csv('cleaned_dataset.csv')



In [50]:
# Define a threshold for classifying gender bias based on scores or frequencies
# For example, you can use mean, median, or any other criteria based on your analysis
# Here, we'll assume a simple threshold for demonstration purposes

# Calculate mean or median of communal and agentic scores/frequencies
communal_mean = df['communal_freq'].mean()
agentic_mean = df['agentic_freq'].mean()

# Define a function to classify gender bias
def classify_gender_bias(row):
    # You can adjust this condition based on your analysis
    if row['communal_freq'] > communal_mean and row['agentic_freq'] > agentic_mean:
        return 1  # Indicates gender bias present
    else:
        return 0  # Indicates no gender bias or less bias

# Apply the function to create the gender_bias_label column
df['gender_bias_label'] = df.apply(classify_gender_bias, axis=1)

# If using frequencies instead of scores, adjust the condition accordingly:
# def classify_gender_bias(row):
#     if row['communal_freq'] > communal_mean and row['agentic_freq'] > agentic_mean:
#         return 1
#     else:
#         return 0

# Optionally, you can use different thresholds or more complex criteria based on your analysis.
# This simple example assumes that higher than mean scores indicate bias.


In [55]:
# Assuming X and y are defined as per your dataset structure
X = df[['communal_freq', 'agentic_freq', 'seniority_level', 'sentiment', 'flesch_reading_ease']]
y = df['gender_bias_label']

# Define preprocessing steps including handling categorical variables
column_transformer = ColumnTransformer([
    ('onehot', OneHotEncoder(), ['seniority_level'])  # Encode 'seniority_level'
], remainder='passthrough')

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to resample the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define pipeline with preprocessing and classifier
pipeline = Pipeline([
    ('preprocess', column_transformer),
    ('classifier', LogisticRegression())  # Example classifier, replace with your choice
])

# Define parameters for grid search
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__penalty': ['l1', 'l2']
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

# Evaluate best model on test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

ValueError: could not convert string to float: 'mid'

In [53]:

# Assuming X and y are defined as per your dataset structure
X = df[['communal_freq', 'agentic_freq', 'seniority_level', 'sentiment', 'flesch_reading_ease']]
y = df['gender_bias_label']

# Define preprocessing steps including handling categorical variables
column_transformer = ColumnTransformer([
    ('onehot', OneHotEncoder(), ['seniority_level'])  # Encode 'seniority_level'
], remainder='passthrough')

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define pipeline with preprocessing and classifier
pipeline = Pipeline([
    ('preprocess', column_transformer),
    ('smote', SMOTE(random_state=42)),  # Apply SMOTE after preprocessing
    ('classifier', LogisticRegression())  # Example classifier, replace with your choice
])

# Define parameters for grid search
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__penalty': ['l1', 'l2']
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate best model on test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 6 candidates, totalling 30 fits


ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/ds-venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/ds-venv/lib/python3.11/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/ds-venv/lib/python3.11/site-packages/sklearn/pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/ds-venv/lib/python3.11/site-packages/sklearn/pipeline.py", line 388, in _fit
    self._validate_steps()
  File "/opt/anaconda3/envs/ds-venv/lib/python3.11/site-packages/sklearn/pipeline.py", line 258, in _validate_steps
    raise TypeError(
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'SMOTE(random_state=42)' (type <class 'imblearn.over_sampling._smote.base.SMOTE'>) doesn't


In [None]:
# Select features and target
X = df[['communal_score', 'agentic_score', 'seniority_level', 'sentiment', 'flesch_reading_ease']]
y = df['gender_bias_label']  # Assuming you have a label for gender bias (binary classification)

# Handle categorical variables (seniority_level)
column_transformer = ColumnTransformer([
    ('onehot', OneHotEncoder(), ['seniority_level'])
], remainder='passthrough')

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define pipeline for preprocessing and model training
pipeline = Pipeline([
    ('preprocess', column_transformer),
    ('classifier', LogisticRegression())  # Example classifier, replace with your choice
])

# Define parameters for grid search (example)
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__penalty': ['l1', 'l2']
}


# Assuming X_train and y_train are your training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define your pipeline with Logistic Regression (example)
pipeline = Pipeline([
    ('classifier', LogisticRegression())
])

# Define parameter grid for GridSearchCV (example)
param_grid = {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__solver': ['liblinear', 'lbfgs']
}

# Perform grid search with resampled data
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

# Evaluate best model on test data
best_model = grid_search.best_estimator_

In [None]:

# Perform grid search to find best parameters
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate best model on test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Print classification report and confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


In [None]:

# Save the best model for deployment
import joblib
joblib.dump(best_model, 'gender_bias_detection_model.pkl')

# Example of how to load the model later
# loaded_model = joblib.load('gender_bias_detection_model.pkl')