In [25]:
# Import necessary libraries
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

In [26]:
# Sample DataFrame and target for illustration (replace with your actual data)
X_train = pd.read_csv('model_data.csv')
y_train = X_train.pop('Gender_Bias')  # Assuming the target column is separated


In [45]:
numerical_features = [
    'Rating', 'Founded', 'hourly', 'employer_provided', 'min_salary', 
    'max_salary', 'avg_salary', 'same_state', 'age', 'python_yn', 
    'R_yn', 'spark', 'aws', 'excel', 'desc_len', 'num_comp', 
    'Agentic_Count', 'Communal_Count', 'Gendered_Ratio', 
    'job_state_encoded', 'headquarters_state_encoded', 'Type of ownership_encoded',
    'Industry_encoded', 'Sector_encoded', 'job_simp_encoded', 
    'seniority_encoded', 'num_comp_encoded'
]

categorical_features = [
    'job_state_encoded', 'headquarters_state_encoded', 'Type of ownership_encoded', 'Industry_encoded',
    'Sector_encoded', 'job_simp_encoded', 'seniority_encoded', 'num_comp_encoded'
]


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Define important features
important_features = [
    'desc_len', 'max_salary', 'avg_salary', 'Gendered_Ratio', 'min_salary',
    'Communal_Count', 'Agentic_Count', 'Founded', 'age', 'Rating',
    'job_state_encoded', 'Industry_encoded', 'headquarters_state_encoded',
    'Sector_encoded', 'job_simp_encoded', 'Type of ownership_encoded'
]

# Define numerical and categorical features
numerical_features = [
    'Rating', 'Founded', 'hourly', 'employer_provided', 'min_salary', 
    'max_salary', 'avg_salary', 'same_state', 'age', 'python_yn', 
    'R_yn', 'spark', 'aws', 'excel', 'desc_len', 'num_comp', 
    'Agentic_Count', 'Communal_Count', 'Gendered_Ratio', 
    'job_state_encoded', 'headquarters_state_encoded', 'Type of ownership_encoded',
    'Industry_encoded', 'Sector_encoded', 'job_simp_encoded', 
    'seniority_encoded', 'num_comp_encoded'
]

categorical_features = [
    'job_state_encoded', 'headquarters_state_encoded', 'Type of ownership_encoded',
    'Industry_encoded', 'Sector_encoded', 'job_simp_encoded', 'seniority_encoded', 
    'num_comp_encoded'
]

# Define preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define SMOTE
smote = SMOTE(random_state=42)

# Define pipeline with SMOTE
pipeline_smote = imPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', RandomForestClassifier(max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100))
])

# Fit pipeline with SMOTE
pipeline_smote.fit(X_train[important_features], y_train)

# Evaluate the pipeline
y_pred = pipeline_smote.predict(X_test[important_features])

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:

[[32 36]
 [40 41]]

The matrix is a 2x2 array showing the counts of true positives, true negatives, false positives, and false negatives:
- True Positives (TP): 41 (correctly predicted positive samples)
- True Negatives (TN): 32 (correctly predicted negative samples)
- False Positives (FP): 36 (incorrectly predicted positive samples)
- False Negatives (FN): 40 (incorrectly predicted negative samples)

Classification Report:

Precision: The ratio of correctly predicted positive observations to the total predicted positives. For class 0, it's 0.44; for class 1, it's 0.53.

Recall: The ratio of correctly predicted positive observations to all observations in the actual class. For class 0, it's 0.47; for class 1, it's 0.51.

F1-Score: The weighted average of Precision and Recall. It considers both false positives and false negatives. For class 0, it's 0.46; for class 1, it's 0.52.

Support: The number of actual occurrences of the class in the dataset.

Overall Accuracy:

The model's accuracy is 0.49, which means it correctly predicts the class 49% of the time.

In [47]:
# Fit the model to the entire dataset
pipeline.fit(X_train, y_train)

# Extract feature importances
importances = pipeline.named_steps['classifier'].feature_importances_

# Combine feature names with importances
feature_names = (pipeline.named_steps['preprocessor']
                    .transformers_[0][1].get_feature_names_out().tolist() + 
                    pipeline.named_steps['preprocessor']
                    .transformers_[1][1].get_feature_names_out().tolist())
feature_importance_dict = dict(zip(feature_names, importances))

# Sort and display feature importances
sorted_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
print("Feature Importances:")
for feature, importance in sorted_importances:
    print(f"{feature}: {importance:.4f}")

Feature Importances:
desc_len: 0.0558
max_salary: 0.0489
avg_salary: 0.0475
Gendered_Ratio: 0.0467
min_salary: 0.0467
Communal_Count: 0.0451
Agentic_Count: 0.0423
Founded: 0.0413
age: 0.0390
Rating: 0.0367
job_state_encoded: 0.0292
Industry_encoded: 0.0268
headquarters_state_encoded: 0.0256
Sector_encoded: 0.0248
job_simp_encoded: 0.0202
Type of ownership_encoded: 0.0134
excel: 0.0113
same_state: 0.0105
python_yn: 0.0102
job_simp_encoded_1: 0.0096
aws: 0.0095
num_comp_encoded: 0.0093
job_simp_encoded_2: 0.0093
num_comp: 0.0092
seniority_encoded_1: 0.0091
Type of ownership_encoded_2: 0.0089
seniority_encoded: 0.0086
job_state_encoded_2: 0.0085
Sector_encoded_13: 0.0085
num_comp_encoded_0: 0.0083
Type of ownership_encoded_3: 0.0081
spark: 0.0080
seniority_encoded_2: 0.0077
Sector_encoded_6: 0.0066
job_simp_encoded_6: 0.0059
headquarters_state_encoded_23: 0.0058
job_state_encoded_16: 0.0055
headquarters_state_encoded_6: 0.0054
num_comp_encoded_3: 0.0054
Sector_encoded_10: 0.0048
job_simp_

In [30]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for RandomForestClassifier
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_split, y_train_split)

# Best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Predict with the best model
y_pred_best = grid_search.predict(X_valid_split)

# Evaluate the best model
print("Confusion Matrix:")
print(confusion_matrix(y_valid_split, y_pred_best))

print("\nClassification Report:")
print(classification_report(y_valid_split, y_pred_best))


Best parameters found:  {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Best score:  0.5463324312775959
Confusion Matrix:
[[35 33]
 [43 38]]

Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.51      0.48        68
           1       0.54      0.47      0.50        81

    accuracy                           0.49       149
   macro avg       0.49      0.49      0.49       149
weighted avg       0.50      0.49      0.49       149



Best Parameters Found
max_depth: 10
min_samples_leaf: 1
min_samples_split: 5
n_estimators: 100

Best Score
Accuracy: 0.5463 (approximately 54.63%)

Updated Confusion Matrix

[[35 33]
 [43 38]]

    True Positives (TP): 38
    True Negatives (TN): 35
    False Positives (FP): 33
    False Negatives (FN): 43

Updated Classification Report

Precision:

    Class 0: 0.45
    Class 1: 0.54

Recall:

    Class 0: 0.51
    Class 1: 0.47

F1-Score:

    Class 0: 0.48
    Class 1: 0.50

Overall Accuracy: 0.49

Observations:

Precision for Class 1 improved slightly, but recall decreased.

F1-scores are relatively close for both classes, indicating a balance between precision and recall.

### Addressing Data Imbalance

In [44]:
important_features = [
    'desc_len',
    'max_salary',
    'avg_salary',
    'Gendered_Ratio',
    'min_salary',
    'Communal_Count',
    'Agentic_Count',
    'Founded',
    'age',
    'Rating',
    'job_state_encoded',
    'Industry_encoded',
    'headquarters_state_encoded',
    'Sector_encoded',
    'job_simp_encoded',
    'Type of ownership_encoded',
]

In [31]:
#important_features = [
#    'Rating', 'Founded', 'min_salary', 'max_salary', 'avg_salary', 'age', 
#    'desc_len', 'Agentic_Count', 'Communal_Count', 'Gendered_Ratio', 'Ratio'
#]

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imPipeline


# Define preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)  # Ensure categorical_features are updated correctly
    ])

# Define SMOTE
smote = SMOTE(random_state=42)

# Define pipeline with SMOTE
pipeline_smote = imPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', RandomForestClassifier(max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100))
])


# Fit pipeline with SMOTE
pipeline_smote.fit(X_train[important_features], y_train)

# Evaluate the pipeline
y_pred = pipeline_smote.predict(X_test[important_features])

from sklearn.metrics import confusion_matrix, classification_report

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


ValueError: A given column is not a column of the dataframe