In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, precision_recall_curve, auc

# --- Data ---------------------------------------------------------------
# Read the modelling table; the path is resolved relative to the notebook's
# working directory.
df = pd.read_csv("model_data.csv")

# --- Target and predictors ----------------------------------------------
# Label column the classifier is trained to predict.
target = "Gender_Bias"

# Predictor columns. The order here fixes the column order of X and therefore
# the row labels shown in the feature-importance table further down.
# NOTE(review): columns ending in `_encoded` are presumably integer-coded
# categoricals rather than true numeric measurements — confirm upstream.
numerical_features = [
    "desc_len",
    "age",
    "min_salary",
    "avg_salary",
    "max_salary",
    "Rating",
    "Founded",
    "job_state_encoded",
    "num_comp_encoded",
    "job_simp_encoded",
    "headquarters_state_encoded",
    "excel",
    "Sector_encoded",
    "employer_provided",
    "num_comp",
    "Industry_encoded",
    "same_state",
    "aws",
    "Type of ownership_encoded",
    "seniority_encoded",
    "hourly",
    "spark",
    "python_yn",
    "R_yn",
]

# Feature matrix and label vector.
X, y = df[numerical_features], df[target]

# --- Hold-out split -----------------------------------------------------
# 80 % train / 20 % test; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Model --------------------------------------------------------------
# Default-hyperparameter random forest, seeded for reproducibility.
# `fit` returns the estimator itself, so construction and training chain.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

# --- Threshold-based metrics --------------------------------------------
# `sep="\n"` puts each header on its own line, followed by its payload.
print("Test set accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# --- Feature importances ------------------------------------------------
# Pair each predictor with the forest's importance score, largest first.
importances = model.feature_importances_
feature_importances = (
    pd.DataFrame({"Feature": X.columns, "Importance": importances})
    .sort_values("Importance", ascending=False)
)

print("Feature Importances:", feature_importances, sep="\n")

# --- Score-based metrics ------------------------------------------------
# Column 1 of `predict_proba` is the probability of the second entry in
# `model.classes_`; it is used as the ranking score for both curves below.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Area under the ROC curve.
roc_auc = roc_auc_score(y_test, y_pred_proba)
print("ROC-AUC Score:", roc_auc)

# Area under the precision-recall curve (trapezoidal rule over recall);
# the thresholds returned by the curve are not needed here.
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = auc(recall, precision)
print("Precision-Recall AUC:", pr_auc)


Test set accuracy: 0.5369127516778524
Confusion Matrix:
[[37 31]
 [38 43]]
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.54      0.52        68
           1       0.58      0.53      0.55        81

    accuracy                           0.54       149
   macro avg       0.54      0.54      0.54       149
weighted avg       0.54      0.54      0.54       149

Feature Importances:
                       Feature  Importance
0                     desc_len    0.112252
4                   max_salary    0.086346
3                   avg_salary    0.083243
2                   min_salary    0.083056
6                      Founded    0.075817
1                          age    0.067376
7            job_state_encoded    0.066314
5                       Rating    0.063924
15            Industry_encoded    0.061581
10  headquarters_state_encoded    0.053493
12              Sector_encoded    0.052929
9             job_simp_encoded    0.03

### Analysis
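Accuracy: The test accuracy is about 0.54, only marginally above the 0.50 expected from a coin flip. Because 81 of the 149 test samples (about 54%) belong to class 1, always predicting class 1 would also score about 0.54, so the model is effectively no better than a trivial majority-class baseline.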

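Confusion Matrix: The confusion matrix shows a rough balance between false positives (31) and false negatives (38). The model performs similarly on both classes, but recall is only about 0.54 for class 0 and 0.53 for class 1, so performance on each class is close to chance.

ROC-AUC Score: A ROC-AUC score of 0.50 suggests that the model has no discriminative power, i.e., it ranks positive samples above negative ones no better than chance.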

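Precision-Recall AUC: A Precision-Recall AUC of 0.56 is low: it is barely above the no-skill baseline of about 0.54, which is the share of positive samples in the test set (81 of 149). This indicates substantial room for improvement in distinguishing between classes.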