# Model Exploration

In [1]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_csv('gendered_data.csv')

print(df.columns)

Index(['Unnamed: 0', 'Job Title', 'Salary Estimate', 'Job Description',
       'Rating', 'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors',
       'hourly', 'employer_provided', 'min_salary', 'max_salary', 'avg_salary',
       'company_txt', 'job_state', 'same_state', 'age', 'python_yn', 'R_yn',
       'spark', 'aws', 'excel', 'job_simp', 'seniority', 'desc_len',
       'num_comp', 'headquarters_state', 'Lemmatized_Description',
       'Agentic_Words', 'Communal_Words', 'Agentic_Count', 'Communal_Count',
       'Gendered_Ratio', 'job_state_encoded', 'headquarters_state_encoded',
       'Type of ownership_encoded', 'Industry_encoded', 'Sector_encoded',
       'job_simp_encoded', 'seniority_encoded', 'num_comp_encoded', 'Ratio',
       'Gender_Bias'],
      dtype='object')


In [2]:
# Check the distribution of the target column
print(df['Gender_Bias'].value_counts())

Gender_Bias
0    734
1      8
Name: count, dtype: int64


### Apply SMOTE

In [3]:
# Define the target and features
target = 'Gender_Bias'

# List of numerical features
numerical_features = [
    'Rating', 'Founded', 'hourly', 'employer_provided', 'min_salary', 
    'max_salary', 'avg_salary', 'same_state', 'age', 'python_yn', 'R_yn', 'spark', 
    'aws', 'excel', 'desc_len', 'num_comp', 'Agentic_Count', 'Communal_Count', 
    'job_state_encoded', 'headquarters_state_encoded', 
    'Type of ownership_encoded', 'Industry_encoded', 'Sector_encoded', 
    'job_simp_encoded', 'seniority_encoded', 'num_comp_encoded'
]

# Split the data into features and target
X = df[numerical_features]
y = df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Check the class distribution after applying SMOTE
print("Class distribution after SMOTE:")
print(y_train_res.value_counts())

# X_train_res and y_train_res can now be used for training your model


Class distribution after SMOTE:
Gender_Bias
1    586
0    586
Name: count, dtype: int64


# END

### Hyperparameter Tuning with GridSearchCV

We will use `GridSearchCV` to find the best hyperparameters for the Random Forest model. This involves searching over a range of hyperparameters to identify the optimal configuration.

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=150; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_de

2. Cross-Validation
Cross-validation helps in understanding how well the model generalizes to unseen data.

In [5]:
from sklearn.model_selection import cross_val_score

# Initialize the Random Forest model with best parameters
best_rf = RandomForestClassifier(**best_params, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(best_rf, X, y, cv=5)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())


Cross-Validation Scores: [0.98657718 1.         1.         1.         0.99324324]
Mean CV Score: 0.9959640848902593


3. Evaluation with Best Model
After finding the best parameters, you can re-evaluate the model on the test set.


**Cell 2: Cross-Validation**

```markdown
### Cross-Validation

To assess the model's generalization performance, we will use cross-validation with the best hyperparameters found.

```python
from sklearn.model_selection import cross_val_score

# Initialize the Random Forest model with best parameters
best_rf = RandomForestClassifier(**best_params, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(best_rf, X, y, cv=5)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())


In [6]:
from sklearn.metrics import classification_report, accuracy_score

# Fit the best model on the training data
best_rf.fit(X_train, y_train)

# Predict on the test data
y_pred = best_rf.predict(X_test)

# Print the accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Accuracy: 0.9932885906040269
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       148
           1       0.00      0.00      0.00         1

    accuracy                           0.99       149
   macro avg       0.50      0.50      0.50       149
weighted avg       0.99      0.99      0.99       149



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



**Cell 3: Evaluation with Best Model**

```markdown
### Evaluation with the Best Model

We will evaluate the Random Forest model with the best hyperparameters on the test set.

```python
from sklearn.metrics import classification_report, accuracy_score

# Fit the best model on the training data
best_rf.fit(X_train, y_train)

# Predict on the test data
y_pred = best_rf.predict(X_test)

# Print the accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


In [7]:
from sklearn.model_selection import GridSearchCV

# Example for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best Score: 0.9966386554621849


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Feature Selection
model = RandomForestClassifier(random_state=42)
rfe = RFE(model, 3)  # Select top 3 features
fit = rfe.fit(X_scaled, y)

print("Selected Features:", fit.support_)
print("Feature Ranking:", fit.ranking_)


TypeError: RFE.__init__() takes 2 positional arguments but 3 were given

---

### Feature Importance

In this cell, we plot the feature importances from the trained model. This helps in understanding which features are most influential in predicting gender bias.


In [9]:
# Plot feature importances
importances = model.feature_importances_
features = X.columns

plt.figure(figsize=(10, 6))
sns.barplot(x=importances, y=features)
plt.title('Feature Importances')
plt.show()


NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

# Data Preprocessing

First, we need to preprocess the textual data and handle categorical and numerical features. This includes tokenizing text, removing stop words, lemmatizing, and vectorizing text. Additionally, we'll handle missing values, encode categorical features, and scale numerical features.


In [10]:
# Load Data
df = pd.read_csv('gendered_data.csv')

# Text Preprocessing Functions
def preprocess_text(text):
    if not isinstance(text, str):
        text = str(text)
    # Tokenization
    tokens = word_tokenize(text)
    # Stop Words Removal
    tokens = [word for word in tokens if word.lower() not in stopwords.words('english')]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Ensure that text columns are strings and handle missing values
df['Job Description'] = df['Job Description'].astype(str).fillna('')
df['Agentic_Words'] = df['Agentic_Words'].astype(str).fillna('')
df['Communal_Words'] = df['Communal_Words'].astype(str).fillna('')

# Apply text preprocessing to textual features
df['Job Description'] = df['Job Description'].apply(preprocess_text)
df['Agentic_Words'] = df['Agentic_Words'].apply(preprocess_text)
df['Communal_Words'] = df['Communal_Words'].apply(preprocess_text)

NameError: name 'word_tokenize' is not defined

# Define Features and Preprocessing

Next, we'll define the categorical and numerical features, handle missing values, encode categorical variables, and scale numerical features. We will also vectorize the text features using TF-IDF.


In [14]:
# Define categorical and numerical features
categorical_features = ['Industry', 'Sector', 'Size', 'Location', 'Job_simp', 'Seniority']
numerical_features = ['Rating', 'Age', 'Agentic_Count', 'Communal_Count']
text_features = ['Job Description', 'Agentic_Words', 'Communal_Words']

# Handling Missing Values and Encoding Categorical Features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features),
        
        ('text', Pipeline(steps=[
            ('tfidf', TfidfVectorizer(max_features=5000))
        ]), text_features)
    ])


NameError: name 'SimpleImputer' is not defined

# Train-Test Split

We will split the data into training and testing sets for model training and evaluation.


In [15]:
# Separate target variable and features
X = df.drop('Target', axis=1)  # Replace 'Target' with your actual target column
y = df['Target']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


KeyError: "['Target'] not found in axis"

# Model Building

We will create a pipeline that includes preprocessing and modeling. We'll use a RandomForestClassifier as an example, but this can be replaced with other models.


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create a Pipeline for Preprocessing and Modeling
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


# Pipeline Creation

The above pipeline integrates preprocessing and model training/testing steps, ensuring consistent application during both training and evaluation.


### Support Vector Machine (SVM)
- Advantages: Effective in high-dimensional spaces, and robust to overfitting in high-dimensional data.
- Considerations: Requires careful tuning of hyperparameters like C (regularization) and gamma (kernel coefficient).

In [13]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Example pipeline
svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', SVC(kernel='linear', random_state=42))
])


### Gradient Boosting Machines (GBMs):
- Advantages: Typically performs well on a variety of problems and can handle different types of data.
- Considerations: Models like GradientBoostingClassifier or HistGradientBoostingClassifier can be effective.

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)


### Neural Networks:

- Advantages: Can capture complex patterns and interactions in data.
- Considerations: Requires careful tuning of network architecture and hyperparameters. Libraries like TensorFlow or PyTorch can be used.

In [11]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=200, random_state=42)
