In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb

In [13]:
df = pd.read_csv('/content/Consulting_survey_DataSet (3).csv')

In [14]:
print("Available columns in dataset:")
print(df.columns.tolist())

Available columns in dataset:
['Timestamp', 'Do you CURRENTLY work at a consulting firm?', 'Which firm do you CURRENTLY work for?', 'Are you CURRENTLY in a Commercial or Federal practice? ', 'What is your CURRENT title (or equivalent)? ', 'What COUNTRY are you CURRENTLY based out of?', 'What is your CURRENT annual base compensation in USD (not including bonuses, perks, or other incentives)? Please consult Google for conversion tools for local currency into USD. Enter number values only.', 'What is the total amount in USD of BONUSES you estimate you will receive in 2021? Enter number only.', 'Expected total compensation (calculated)', 'How many hours per week on AVERAGE do you work (including non-billable time)?', 'Have you gotten an OFFER from another company this year?', 'Was the offer from another CONSULTING FIRM?', 'Which firm was the HIGHEST (base compensation) OFFER from?', 'Was the OFFER for a Commercial or Federal practice? ', 'What was the OFFERED title (or equivalent)? ', 'Wha

In [15]:
print(f"Total entries before filtering: {df.shape[0]}")

Total entries before filtering: 4200


In [16]:
df_filtered = df[df['Have you gotten an OFFER from another company this year?'] == 'Yes'].copy()

In [17]:
print(f"Entries with offers: {df_filtered.shape[0]}")

Entries with offers: 1112


In [18]:
print("Columns available in dataset after filtering:")
print(df_filtered.columns.tolist())

Columns available in dataset after filtering:
['Timestamp', 'Do you CURRENTLY work at a consulting firm?', 'Which firm do you CURRENTLY work for?', 'Are you CURRENTLY in a Commercial or Federal practice? ', 'What is your CURRENT title (or equivalent)? ', 'What COUNTRY are you CURRENTLY based out of?', 'What is your CURRENT annual base compensation in USD (not including bonuses, perks, or other incentives)? Please consult Google for conversion tools for local currency into USD. Enter number values only.', 'What is the total amount in USD of BONUSES you estimate you will receive in 2021? Enter number only.', 'Expected total compensation (calculated)', 'How many hours per week on AVERAGE do you work (including non-billable time)?', 'Have you gotten an OFFER from another company this year?', 'Was the offer from another CONSULTING FIRM?', 'Which firm was the HIGHEST (base compensation) OFFER from?', 'Was the OFFER for a Commercial or Federal practice? ', 'What was the OFFERED title (or equi

In [19]:
# Focus only on "Did you accept the offer?" column and filter valid responses
df_filtered = df_filtered[df_filtered['Did you accept the offer?'].isin(['Yes', 'No'])]

In [20]:
# Convert target variable into binary (1 = Accepted, 0 = Not Accepted)
df_filtered['Did you accept the offer?'] = df_filtered['Did you accept the offer?'].map({'Yes': 1, 'No': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Did you accept the offer?'] = df_filtered['Did you accept the offer?'].map({'Yes': 1, 'No': 0})


In [21]:
# Define numerical feature list, ensuring only available columns are used
possible_num_features = [
    'What is your CURRENT annual base compensation in USD (not including bonuses, perks, or other incentives)? Please consult Google for conversion tools for local currency into USD. Enter number values only.',
    'What is the total amount in USD of BONUSES you estimate you will receive in 2021? Enter number only.',
    'Expected total compensation (calculated)',
    'How many hours per week on AVERAGE do you work (including non-billable time)?',
    'What was the HIGHEST OFFERED annual base compensation in USD (not including bonuses, perks, or other incentives)?',
    'What was the total annual BONUS OFFERED?'
]
num_features = [col for col in possible_num_features if col in df_filtered.columns]
print("Numerical features being used:", num_features)


Numerical features being used: ['What is your CURRENT annual base compensation in USD (not including bonuses, perks, or other incentives)? Please consult Google for conversion tools for local currency into USD. Enter number values only.', 'What is the total amount in USD of BONUSES you estimate you will receive in 2021? Enter number only.', 'Expected total compensation (calculated)', 'How many hours per week on AVERAGE do you work (including non-billable time)?', 'What was the total annual BONUS OFFERED?']


In [22]:
# Function to handle non-numeric values in numerical columns
def convert_to_numeric(value):
    if isinstance(value, str):
        value = value.strip()
        if "-" in value:
            parts = value.split("-")
            try:
                return (float(parts[0]) + float(parts[1])) / 2
            except ValueError:
                return np.nan
        elif value.replace(".", "").isdigit():
            return float(value)
        else:
            return np.nan
    return value


In [23]:
# Apply conversion to all numerical features
for col in num_features:
    df_filtered[col] = df_filtered[col].apply(convert_to_numeric)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[col] = df_filtered[col].apply(convert_to_numeric)


In [24]:
# Fill missing values with column mean
df_filtered[num_features] = df_filtered[num_features].fillna(df_filtered[num_features].mean())

# Check if categorical columns exist before proceeding
possible_cat_features = ['What COUNTRY are you CURRENTLY based out of?', 'What is your CURRENT title (or equivalent)?',
                         'Are you CURRENTLY in a Commercial or Federal practice?', 'Was the OFFER for a Commercial or Federal practice?']
cat_features = [col for col in possible_cat_features if col in df_filtered.columns]

if not cat_features:
    print("Warning: No categorical features found in the dataset!")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[num_features] = df_filtered[num_features].fillna(df_filtered[num_features].mean())


In [25]:
# Encode categorical features using OneHotEncoder if available
if cat_features:
    df_encoded = pd.get_dummies(df_filtered[cat_features], drop_first=True)
    X = pd.concat([df_filtered[num_features], df_encoded], axis=1)
else:
    X = df_filtered[num_features]

y = df_filtered['Did you accept the offer?']


In [26]:
# Standardize numerical features
scaler = StandardScaler()
X[num_features] = scaler.fit_transform(X[num_features])

In [27]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [28]:
# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [29]:
# Print class distribution after resampling
print("Class distribution after resampling:")
print(pd.Series(y_train_resampled).value_counts())


Class distribution after resampling:
Did you accept the offer?
0    504
1    504
Name: count, dtype: int64


In [30]:
# Define models with hyperparameter tuning
models = {
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
}


In [31]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    "Random Forest": {'n_estimators': [100, 200], 'max_depth': [10, 20], 'min_samples_split': [2, 5]},
    "Gradient Boosting": {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]},
}


In [32]:
# Train, optimize, and evaluate models using Stratified K-Fold Cross-Validation
best_model_name = None
best_model_f1 = 0
best_model_instance = None

for name, model in models.items():
    print(f"\nOptimizing {name}...")
    if name in param_grid:
        grid_search = GridSearchCV(model, param_grid[name], cv=StratifiedKFold(n_splits=5), scoring='f1', n_jobs=-1)
        grid_search.fit(X_train_resampled, y_train_resampled)
        best_model = grid_search.best_estimator_
        print(f"Best Parameters for {name}: {grid_search.best_params_}")
    else:
        best_model = model.fit(X_train_resampled, y_train_resampled)

    y_pred = best_model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(f"\n{name} Classification Report:\n")
    print(classification_report(y_test, y_pred))
    print(f"F1 Score: {f1:.4f}")

    # Track the best model based on F1-score
    if f1 > best_model_f1:
        best_model_f1 = f1
        best_model_name = name
        best_model_instance = best_model

print(f"\nBest Model: {best_model_name} with F1 Score: {best_model_f1:.4f}")



Optimizing Random Forest...
Best Parameters for Random Forest: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}

Random Forest Classification Report:

              precision    recall  f1-score   support

           0       0.75      0.79      0.77       127
           1       0.41      0.37      0.39        52

    accuracy                           0.66       179
   macro avg       0.58      0.58      0.58       179
weighted avg       0.65      0.66      0.66       179

F1 Score: 0.3878

Optimizing Gradient Boosting...
Best Parameters for Gradient Boosting: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200}

Gradient Boosting Classification Report:

              precision    recall  f1-score   support

           0       0.76      0.70      0.73       127
           1       0.39      0.46      0.42        52

    accuracy                           0.63       179
   macro avg       0.57      0.58      0.58       179
weighted avg       0.65      0.63      0.6