In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
import joblib
import warnings

# Suppress warnings for clarity
warnings.filterwarnings('ignore')

# Load the sampled data
sampled_csv_path = '/content/drive/MyDrive/1.csv'
df = pd.read_csv(sampled_csv_path)

# Check if 'Attrition' column exists and rename it to 'layoff'
if 'Attrition' in df.columns:
    df.rename(columns={'Attrition': 'Layoff'}, inplace=True)
else:
    print("The 'Attrition' column does not exist in the DataFrame.")
    print("Available columns:", df.columns)

# Define columns of interest
columns_of_interest = [
    'Age', 'EducationField', 'JobRole', 'Department', 'Industry', 'Stage',
    'Education', 'Funds_Raised(m)', 'PerformanceRating', 'JobSatisfaction',
    'JobInvolvement', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager', 'Layoff', 'MonthlyIncome','NumCompaniesWorked','Gender'
]

# Check if the selected columns exist in the DataFrame
missing_columns = [col for col in columns_of_interest if col not in df.columns]
if missing_columns:
    print(f"The following columns are missing from the DataFrame: {missing_columns}")
else:
    df = df[columns_of_interest]

    # Encode categorical variables and print the mappings
    categorical_columns = ['EducationField', 'JobRole', 'Department', 'Industry', 'Stage','Gender']

    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
        # Print the mappings
        print(f"Mappings for {col}:")
        mappings = dict(zip(le.classes_, le.transform(le.classes_)))
        for key, value in mappings.items():
            print(f"'{key}': {value}")
        print("\n")

    # Encode target variable
    target_encoder = LabelEncoder()
    df['Layoff'] = target_encoder.fit_transform(df['Layoff'])
    print(f"Target variable mappings: {dict(zip(target_encoder.classes_, target_encoder.transform(target_encoder.classes_)))}")

    # Define features and target
    X = df.drop(columns=['Layoff'])
    y = df['Layoff']

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Handle class imbalance using SMOTE
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

    # Feature selection using RFE
    rf_model = RandomForestClassifier(random_state=42)
    rfe = RFE(estimator=rf_model, n_features_to_select=10)  # Adjust the number of features as needed
    X_rfe = rfe.fit_transform(X_resampled, y_resampled)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_rfe, y_resampled, test_size=0.2, random_state=42)

    # Train a Random Forest model with hyperparameter tuning
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # Best model from GridSearch
    best_rf_model = grid_search.best_estimator_

    # Train and evaluate the model
    y_pred = best_rf_model.predict(X_test)

    # Ensure labels are consistent
    unique_labels = sorted(y.unique())
    if set(y_test.unique()) != set(unique_labels):
        raise ValueError(f"y_test contains labels not present in y: {y_test.unique()}")

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label=1)
    recall = recall_score(y_test, y_pred, pos_label=1)
    f1 = f1_score(y_test, y_pred, pos_label=1)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Print detailed classification report
    report = classification_report(y_test, y_pred, target_names=target_encoder.classes_)
    print("\nClassification Report:\n", report)

    # Save the model, scaler, and RFE
    model_filename = '/content/drive/MyDrive/ADT/best_model.pkl'
    scaler_filename = '/content/drive/MyDrive/ADT/scaler.pkl'
    rfe_filename = '/content/drive/MyDrive/ADT/rfe.pkl'

    joblib.dump(best_rf_model, model_filename)
    joblib.dump(scaler, scaler_filename)
    joblib.dump(rfe, rfe_filename)

    print("Model, scaler, and RFE object saved successfully.")


Mappings for EducationField:
'Human Resources': 0
'Life Sciences': 1
'Marketing': 2
'Medical': 3
'Other': 4
'Technical Degree': 5


Mappings for JobRole:
'Healthcare Representative': 0
'Human Resources': 1
'Laboratory Technician': 2
'Network Engineer': 3
'Operational Executive': 4
'Project Manager': 5
'Research Director': 6
'Research Scientist': 7
'Sales Executive': 8
'Sales Representative': 9
'Software Developer': 10
'Team Lead': 11


Mappings for Department:
'Defence': 0
'Human Resources': 1
'IT department': 2
'Research & Development': 3
'Sales': 4
'Tech Development': 5


Mappings for Industry:
'Aerospace': 0
'Education': 1
'Finance': 2
'Fitness': 3
'Food': 4
'Healthcare': 5
'Logistics': 6
'Real Estate': 7
'Retail': 8
'Sales': 9
'Transportation': 10
'Travel': 11


Mappings for Stage:
'Acquired': 0
'Post-IPO': 1
'Private Equity': 2
'Seed': 3
'Series A': 4
'Series B': 5
'Series C': 6
'Series D': 7
'Series E': 8
'Series F': 9
'Series G': 10
'Series H': 11
'Series I': 12
'Series J': 13
'

In [9]:
import pandas as pd
import joblib

# Load the model, scaler, and RFE
model_filename = '/content/drive/MyDrive/ADT/best_model.pkl'
scaler_filename = '/content/drive/MyDrive/ADT/scaler.pkl'
rfe_filename = '/content/drive/MyDrive/ADT/rfe.pkl'

loaded_model = joblib.load(model_filename)
loaded_scaler = joblib.load(scaler_filename)
loaded_rfe = joblib.load(rfe_filename)

# Features used during training (including 'Age')
training_feature_names = [
    'Age', 'EducationField', 'JobRole', 'Department', 'Industry',
    'Stage', 'Education', 'Funds_Raised(m)', 'PerformanceRating',
    'JobSatisfaction', 'JobInvolvement', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsWithCurrManager', 'MonthlyIncome',
    'NumCompaniesWorked', 'Gender'
]

# Create a sample dataset with adjusted features aiming for around 40% probability
sample_adjusted = pd.DataFrame({
    'Age': [45],  # Middle to slightly older
    'EducationField': [5],  # Higher value
    'JobRole': [11],  # Higher role value
    'Department': [5],  # Higher department value
    'Industry': [11],  # Intermediate industry value
    'Stage': [14],  # Different stage
    'Education': [4],  # Lower education level
    'Funds_Raised(m)': [10],  # Lower value
    'PerformanceRating': [4],  # Average performance
    'JobSatisfaction': [2],  # Average satisfaction
    'JobInvolvement': [2],  # Average involvement
    'YearsAtCompany': [11],  # Shorter tenure
    'YearsInCurrentRole': [4],  # Shorter duration in current role
    'YearsWithCurrManager': [7],  # Shorter duration with current manager  # Shorter duration with current manager
    'MonthlyIncome': [17379],  # Higher income
    'NumCompaniesWorked': [9],  # More companies worked
    'Gender': [1]  # Male,
})

# Ensure sample_data includes all relevant features
sample_adjusted = sample_adjusted[training_feature_names]

# Transform the sample data using the scaler
sample_adjusted_scaled = loaded_scaler.transform(sample_adjusted)

# Transform the scaled data using RFE
sample_adjusted_rfe = loaded_rfe.transform(sample_adjusted_scaled)

# Predict the probability
prob_adjusted = loaded_model.predict_proba(sample_adjusted_rfe)[:, 1]

# Print the percentage chance of layoff for the adjusted sample
print("\nPercentage Chance of Layoff for Adjusted Sample:")
for idx, p in enumerate(prob_adjusted):
    print(f"Sample {idx}: {p * 100:.2f}%")


Percentage Chance of Layoff for Adjusted Sample:
Sample 0: 85.00%
