In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report,ConfusionMatrixDisplay,precision_score, recall_score, f1_score, roc_auc_score,roc_curve

In [2]:
df=pd.read_csv('cleaned_dataset.csv')
df.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,1264,Finance,region_7,Bachelor's,m,other,1,31,3.0,3,0,0,62,0
1,18375,Finance,region_7,Bachelor's,m,other,1,31,5.0,7,1,0,62,0
2,34167,Operations,region_2,Bachelor's,m,other,1,53,3.0,22,0,0,67,0
3,14280,Technology,region_29,Bachelor's,m,other,1,27,2.0,2,1,0,81,0
4,24219,Sales & Marketing,region_26,Bachelor's,m,other,1,44,3.0,6,1,0,57,0


In [3]:
df=df.drop(columns=['employee_id','region'])

In [4]:
x=df.drop(columns=['is_promoted'])
y=df['is_promoted']

In [5]:
y.value_counts()

is_promoted
0    30000
1     2884
Name: count, dtype: int64

In [6]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

The data is imbalanced so choosing upscaling

In [7]:
train_df = pd.concat([x_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)

# Separate the majority and minority classes
majority_class = train_df[train_df[y_train.name] == y_train.value_counts().idxmax()]
minority_class = train_df[train_df[y_train.name] == y_train.value_counts().idxmin()]

# Perform random oversampling
n_majority = len(majority_class)
n_minority = len(minority_class)

# Randomly sample the minority class with replacement
minority_oversampled = minority_class.sample(n=8000, replace=True, random_state=42)

# Combine the majority class with the oversampled minority class
train_df_oversampled = pd.concat([majority_class, minority_oversampled])

# Shuffle the DataFrame to mix the classes
train_df_oversampled = train_df_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate features and labels
x_resampled = train_df_oversampled.drop(columns=[y_train.name])
y_resampled = train_df_oversampled[y_train.name]

# Check the distribution of the resampled labels
y_resampled_counts = y_resampled.value_counts()


# Optional: Display the first few rows of the resampled training data


In [8]:
x_train=train_df_oversampled.drop(columns=['is_promoted'])
y_train=train_df_oversampled["is_promoted"]

In [9]:
cat_features=x.select_dtypes(include='object').columns
num_features=x.select_dtypes(exclude='object').columns

numeric_tansformer=StandardScaler()
ohe_transformer=OneHotEncoder(drop='first')
preprocessor=ColumnTransformer(
    [
        ("one hot encoder",ohe_transformer,cat_features),
        ("Standard scaler",numeric_tansformer,num_features)
    ]
)
x_train=preprocessor.fit_transform(x_train)
x_test=preprocessor.transform(x_test)

In [10]:
models={
    "Logisitic Regression":LogisticRegression(),
    "Random forest classifier":RandomForestClassifier(),
    "Decision tree":DecisionTreeClassifier(),
    "gradient boost":GradientBoostingClassifier(),
    "Support vector machine":SVC(),
    "knearest neighbours":KNeighborsClassifier(),
#    "Naive bias":naive_bayes()
}   
def report(models):

    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(x_train, y_train) # Train model

        # Make predictions
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

        # Training set performance
        model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
        model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
        model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
        model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
        model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)
        model_train_confusion_matrix=confusion_matrix(y_train, y_train_pred)


        # Test set performance
        model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
        model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
        model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
        model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
        model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc
        model_test_confusion_matrix=confusion_matrix(y_test, y_test_pred)


        print(list(models.keys())[i])
        
        print('Model performance for Training set')
        print("- Accuracy: {:.4f}".format(model_train_accuracy))
        print('- F1 score: {:.4f}'.format(model_train_f1))
        
        print('- Precision: {:.4f}'.format(model_train_precision))
        print('- Recall: {:.4f}'.format(model_train_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))
        print('- confusion matrix:\n{}'.format(model_train_confusion_matrix))

        
        
        print('----------------------------------')
        
        print('Model performance for Test set')
        print('- Accuracy: {:.4f}'.format(model_test_accuracy))
        print('- F1 score: {:.4f}'.format(model_test_f1))
        print('- Precision: {:.4f}'.format(model_test_precision))
        print('- Recall: {:.4f}'.format(model_test_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
        print('- confusion matrix:\n{}'.format(model_test_confusion_matrix))
        
        print('='*35)
        print('\n')


In [11]:
report(models)

Logisitic Regression
Model performance for Training set
- Accuracy: 0.8160
- F1 score: 0.8043
- Precision: 0.6870
- Recall: 0.4838
- Roc Auc Score: 0.7052
- confusion matrix:
[[22261  1763]
 [ 4130  3870]]
----------------------------------
Model performance for Test set
- Accuracy: 0.8857
- F1 score: 0.8902
- Precision: 0.3950
- Recall: 0.4725
- Roc Auc Score: 0.6999
- confusion matrix:
[[5541  435]
 [ 317  284]]


Random forest classifier
Model performance for Training set
- Accuracy: 0.9983
- F1 score: 0.9983
- Precision: 0.9937
- Recall: 0.9994
- Roc Auc Score: 0.9986
- confusion matrix:
[[23973    51]
 [    5  7995]]
----------------------------------
Model performance for Test set
- Accuracy: 0.9168
- F1 score: 0.9073
- Precision: 0.5746
- Recall: 0.3461
- Roc Auc Score: 0.6602
- confusion matrix:
[[5822  154]
 [ 393  208]]


Decision tree
Model performance for Training set
- Accuracy: 0.9983
- F1 score: 0.9983
- Precision: 0.9939
- Recall: 0.9991
- Roc Auc Score: 0.9985
- confus

Selected Logistic,SVM, gradient

In [12]:
models={

    "Logisitic Regression":LogisticRegression(),

    "gradient boost":GradientBoostingClassifier(),
    "Support vector machine":SVC(),

}  
report(models)

Logisitic Regression
Model performance for Training set
- Accuracy: 0.8160
- F1 score: 0.8043
- Precision: 0.6870
- Recall: 0.4838
- Roc Auc Score: 0.7052
- confusion matrix:
[[22261  1763]
 [ 4130  3870]]
----------------------------------
Model performance for Test set
- Accuracy: 0.8857
- F1 score: 0.8902
- Precision: 0.3950
- Recall: 0.4725
- Roc Auc Score: 0.6999
- confusion matrix:
[[5541  435]
 [ 317  284]]


gradient boost
Model performance for Training set
- Accuracy: 0.8451
- F1 score: 0.8271
- Precision: 0.8561
- Recall: 0.4565
- Roc Auc Score: 0.7155
- confusion matrix:
[[23410   614]
 [ 4348  3652]]
----------------------------------
Model performance for Test set
- Accuracy: 0.9202
- F1 score: 0.9132
- Precision: 0.5931
- Recall: 0.4027
- Roc Auc Score: 0.6874
- confusion matrix:
[[5810  166]
 [ 359  242]]


Support vector machine
Model performance for Training set
- Accuracy: 0.8536
- F1 score: 0.8378
- Precision: 0.8723
- Recall: 0.4851
- Roc Auc Score: 0.7307
- confusi

### Hyper parameter tunning for gradient boosting

In [13]:
model={
    "gradient boost":GradientBoostingClassifier(loss='exponential',learning_rate=0.02,n_estimators=500,max_depth=7,subsample=0.8,criterion='squared_error',min_samples_split=4)
}
report(model)

gradient boost
Model performance for Training set
- Accuracy: 0.9126
- F1 score: 0.9084
- Precision: 0.9243
- Recall: 0.7080
- Roc Auc Score: 0.8443
- confusion matrix:
[[23560   464]
 [ 2336  5664]]
----------------------------------
Model performance for Test set
- Accuracy: 0.9176
- F1 score: 0.9121
- Precision: 0.5657
- Recall: 0.4226
- Roc Auc Score: 0.6950
- confusion matrix:
[[5781  195]
 [ 347  254]]


