In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,classification_report
from sklearn.utils import resample


In [2]:
# Load dataset
df = pd.read_csv('train_LZdllcl.csv')
pd.set_option('display.max_columns', None)

# Handling null values
df['education'] = df['education'].fillna(df['education'].mode()[0])
df['previous_year_rating'] = df['previous_year_rating'].fillna(df['previous_year_rating'].median())


In [3]:
# Drop 'employee_id'
df.drop('employee_id', axis=1, inplace=True)

In [4]:
columns_to_clip = ['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service']

for column in columns_to_clip:
    Q1 = np.percentile(df[column], 25)
    Q3 = np.percentile(df[column], 75)
    IQR = Q3 - Q1
    low_lim = Q1 - 1.5 * IQR
    upp_lim = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower=low_lim, upper=upp_lim)
    

In [5]:
le = LabelEncoder()
df['department'] = le.fit_transform(df['department'])

df['region'] = df['region'].str.extract(r'(\d+)').astype(int)


od_education = OrdinalEncoder(categories=[["Below Secondary", "Bachelor's", "Master's & above"]], dtype=int)
df['education'] = od_education.fit_transform(df[['education']])

df['gender'] = le.fit_transform(df['gender'])

od_recruitment_channel = OrdinalEncoder(categories=[["other", "sourcing", "referred"]], dtype=int)
df['recruitment_channel'] = od_recruitment_channel.fit_transform(df[['recruitment_channel']])

In [6]:
# Normalization
standard_scaler = StandardScaler()
standard_columns = ['age', 'previous_year_rating', 'length_of_service', 'avg_training_score']
df[standard_columns] = standard_scaler.fit_transform(df[standard_columns])



In [7]:
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]
undersample = resample(cls_false,n_samples=len(cls_true), random_state=119)

bal_df = pd.concat([undersample, cls_true])

In [8]:
X = bal_df.drop('is_promoted', axis=1)
y = bal_df['is_promoted']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Define the Random Forest model
rf_model = RandomForestClassifier()

# Define the hyperparameter grid to search through
param_grid = {
    'n_estimators': range(50, 60),
    'max_depth': range(50, 60),
    'random_state': [119]
}

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')

# Make predictions on the test set using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the accuracy and print the classification report
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(class_report)

Best Hyperparameters: {'max_depth': 50, 'n_estimators': 57, 'random_state': 119}
Accuracy: 0.80
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.73      0.78       931
           1       0.76      0.87      0.81       937

    accuracy                           0.80      1868
   macro avg       0.81      0.80      0.80      1868
weighted avg       0.81      0.80      0.80      1868



In [21]:
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]
oversample = resample(cls_true,n_samples=len(cls_false), random_state=42)

ov_bal_df = pd.concat([oversample, cls_false])

In [22]:
ov_bal_df['is_promoted'].value_counts()

is_promoted
1    50140
0    50140
Name: count, dtype: int64

In [23]:
X = ov_bal_df.drop('is_promoted', axis=1)
y = ov_bal_df['is_promoted']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:



rm = range(1,101)

bst_rm =  None
best_acc = 0


for x in rm:
 model_RF = RandomForestClassifier(random_state=x)
 model_RF.fit(X_train, y_train)
# Make predictions on the test set
 y_pred = model_RF.predict(X_test)
# Calculate accuracy
 accuracy = accuracy_score(y_test, y_pred)
 if accuracy > best_acc:
     best_acc = accuracy
     bst_rm = x
     
print(f'random state : {bst_rm} Accuracy of {x}: {best_acc}')
#  class_report = classification_report(y_test, y_pred)
# Print the classification report
#  print(f"Classification Report of {x}:")
#  print(class_report)

NameError: name 'X_train' is not defined

In [30]:
X = df.drop('is_promoted', axis=1)
y = df['is_promoted']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)





In [14]:


# Assuming 'df' is your original DataFrame

# Separate the classes
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('is_promoted', axis=1), df['is_promoted'], test_size=0.2, random_state=40)

# Oversample the minority class only in the training set
oversample = resample(cls_true, n_samples=len(cls_false), random_state=40)
X_train_oversampled = pd.concat([X_train, oversample.drop('is_promoted', axis=1)])
y_train_oversampled = pd.concat([y_train, oversample['is_promoted']])



# Initialize the Random Forest model
 model_RF = RandomForestClassifier(random_state=x)

# Train the model with the oversampled training set
 model_RF.fit(X_train_oversampled, y_train_oversampled)

# Make predictions on the test set
 y_pred = model_RF.predict(X_test)

# Calculate accuracy
 accuracy = accuracy_score(y_test, y_pred)
 print(f'Accuracy of {x}: {accuracy}')
 
 
 

# Print the classification report
 class_report = classification_report(y_test, y_pred)
 print(f"Classification Report of {x}:")
 print(class_report)


Accuracy of 1: 0.9719029374201787
Classification Report of 1:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     10081
           1       0.74      1.00      0.85       881

    accuracy                           0.97     10962
   macro avg       0.87      0.98      0.92     10962
weighted avg       0.98      0.97      0.97     10962

Accuracy of 2: 0.9720853858784894
Classification Report of 2:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     10081
           1       0.74      1.00      0.85       881

    accuracy                           0.97     10962
   macro avg       0.87      0.98      0.92     10962
weighted avg       0.98      0.97      0.97     10962

Accuracy of 3: 0.9713555920452472
Classification Report of 3:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     10081
           1       0.74      1.00      0.85       8

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd



# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('is_promoted', axis=1), df['is_promoted'], test_size=0.2, random_state=40)

# Apply SMOTE only to the training set
smote = SMOTE(random_state=40)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize the Random Forest model
model_RF = RandomForestClassifier(random_state=40)

# Train the model with the oversampled training set
model_RF.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = model_RF.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')

# Print the classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)


Accuracy: 0.9137018792191206
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95     10081
           1       0.46      0.40      0.43       881

    accuracy                           0.91     10962
   macro avg       0.70      0.68      0.69     10962
weighted avg       0.91      0.91      0.91     10962



In [27]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
import pandas as pd

# Assuming 'df' is your original DataFrame

# Separate the classes
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('is_promoted', axis=1), df['is_promoted'], test_size=0.2, random_state=40)

# Oversample the minority class only in the training set
oversample = resample(cls_true, n_samples=len(cls_false), random_state=40)
X_train_oversampled = pd.concat([X_train, oversample.drop('is_promoted', axis=1)])
y_train_oversampled = pd.concat([y_train, oversample['is_promoted']])

# Define the parameter grid
param_grid = {
    'n_estimators':[30,32],
    'max_depth': [35],
    'random_state': [119]  # Keeping the random_state constant for consistency
}

# Initialize the Random Forest model
model_RF = RandomForestClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model_RF, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform grid search on the oversampled training set
grid_search.fit(X_train_oversampled, y_train_oversampled)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Print the classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)


Best Parameters: {'max_depth': 35, 'n_estimators': 32, 'random_state': 119}
Accuracy: 0.9705345739828498
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     10081
           1       0.73      1.00      0.85       881

    accuracy                           0.97     10962
   macro avg       0.87      0.98      0.91     10962
weighted avg       0.98      0.97      0.97     10962



In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

# Separate the classes
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]

# Split the data into training and testing sets with a specified random_state
X_train, X_test, y_train, y_test = train_test_split(df.drop('is_promoted', axis=1), df['is_promoted'], test_size=0.2, random_state=40)

# Initialize variables to store the best random state and accuracy
best_random_state = None
best_accuracy = 0

# Try different random state values
for random_state in range(100, 201):
    # Oversample the minority class only in the training set
    oversample = resample(cls_true, n_samples=len(cls_false), random_state=random_state)
    X_train_oversampled = pd.concat([X_train, oversample.drop('is_promoted', axis=1)])
    y_train_oversampled = pd.concat([y_train, oversample['is_promoted']])

    # Initialize the Random Forest model
    model_RF = RandomForestClassifier(random_state=random_state)

    # Train the model with the oversampled training set
    model_RF.fit(X_train_oversampled, y_train_oversampled)

    # Make predictions on the test set
    y_pred = model_RF.predict(X_test)

    # Calculate accuracy
    current_accuracy = accuracy_score(y_test, y_pred)

    # Update best_random_state if the current iteration has a higher accuracy
    if current_accuracy > best_accuracy:
        best_accuracy = current_accuracy
        best_random_state = random_state

# Print the best random state and accuracy
print(f"Best Random State: {best_random_state}, Best Accuracy: {best_accuracy}")

# Train the final model with the best random state
oversample = resample(cls_true, n_samples=len(cls_false), random_state=best_random_state)
X_train_oversampled = pd.concat([X_train, oversample.drop('is_promoted', axis=1)])
y_train_oversampled = pd.concat([y_train, oversample['is_promoted']])

final_model_RF = RandomForestClassifier(random_state=best_random_state)
final_model_RF.fit(X_train_oversampled, y_train_oversampled)

# Make predictions on the test set using the final model
y_pred_final = final_model_RF.predict(X_test)

# Calculate and print the final accuracy and classification report
accuracy_final = accuracy_score(y_test, y_pred_final)
print(f'Final Accuracy: {accuracy_final}')

class_report_final = classification_report(y_test, y_pred_final)
print(f"Final Classification Report:")
print(class_report_final)


Best Random State: 119, Best Accuracy: 0.9740923189199051
Final Accuracy: 0.9740923189199051
Final Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     10081
           1       0.76      1.00      0.86       881

    accuracy                           0.97     10962
   macro avg       0.88      0.99      0.92     10962
weighted avg       0.98      0.97      0.98     10962

