In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,classification_report
from sklearn.utils import resample


In [2]:
# Load dataset
df = pd.read_csv('train_LZdllcl.csv')
pd.set_option('display.max_columns', None)
df.head()
# Handling null values
df['education'] = df['education'].fillna(df['education'].mode()[0])
df['previous_year_rating'] = df['previous_year_rating'].fillna(df['previous_year_rating'].median())


In [3]:
# Drop 'employee_id'
df.drop('employee_id', axis=1, inplace=True)
df.head()



Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [4]:
df[df['is_promoted'] == 1]

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
11,Sales & Marketing,region_7,Bachelor's,f,sourcing,1,35,5.0,3,1,0,50,1
39,Sales & Marketing,region_28,Bachelor's,m,sourcing,1,33,5.0,6,1,0,51,1
60,Sales & Marketing,region_4,Master's & above,m,other,1,50,4.0,17,1,0,47,1
66,Finance,region_22,Bachelor's,m,other,1,27,3.0,1,1,1,58,1
67,Sales & Marketing,region_22,Bachelor's,m,sourcing,1,27,3.0,1,0,0,61,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54734,Operations,region_15,Bachelor's,m,sourcing,1,31,3.0,1,1,0,56,1
54757,Technology,region_7,Master's & above,m,other,1,54,4.0,7,0,0,81,1
54761,Procurement,region_13,Bachelor's,f,sourcing,1,30,4.0,2,1,0,86,1
54792,Sales & Marketing,region_14,Bachelor's,m,other,1,59,3.0,11,0,0,65,1


In [5]:
columns_to_clip = ['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service']

for column in columns_to_clip:
    Q1 = np.percentile(df[column], 25)
    Q3 = np.percentile(df[column], 75)
    IQR = Q3 - Q1
    low_lim = Q1 - 1.5 * IQR
    upp_lim = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower=low_lim, upper=upp_lim)
    

In [6]:
le = LabelEncoder()
df['department'] = le.fit_transform(df['department'])

df['region'] = df['region'].str.extract(r'(\d+)').astype(int)


od_education = OrdinalEncoder(categories=[["Below Secondary", "Bachelor's", "Master's & above"]], dtype=int)
df['education'] = od_education.fit_transform(df[['education']])

df['gender'] = le.fit_transform(df['gender'])

od_recruitment_channel = OrdinalEncoder(categories=[["other", "sourcing", "referred"]], dtype=int)
df['recruitment_channel'] = od_recruitment_channel.fit_transform(df[['recruitment_channel']])

In [7]:
# Normalization
standard_scaler = StandardScaler()
standard_columns = ['age', 'previous_year_rating', 'length_of_service', 'avg_training_score']
df[standard_columns] = standard_scaler.fit_transform(df[standard_columns])



In [29]:
df.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,7,7,2,0,1,1,0.025598,1.395766,0.50046,1,0,-1.075931,0
1,4,22,1,1,0,1,-0.627135,1.395766,-0.437395,0,0,-0.253282,0
2,7,19,1,1,1,1,-0.104948,-0.250651,0.265996,0,0,-1.001145,0
3,7,23,1,1,0,2,0.547785,-1.897069,0.969387,0,0,-1.001145,0
4,8,26,1,1,0,1,1.331064,-0.250651,-0.906322,0,0,0.718939,0


In [7]:
df[df['is_promoted'] == 1]

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
11,7,7,1,0,1,1,0.038093,1.472146,-0.752950,1,0,-1.001145,1
39,7,28,1,1,1,1,-0.231614,1.472146,0.125697,1,0,-0.926359,1
60,7,4,2,1,0,1,2.060893,0.573810,2.175872,1,0,-1.225504,1
66,1,22,1,1,0,1,-1.040734,-0.324526,-1.338714,1,1,-0.402855,1
67,7,22,1,1,1,1,-1.040734,-0.324526,-1.338714,0,0,-0.178496,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54734,4,15,1,1,1,1,-0.501320,-0.324526,-1.338714,1,0,-0.552428,1
54757,8,7,2,1,0,1,2.600306,0.573810,0.418579,0,0,1.317229,1
54761,5,13,1,0,1,1,-0.636174,0.573810,-1.045832,1,0,1.691161,1
54792,7,14,1,1,0,1,2.600306,-0.324526,1.590107,0,0,0.120649,1


In [8]:
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]
undersample = resample(cls_false,n_samples=len(cls_true), random_state=119)

bal_df = pd.concat([undersample, cls_true])

In [9]:
X = bal_df.drop('is_promoted', axis=1)
y = bal_df['is_promoted']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.utils import resample



# Assuming 'df' is your DataFrame containing the dataset
# Assuming you have 'le', 'od_education', 'od_recruitment_channel' as the encoder instances used on the original DataFrame

# Separate the classes
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]

# Get input features from the user through the terminal
user_input_features = {}
for i, feature in enumerate(df.columns[:-1]):  # Assuming the last column is the target variable 'is_promoted'
    while True:
        value = input(f"Enter the value for {feature}: ")
        try:
            if i < 5:
                user_input_features[feature] = [value]  # Assuming the first 5 features are strings
            else:
                user_input_features[feature] = [int(value)]  # Assuming the rest are integers
            break  # Break the loop if conversion is successful
        except ValueError:
            print("Invalid input. Please enter a valid value.")

# Create a DataFrame with the user input
user_input_df = pd.DataFrame(user_input_features)

# Apply the same encoding and scaling transformations to the user-entered data
# Use the same encoders as used on the original DataFrame
user_input_df['department'] = le.transform(user_input_df['department'])

# Assuming 'region' in the user input should be treated similarly (extracting digits)
user_input_df['region'] = user_input_df['region'].str.extract(r'(\d+)').astype(int)

user_input_df['education'] = od_education.transform(user_input_df[['education']])
user_input_df['gender'] = le.transform(user_input_df['gender'])
user_input_df['recruitment_channel'] = od_recruitment_channel.transform(user_input_df[['recruitment_channel']])

# Assuming 'age', 'previous_year_rating', 'length_of_service', 'avg_training_score' are numeric
numeric_columns = ['age', 'previous_year_rating', 'length_of_service', 'avg_training_score']
user_input_df[numeric_columns] = standard_scaler.transform(user_input_df[numeric_columns])

# Display the user-entered DataFrame after encoding and scaling for testing
print("User-Entered DataFrame (Encoded and Scaled):")
print(user_input_df)

# Initialize the Random Forest model
model_RF = RandomForestClassifier(random_state=40)

# Train the model with the oversampled training set (assuming 'df' is used for training)
model_RF.fit(X_train_oversampled, y_train_oversampled)

# Make predictions on the user input
user_pred = model_RF.predict(user_input_df)

# Print the prediction
print(f'Model prediction for the given input: {user_pred}')


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

# Assuming 'df' is your DataFrame containing the dataset

# Separate the classes
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('is_promoted', axis=1), df['is_promoted'], test_size=0.2, random_state=119)

# Oversample the minority class only in the training set
oversample = resample(cls_true, n_samples=len(cls_false), random_state=119)
X_train_oversampled = pd.concat([X_train, oversample.drop('is_promoted', axis=1)])
y_train_oversampled = pd.concat([y_train, oversample['is_promoted']])

# Get input features from the user through the terminal
user_input_features = {}
for feature in df.columns[:-1]:  # Assuming the last column is the target variable 'is_promoted'
    value = input(f"Enter the value for {feature}: ")
    user_input_features[feature] = [float(value)]  # Assuming all features are numeric, modify accordingly if needed

# Create a DataFrame with the user input
user_input_df = pd.DataFrame(user_input_features)

# Initialize the Random Forest model
model_RF = RandomForestClassifier(random_state=40)

# Train the model with the oversampled training set
model_RF.fit(X_train_oversampled, y_train_oversampled)

# Make predictions on the user input
user_pred = model_RF.predict(user_input_df)

# Print the prediction
print(f'Model prediction for the given input: {user_pred}')


ValueError: could not convert string to float: ''

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

# Assuming 'df' is your DataFrame containing the dataset

# Separate the classes
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('is_promoted', axis=1), df['is_promoted'], test_size=0.2, random_state=119)

# Oversample the minority class only in the training set
oversample = resample(cls_true, n_samples=len(cls_false), random_state=119)
X_train_oversampled = pd.concat([X_train, oversample.drop('is_promoted', axis=1)])
y_train_oversampled = pd.concat([y_train, oversample['is_promoted']])

# Get input features from the user through the terminal
user_input_features = {}
for feature in df.columns[:4]:  # Assuming the last column is the target variable 'is_promoted'
    value = input(f"Enter the value for {feature}: ")
    user_input_features[feature] = [float(value)]  # Assuming all features are numeric, modify accordingly if needed

# Create a DataFrame with the user input
user_input_df = pd.DataFrame(user_input_features)

# Display the user-entered DataFrame for testing
print("User-Entered DataFrame:")
print(user_input_df)

# Initialize the Random Forest model
model_RF = RandomForestClassifier(random_state=40)

# Train the model with the oversampled training set
model_RF.fit(X_train_oversampled, y_train_oversampled)

# Make predictions on the user input
user_pred = model_RF.predict(user_input_df)

# Print the prediction
print(f'Model prediction for the given input: {user_pred}')


User-Entered DataFrame:
   department  region  education  gender
0         1.0     1.0        1.0     1.0


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- KPIs_met >80%
- age
- avg_training_score
- awards_won?
- length_of_service
- ...


In [20]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

# Assuming 'df' is your DataFrame containing the dataset

# Separate the classes
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]

# Get input features from the user through the terminal
user_input_features = {}
for i, feature in enumerate(df.columns[:-1]):  # Assuming the last column is the target variable 'is_promoted'
    while True:
        value = input(f"Enter the value for {feature}: ")
        try:
            if i < 5:
             user_input_features[feature] = [(value)]      
            else:
                user_input_features[feature] = [int(value)]   
            break  # Break the loop if conversion is successful
        except ValueError:
            print("Invalid input. Please enter a valid numerical value.")

# Create a DataFrame with the user input
user_input_df = pd.DataFrame(user_input_features)


# Display the user-entered DataFrame for testing
print("User-Entered DataFrame:")
print(user_input_df)

# Initialize the Random Forest model
# model_RF = RandomForestClassifier(random_state=40)

# # Train the model with the oversampled training set
# model_RF.fit(X_train_oversampled, y_train_oversampled)

# # Make predictions on the user input
# user_pred = model_RF.predict(user_input_df)

# # Print the prediction
# print(f'Model prediction for the given input: {user_pred}')


User-Entered DataFrame:
  department region education gender recruitment_channel  no_of_trainings  \
0         as     as        as     as                  as                1   

   age  previous_year_rating  length_of_service  KPIs_met >80%  awards_won?  \
0  0.0                   0.0                0.0              1            1   

   avg_training_score  
0                 0.0  


In [10]:
# Define the Random Forest model
rf_model = RandomForestClassifier()

# Define the hyperparameter grid to search through
param_grid = {
    'n_estimators': range(50, 60),
    'max_depth': range(50, 60),
    'random_state': [119]
}

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')

# Make predictions on the test set using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the accuracy and print the classification report
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(class_report)

Best Hyperparameters: {'max_depth': 50, 'n_estimators': 57, 'random_state': 119}
Accuracy: 0.80
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.73      0.78       931
           1       0.76      0.87      0.81       937

    accuracy                           0.80      1868
   macro avg       0.81      0.80      0.80      1868
weighted avg       0.81      0.80      0.80      1868



In [21]:
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]
oversample = resample(cls_true,n_samples=len(cls_false), random_state=42)

ov_bal_df = pd.concat([oversample, cls_false])

In [22]:
ov_bal_df['is_promoted'].value_counts()

is_promoted
1    50140
0    50140
Name: count, dtype: int64

In [23]:
X = ov_bal_df.drop('is_promoted', axis=1)
y = ov_bal_df['is_promoted']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:



rm = range(1,101)

bst_rm =  None
best_acc = 0


for x in rm:
 model_RF = RandomForestClassifier(random_state=x)
 model_RF.fit(X_train, y_train)
# Make predictions on the test set
 y_pred = model_RF.predict(X_test)
# Calculate accuracy
 accuracy = accuracy_score(y_test, y_pred)
 if accuracy > best_acc:
     best_acc = accuracy
     bst_rm = x
     
print(f'random state : {bst_rm} Accuracy of {x}: {best_acc}')
#  class_report = classification_report(y_test, y_pred)
# Print the classification report
#  print(f"Classification Report of {x}:")
#  print(class_report)

NameError: name 'X_train' is not defined

In [30]:
X = df.drop('is_promoted', axis=1)
y = df['is_promoted']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)





In [14]:


# Assuming 'df' is your original DataFrame

# Separate the classes
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('is_promoted', axis=1), df['is_promoted'], test_size=0.2, random_state=40)

# Oversample the minority class only in the training set
oversample = resample(cls_true, n_samples=len(cls_false), random_state=40)
X_train_oversampled = pd.concat([X_train, oversample.drop('is_promoted', axis=1)])
y_train_oversampled = pd.concat([y_train, oversample['is_promoted']])



# Initialize the Random Forest model
 model_RF = RandomForestClassifier(random_state=x)

# Train the model with the oversampled training set
 model_RF.fit(X_train_oversampled, y_train_oversampled)

# Make predictions on the test set
 y_pred = model_RF.predict(X_test)

# Calculate accuracy
 accuracy = accuracy_score(y_test, y_pred)
 print(f'Accuracy of {x}: {accuracy}')
 
 
 

# Print the classification report
 class_report = classification_report(y_test, y_pred)
 print(f"Classification Report of {x}:")
 print(class_report)


Accuracy of 1: 0.9719029374201787
Classification Report of 1:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     10081
           1       0.74      1.00      0.85       881

    accuracy                           0.97     10962
   macro avg       0.87      0.98      0.92     10962
weighted avg       0.98      0.97      0.97     10962

Accuracy of 2: 0.9720853858784894
Classification Report of 2:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     10081
           1       0.74      1.00      0.85       881

    accuracy                           0.97     10962
   macro avg       0.87      0.98      0.92     10962
weighted avg       0.98      0.97      0.97     10962

Accuracy of 3: 0.9713555920452472
Classification Report of 3:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     10081
           1       0.74      1.00      0.85       8

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd



# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('is_promoted', axis=1), df['is_promoted'], test_size=0.2, random_state=40)

# Apply SMOTE only to the training set
smote = SMOTE(random_state=40)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize the Random Forest model
model_RF = RandomForestClassifier(random_state=40)

# Train the model with the oversampled training set
model_RF.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = model_RF.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')

# Print the classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)


Accuracy: 0.9137018792191206
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95     10081
           1       0.46      0.40      0.43       881

    accuracy                           0.91     10962
   macro avg       0.70      0.68      0.69     10962
weighted avg       0.91      0.91      0.91     10962



In [27]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
import pandas as pd

# Assuming 'df' is your original DataFrame

# Separate the classes
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('is_promoted', axis=1), df['is_promoted'], test_size=0.2, random_state=40)

# Oversample the minority class only in the training set
oversample = resample(cls_true, n_samples=len(cls_false), random_state=40)
X_train_oversampled = pd.concat([X_train, oversample.drop('is_promoted', axis=1)])
y_train_oversampled = pd.concat([y_train, oversample['is_promoted']])

# Define the parameter grid
param_grid = {
    'n_estimators':[30,32],
    'max_depth': [35],
    'random_state': [119]  # Keeping the random_state constant for consistency
}

# Initialize the Random Forest model
model_RF = RandomForestClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model_RF, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform grid search on the oversampled training set
grid_search.fit(X_train_oversampled, y_train_oversampled)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Print the classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)


Best Parameters: {'max_depth': 35, 'n_estimators': 32, 'random_state': 119}
Accuracy: 0.9705345739828498
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     10081
           1       0.73      1.00      0.85       881

    accuracy                           0.97     10962
   macro avg       0.87      0.98      0.91     10962
weighted avg       0.98      0.97      0.97     10962



In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

# Separate the classes
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]

# Split the data into training and testing sets with a specified random_state
X_train, X_test, y_train, y_test = train_test_split(df.drop('is_promoted', axis=1), df['is_promoted'], test_size=0.2, random_state=40)

# Initialize variables to store the best random state and accuracy
best_random_state = None
best_accuracy = 0

# Try different random state values
for random_state in range(100, 201):
    # Oversample the minority class only in the training set
    oversample = resample(cls_true, n_samples=len(cls_false), random_state=random_state)
    X_train_oversampled = pd.concat([X_train, oversample.drop('is_promoted', axis=1)])
    y_train_oversampled = pd.concat([y_train, oversample['is_promoted']])

    # Initialize the Random Forest model
    model_RF = RandomForestClassifier(random_state=random_state)

    # Train the model with the oversampled training set
    model_RF.fit(X_train_oversampled, y_train_oversampled)

    # Make predictions on the test set
    y_pred = model_RF.predict(X_test)

    # Calculate accuracy
    current_accuracy = accuracy_score(y_test, y_pred)

    # Update best_random_state if the current iteration has a higher accuracy
    if current_accuracy > best_accuracy:
        best_accuracy = current_accuracy
        best_random_state = random_state

# Print the best random state and accuracy
print(f"Best Random State: {best_random_state}, Best Accuracy: {best_accuracy}")

# Train the final model with the best random state
oversample = resample(cls_true, n_samples=len(cls_false), random_state=best_random_state)
X_train_oversampled = pd.concat([X_train, oversample.drop('is_promoted', axis=1)])
y_train_oversampled = pd.concat([y_train, oversample['is_promoted']])

final_model_RF = RandomForestClassifier(random_state=best_random_state)
final_model_RF.fit(X_train_oversampled, y_train_oversampled)

# Make predictions on the test set using the final model
y_pred_final = final_model_RF.predict(X_test)

# Calculate and print the final accuracy and classification report
accuracy_final = accuracy_score(y_test, y_pred_final)
print(f'Final Accuracy: {accuracy_final}')

class_report_final = classification_report(y_test, y_pred_final)
print(f"Final Classification Report:")
print(class_report_final)


Best Random State: 119, Best Accuracy: 0.9740923189199051
Final Accuracy: 0.9740923189199051
Final Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     10081
           1       0.76      1.00      0.86       881

    accuracy                           0.97     10962
   macro avg       0.88      0.99      0.92     10962
weighted avg       0.98      0.97      0.98     10962



In [21]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

# Assuming 'df' is your DataFrame containing the dataset

# Separate the classes
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]

# Get input features from the user through the terminal
user_input_features = {}
for i, feature in enumerate(df.columns[:-1]):  # Assuming the last column is the target variable 'is_promoted'
    while True:
        value = input(f"Enter the value for {feature}: ")
        try:
            if i < 5:
                user_input_features[feature] = [str(value)]  # Assuming the first 5 features are strings
            else:
                user_input_features[feature] = [int(value)]  # Assuming the rest are integers
            break  # Break the loop if conversion is successful
        except ValueError:
            print("Invalid input. Please enter a valid value.")

# Create a DataFrame with the user input
user_input_df = pd.DataFrame(user_input_features)

# Apply the same encoding and scaling transformations to the user-entered data
le = LabelEncoder()
user_input_df['department'] = le.fit_transform(user_input_df['department'])

# Assuming 'region' in the user input should be treated similarly (extracting digits)
user_input_df['region'] = user_input_df['region'].str.extract(r'(\d+)').astype(int)

od_education = OrdinalEncoder(categories=[["Below Secondary", "Bachelor's", "Master's & above"]], dtype=int)
user_input_df['education'] = od_education.fit_transform(user_input_df[['education']])

user_input_df['gender'] = le.fit_transform(user_input_df['gender'])

od_recruitment_channel = OrdinalEncoder(categories=[["other", "sourcing", "referred"]], dtype=int)
user_input_df['recruitment_channel'] = od_recruitment_channel.fit_transform(user_input_df[['recruitment_channel']])

# Assuming 'age', 'previous_year_rating', 'length_of_service', 'avg_training_score' are numeric
standard_scaler = StandardScaler()
numeric_columns = ['age', 'previous_year_rating', 'length_of_service', 'avg_training_score']
user_input_df[numeric_columns] = standard_scaler.fit_transform(user_input_df[numeric_columns])

# Display the user-entered DataFrame after encoding and scaling for testing
print("User-Entered DataFrame (Encoded and Scaled):")
print(user_input_df)

# Initialize the Random Forest model
model_RF = RandomForestClassifier(random_state=40)

# Train the model with the oversampled training set (assuming 'df' is used for training)
model_RF.fit(X_train_oversampled, y_train_oversampled)

# Make predictions on the user input
user_pred = model_RF.predict(user_input_df)

# Print the prediction
print(f'Model prediction for the given input: {user_pred}')


User-Entered DataFrame (Encoded and Scaled):
   department  region  education  gender  recruitment_channel  \
0           0      22          1       0                    0   

   no_of_trainings  age  previous_year_rating  length_of_service  \
0                5  0.0                   0.0                0.0   

   KPIs_met >80%  awards_won?  avg_training_score  
0              1            0                 0.0  


NameError: name 'X_train_oversampled' is not defined

In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

# Assuming 'df' is your DataFrame containing the dataset

# Separate the classes
cls_false = df[df['is_promoted'] == 0]
cls_true = df[df['is_promoted'] == 1]

# Get input features from the user through the terminal
user_input_features = {}
for i, feature in enumerate(df.columns[:-1]):  # Assuming the last column is the target variable 'is_promoted'
    while True:
        value = input(f"Enter the value for {feature}: ")
        try:
            if i < 5:
                user_input_features[feature] = [str(value)]  # Assuming the first 5 features are strings
            else:
                user_input_features[feature] = [float(value)]  # Convert input to float for numeric features
            break  # Break the loop if conversion is successful
        except ValueError:
            print("Invalid input. Please enter a valid value.")

# Create a DataFrame with the user input
user_input_df = pd.DataFrame(user_input_features)

# Apply the same encoding and scaling transformations to the user-entered data
le = LabelEncoder()
user_input_df['department'] = le.fit_transform(user_input_df['department'])

# Assuming 'region' in the user input should be treated similarly (extracting digits)
user_input_df['region'] = user_input_df['region'].str.extract(r'(\d+)').astype(int)

od_education = OrdinalEncoder(categories=[["Below Secondary", "Bachelor's", "Master's & above"]], dtype=int)
user_input_df['education'] = od_education.fit_transform(user_input_df[['education']])

user_input_df['gender'] = le.fit_transform(user_input_df['gender'])

od_recruitment_channel = OrdinalEncoder(categories=[["other", "sourcing", "referred"]], dtype=int)
user_input_df['recruitment_channel'] = od_recruitment_channel.fit_transform(user_input_df[['recruitment_channel']])

# Assuming 'age', 'previous_year_rating', 'length_of_service', 'avg_training_score' are numeric
standard_scaler = StandardScaler()
numeric_columns = ['age', 'previous_year_rating', 'length_of_service', 'avg_training_score']
user_input_df[numeric_columns] = standard_scaler.fit_transform(user_input_df[numeric_columns])

# Display the user-entered DataFrame after encoding and scaling for testing
print("User-Entered DataFrame (Encoded and Scaled):")
print(user_input_df)

# Initialize the Random Forest model
model_RF = RandomForestClassifier(random_state=40)

# Train the model with the oversampled training set (assuming 'df' is used for training)
model_RF.fit(X_train_oversampled, y_train_oversampled)

# Make predictions on the user input
user_pred = model_RF.predict(user_input_df)

# Print the prediction
print(f'Model prediction for the given input: {user_pred}')


User-Entered DataFrame (Encoded and Scaled):
   department  region  education  gender  recruitment_channel  \
0           0      22          1       0                    0   

   no_of_trainings  age  previous_year_rating  length_of_service  \
0              6.0  0.0                   0.0                0.0   

   KPIs_met >80%  awards_won?  avg_training_score  
0            1.0          0.0                 0.0  


NameError: name 'X_train_oversampled' is not defined

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

def encode_and_scale_features(df):
    # Encoding
    le_department = LabelEncoder()
    df['department'] = le_department.fit_transform(df['department'])

    df['region'] = df['region'].str.extract(r'(\d+)').astype(int)

    od_education = OrdinalEncoder(categories=[["Below Secondary", "Bachelor's", "Master's & above"]], dtype=int)
    df['education'] = od_education.fit_transform(df[['education']])

    le_gender = LabelEncoder()
    df['gender'] = le_gender.fit_transform(df['gender'])

    od_recruitment_channel = OrdinalEncoder(categories=[["other", "sourcing", "referred"]], dtype=int)
    df['recruitment_channel'] = od_recruitment_channel.fit_transform(df[['recruitment_channel']])

    # Normalization
    standard_scaler = StandardScaler()
    standard_columns = ['age', 'previous_year_rating', 'length_of_service', 'avg_training_score']
    df[standard_columns] = standard_scaler.fit_transform(df[standard_columns])

    return df

# Load dataset
df = pd.read_csv('train_LZdllcl.csv')
pd.set_option('display.max_columns', None)

# Handling null values
df['education'] = df['education'].fillna(df['education'].mode()[0])
df['previous_year_rating'] = df['previous_year_rating'].fillna(df['previous_year_rating'].median())

# Drop 'employee_id'
df.drop('employee_id', axis=1, inplace=True)

# Handling outliers
columns_to_clip = ['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service']

for column in columns_to_clip:
    Q1 = np.percentile(df[column], 25)
    Q3 = np.percentile(df[column], 75)
    IQR = Q3 - Q1
    low_lim = Q1 - 1.5 * IQR
    upp_lim = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower=low_lim, upper=upp_lim)

# Preprocess the original DataFrame
df_preprocessed = encode_and_scale_features(df)

# Get input features from the user through the terminal
user_input_features = {}
user_input_df = pd.DataFrame(columns=df.columns[:-1])

for feature in df.columns[:-1]:
    while True:
        value = input(f"Enter the value for {feature}: ")
        try:
            if df[feature].dtype == 'O':
                # Handle categorical values
                user_input_features[feature] = [value]
            else:
                # Handle numeric values
                user_input_features[feature] = [float(value)]
            break  # Break the loop if conversion is successful
        except ValueError:
            print("Invalid input. Please enter a valid value.")

# Combine user-entered features with preprocessed DataFrame
user_input_df = pd.DataFrame(user_input_features)
df_combined = pd.concat([df_preprocessed, user_input_df], ignore_index=True)

# Perform preprocessing again for the combined DataFrame
df_combined = encode_and_scale_features(df_combined)

# Extract user-entered features for prediction
user_input_features_for_prediction = df_combined.iloc[-1, :-1].values.reshape(1, -1)

# Separate the user-entered features for prediction
user_input_for_prediction = df_combined.iloc[-1, -1]

# Separate the combined DataFrame into train and test sets
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(
    df_combined.drop('is_promoted', axis=1), df_combined['is_promoted'], test_size=0.2, random_state=119
)

# Initialize the Random Forest model
model_RF = RandomForestClassifier(random_state=40)

# Train the model with the combined training set
model_RF.fit(X_train_combined, y_train_combined)

# Make predictions on the test set
y_pred_test = model_RF.predict(X_test_combined)

# Calculate accuracy on the original test set
accuracy_test = accuracy_score(y_test_combined, y_pred_test)
print(f'Accuracy on the combined test set: {accuracy_test}')

# Make predictions on the user input for prediction
user_pred_for_prediction = model_RF.predict(user_input_features_for_prediction)

# Print the prediction for the user input
print(f'Model prediction for the given input: {user_pred_for_prediction}')

# Evaluate the model on the user input (assuming you have the true label for user input)
# For demonstration purposes, let's assume the true label


Invalid input. Please enter a valid value.


KeyboardInterrupt: Interrupted by user

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

def encode_and_scale_features(df):
    # Encoding
    le_department = LabelEncoder()
    df['department'] = le_department.fit_transform(df['department'])

    df['region'] = df['region'].str.extract(r'(\d+)').astype(int)

    od_education = OrdinalEncoder(categories=[["Below Secondary", "Bachelor's", "Master's & above"]], dtype=int)
    df['education'] = od_education.fit_transform(df[['education']])

    le_gender = LabelEncoder()
    df['gender'] = le_gender.fit_transform(df['gender'])

    od_recruitment_channel = OrdinalEncoder(categories=[["other", "sourcing", "referred"]], dtype=int)
    df['recruitment_channel'] = od_recruitment_channel.fit_transform(df[['recruitment_channel']])

    # Normalization
    standard_scaler = StandardScaler()
    standard_columns = ['age', 'previous_year_rating', 'length_of_service', 'avg_training_score']
    df[standard_columns] = standard_scaler.fit_transform(df[standard_columns])

    return df

# Load dataset
df = pd.read_csv('train_LZdllcl.csv')
pd.set_option('display.max_columns', None)

# Handling null values
df['education'] = df['education'].fillna(df['education'].mode()[0])
df['previous_year_rating'] = df['previous_year_rating'].fillna(df['previous_year_rating'].median())

# Drop 'employee_id'
df.drop('employee_id', axis=1, inplace=True)

# Handling outliers
columns_to_clip = ['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service']

for column in columns_to_clip:
    Q1 = np.percentile(df[column], 25)
    Q3 = np.percentile(df[column], 75)
    IQR = Q3 - Q1
    low_lim = Q1 - 1.5 * IQR
    upp_lim = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower=low_lim, upper=upp_lim)

# Preprocess the original DataFrame
df_preprocessed = encode_and_scale_features(df)

# Get input features from the user through the terminal
user_input_features = {}
user_input_df = pd.DataFrame(columns=df.columns[:-1])

for feature in df.columns[:-1]:
    while True:
        value = input(f"Enter the value for {feature}: ")
        try:
            if df[feature].dtype == 'O':
                # Handle categorical values
                user_input_features[feature] = [value]
            else:
                # Handle numeric values
                user_input_features[feature] = [float(value)]
            break  # Break the loop if conversion is successful
        except ValueError:
            print("Invalid input. Please enter a valid value.")

# Combine user-entered features with preprocessed DataFrame
user_input_df = pd.DataFrame(user_input_features)
df_combined = pd.concat([df_preprocessed, user_input_df], ignore_index=True)

# Perform preprocessing again for the combined DataFrame
df_combined = encode_and_scale_features(df_combined)

# Extract user-entered features for prediction
user_input_features_for_prediction = df_combined.iloc[-1, :-1].values.reshape(1, -1)

# Separate the user-entered features for prediction
user_input_for_prediction = df_combined.iloc[-1, -1]

# Separate the combined DataFrame into train and test sets
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(
    df_combined.drop('is_promoted', axis=1), df_combined['is_promoted'], test_size=0.2, random_state=119
)

# Initialize the Random Forest model
model_RF = RandomForestClassifier(random_state=40)

# Train the model with the combined training set
model_RF.fit(X_train_combined, y_train_combined)

# Make predictions on the test set
y_pred_test = model_RF.predict(X_test_combined)

# Calculate accuracy on the original test set
accuracy_test = accuracy_score(y_test_combined, y_pred_test)
print(f'Accuracy on the combined test set: {accuracy_test}')

# Make predictions on the user input for prediction
user_pred_for_prediction = model_RF.predict(user_input_features_for_prediction)

# Print the prediction for the user input
print(f'Model prediction for the given input: {user_pred_for_prediction}')

# Evaluate the model on the user input (assuming you have the true label for user input)
# For demonstration purposes, let's assume the true label


Invalid input. Please enter a valid value.
Invalid input. Please enter a valid value.
Invalid input. Please enter a valid value.
Invalid input. Please enter a valid value.
Invalid input. Please enter a valid value.
Invalid input. Please enter a valid value.
Invalid input. Please enter a valid value.
Invalid input. Please enter a valid value.
Invalid input. Please enter a valid value.
Invalid input. Please enter a valid value.
Invalid input. Please enter a valid value.
Invalid input. Please enter a valid value.
Invalid input. Please enter a valid value.


KeyboardInterrupt: Interrupted by user

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

def encode_and_scale_features(df):
    # Encoding
    le_department = LabelEncoder()
    df['department'] = le_department.fit_transform(df['department'])

    df['region'] = df['region'].str.extract(r'(\d+)').astype(int)

    od_education = OrdinalEncoder(categories=[["Below Secondary", "Bachelor's", "Master's & above"]], dtype=int)
    df['education'] = od_education.fit_transform(df[['education']])

    le_gender = LabelEncoder()
    df['gender'] = le_gender.fit_transform(df['gender'])

    od_recruitment_channel = OrdinalEncoder(categories=[["other", "sourcing", "referred"]], dtype=int)
    df['recruitment_channel'] = od_recruitment_channel.fit_transform(df[['recruitment_channel']])

    # Normalization
    standard_scaler = StandardScaler()
    standard_columns = ['age', 'previous_year_rating', 'length_of_service', 'avg_training_score']
    df[standard_columns] = standard_scaler.fit_transform(df[standard_columns])

    return df

# Load dataset
df = pd.read_csv('train_LZdllcl.csv')
pd.set_option('display.max_columns', None)

# Handling null values
df['education'] = df['education'].fillna(df['education'].mode()[0])
df['previous_year_rating'] = df['previous_year_rating'].fillna(df['previous_year_rating'].median())

# Drop 'employee_id'
df.drop('employee_id', axis=1, inplace=True)

# Handling outliers
columns_to_clip = ['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service']

for column in columns_to_clip:
    Q1 = np.percentile(df[column], 25)
    Q3 = np.percentile(df[column], 75)
    IQR = Q3 - Q1
    low_lim = Q1 - 1.5 * IQR
    upp_lim = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower=low_lim, upper=upp_lim)

# Preprocess the original DataFrame
df_preprocessed = encode_and_scale_features(df)

# Get input features from the user through the terminal
user_input_features = {}
user_input_df = pd.DataFrame(columns=df.columns[:-1])

for feature in df.columns[:-1]:
    while True:
        value = input(f"Enter the value for {feature}: ")
        try:
            if df[feature].dtype == 'O':
                # Handle categorical values
                user_input_features[feature] = [value]
            else:
                # Handle numeric values
                user_input_features[feature] = [float(value)]
            break  # Break the loop if conversion is successful
        except ValueError:
            print("Invalid input. Please enter a valid value.")

# Combine user-entered features with preprocessed DataFrame
user_input_df = pd.DataFrame(user_input_features)
df_combined = pd.concat([df_preprocessed, user_input_df], ignore_index=True)

# Perform preprocessing again for the combined DataFrame
df_combined = encode_and_scale_features(df_combined)

# Extract user-entered features for prediction
user_input_features_for_prediction = df_combined.iloc[-1, :-1].values.reshape(1, -1)

# Separate the user-entered features for prediction
user_input_for_prediction = df_combined.iloc[-1, -1]

# Separate the combined DataFrame into train and test sets
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(
    df_combined.drop('is_promoted', axis=1), df_combined['is_promoted'], test_size=0.2, random_state=119
)

# Initialize the Random Forest model
model_RF = RandomForestClassifier(random_state=40)

# Train the model with the combined training set
model_RF.fit(X_train_combined, y_train_combined)

# Make predictions on the test set
y_pred_test = model_RF.predict(X_test_combined)

# Calculate accuracy on the original test set
accuracy_test = accuracy_score(y_test_combined, y_pred_test)
print(f'Accuracy on the combined test set: {accuracy_test}')

# Make predictions on the user input for prediction
user_pred_for_prediction = model_RF.predict(user_input_features_for_prediction)

# Print the prediction for the user input
print(f'Model prediction for the given input: {user_pred_for_prediction}')

# Evaluate the model on the user input (assuming you have the true label for user input)
# For demonstration purposes, let's assume the true label
if user_pred_for_prediction[0] in {0, 1}:
    true_label_user_input = 1  # Assume the true label is 1 (promoted)
    accuracy_user_input = accuracy_score([true_label_user_input], user_pred_for_prediction)
    class_report_user_input = classification_report([true_label_user_input], user_pred_for_prediction)

    print(f"Accuracy for user input: {accuracy_user_input}")
    print(f"Classification Report for user input:\n{class_report_user_input}")
else:
    print("User input cannot be evaluated.")


Invalid input. Please enter a valid value.
Invalid input. Please enter a valid value.
