Import Required Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler

Import Train Dataset

In [2]:
# Read the CSV file into a pandas DataFrame
train=pd.read_csv("train.csv")

Import test Dataset

In [3]:
test = pd.read_csv("test.csv", encoding='latin1')

For output .CSV file

In [4]:
c1=test['CandidateID'].copy()

# DATA PRECROSSING 

Creating a binary feature indicating if the residence and branch are in the same pincode

In [5]:
train['Department'] = train['Department'].replace( 'Housing Finance','Affordable Housing')
train['Department'] = train['Department'].replace( 'Personal Loans','Secured Business Loan')

Creating a binary feature indicating if the residence and branch are in the same pincode

In [7]:

# train['same_pincode'] = (train['Residential Pincode'] == train['Branch Pincode']).astype(int)
train['residential_first_digit'] = train['Residential Pincode'].astype(str).str[0]
train['branch_first_digit'] = train['Branch Pincode'].astype(str).str[0]

# Compare the first digits and create a binary feature indicating if they match
train['same_first_digit'] = (train['residential_first_digit'] == train['branch_first_digit']).astype(int)
train.drop(['residential_first_digit','branch_first_digit'],axis=1,inplace=True)

In [81]:


# train['same_pincode'] = (train['Residential Pincode'] == train['Branch Pincode']).astype(int)
test['residential_first_digit'] = test['Residential Pincode'].astype(str).str[0]
test['branch_first_digit'] = test['Branch Pincode'].astype(str).str[0]

# Compare the first digits and create a binary feature indicating if they match
test['same_first_digit'] = (test['residential_first_digit'] == test['branch_first_digit']).astype(int)
test.drop(['residential_first_digit','branch_first_digit'],axis=1,inplace=True)

In [82]:
category_ratios = train.groupby('Location Code')['Performance'].mean().reset_index()


In [83]:
category_ratios1 = train.groupby('same_first_digit')['Performance'].sum().reset_index()

In [84]:
category_ratio_dict = dict(zip(category_ratios['Location Code'], category_ratios['Performance']))

# Replace each category with its corresponding ratio
train['Location Code'] = train['Location Code'].map(category_ratio_dict)

In [85]:
test['Location Code'] = test['Location Code'].map(category_ratio_dict)

***Addressing the presence of redundant classes within columns by replacing them with more appropriate categories.***

In [86]:
train['What was the average ticket size handled at your end in previous role ?'] = train['What was the average ticket size handled at your end in previous role ?'].replace(' INR 2L - INR 5L' , 'INR 2L - INR 5L')

In [87]:
test['What was the average ticket size handled at your end in previous role ?'] = test['What was the average ticket size handled at your end in previous role ?'].replace('åÊINR 2L - INR 5L' , 'INR 2L - INR 5L')

In [88]:
train.drop(['Name of your Previous Organization / Company','DOJ','Designation','CandidateID','How many Organization that you have worked before joining Piramal Finance ?'],axis=1,inplace=True)

In [89]:
test.drop(['Name of your Previous Organization / Company','DOJ','Designation','CandidateID','How many Organization that you have worked before joining Piramal Finance ?'],axis=1,inplace=True)

In [90]:
# Replace 'part' with 'Part' in the 'Category' column
train['Have you Completed your Graduation ?'] = train['Have you Completed your Graduation ?'].replace('Part time', 'Part Time')

In [91]:
# Replace 'part' with 'Part' in the 'Category' column
test['Have you Completed your Graduation ?'] = test['Have you Completed your Graduation ?'].replace('Part time', 'Part Time')

In [92]:
train['Highest Educational Qualification'] = train['Highest Educational Qualification'].replace('Graduate', 'Under Graduate')

In [93]:
test['Highest Educational Qualification'] = test['Highest Educational Qualification'].replace('Graduate', 'Under Graduate')

In [94]:
train['How did you come to know about the role at Piramal Finance ?']=train['How did you come to know about the role at Piramal Finance ?'].replace('others','Others')

In [95]:
test['How did you come to know about the role at Piramal Finance ?']=test['How did you come to know about the role at Piramal Finance ?'].replace('others','Others')

In [96]:
train['What was the average ticket size handled at your end in previous role ?'] = train['What was the average ticket size handled at your end in previous role ?'].replace('INR 5L - INR 10L', 'INR 5L - INR 15L')

In [97]:
test['What was the average ticket size handled at your end in previous role ?'] = test['What was the average ticket size handled at your end in previous role ?'].replace('INR 5L - INR 10L', 'INR 5L - INR 15L')

In [98]:
train['Which Products you are selling in your pervious role ?'].value_counts()

Which Products you are selling in your pervious role ?
Housing Loan                                                                                                  94
Others;                                                                                                       88
Housing Loans;                                                                                                78
Housing Loans;MSME / SME Loans;                                                                               78
Personal Loans;                                                                                               38
Car Loans / Used Car Loans;                                                                                   28
MSME / SME Loans;                                                                                             27
Housing Loans;Others;                                                                                         24
Used Car Loan                            

***There are too many different categories (redundant ones), so they need to be consolidated in both the train and test datasets.***

In [99]:
test['Which Products you are selling in your pervious role ?'].value_counts()

Which Products you are selling in your pervious role ?
Others;                                                                        27
Housing Loan                                                                   25
Housing Loans;                                                                 17
Housing Loans;MSME / SME Loans;                                                 9
MSME / SME Loans;                                                               8
Personal Loans;Others;                                                          7
Car Loans / Used Car Loans;                                                     7
Others;FMCG;                                                                    6
Others;Housing Loans;                                                           5
Current / Savings account [CASA];                                               5
Used Car Loan                                                                   5
Personal Loans;                            

In [100]:


# Mapping dictionary for simplification
mapping = {
    'Housing Loan': 'Housing Loan',
    'Others;': 'Others',
    'Housing Loans;': 'Housing Loan',
    'Housing Loans;MSME / SME Loans;': 'Housing Loan',
    'Personal Loans;': 'Personal Loan',
    'Car Loans / Used Car Loans;': 'Car Loan',
    'MSME / SME Loans;': 'MSME / SME Loan',
    'Housing Loans;Others;': 'Housing Loan',
    'Used Car Loan': 'Car Loan',
    'Loan Against Property/ Secured Business Loan''Loan Against Property''_Loan Against Property' : 'Loan Against Property',
    'Others;Housing Loans;': 'Housing Loan',
    'Personal Loans;Others;': 'Personal Loan',
    'Unsecure Business Loan': 'Business Loan',
    'Personal Loan': 'Personal Loan',
    'Car Loans / Used Car Loans;Others;''Car Loans / Used Car Loans;FMCG;''Car Loans / Used Car Loans;Housing Loans;Others;' : 'Car Loan',
    'MSME / SME Loans;Personal Loans;': 'MSME / SME Loan',
    'MSME / SME Loans;Housing Loans;Car Loans / Used Car Loans;Personal Loans;  ': 'Housing Loan',
    'Others;FMCG;': 'Others',
    'Others;MSME / SME Loans;': 'MSME / SME Loan',
    'Current / Savings account [CASA];': 'CASA Account',
    'Others;Current / Saving account [CASA];': 'CASA Account',
    'Others;Personal Loans;': 'Personal Loan',
    'MSME / SME Loans;Others;': 'MSME / SME Loan',
    'Housing Loans;Personal Loans;': 'Housing Loan',
    'Personal Loans;MSME / SME Loans;': 'MSME / SME Loan',
    'Others;Car Loans / Used Car Loans;': 'Car Loan',
    'Current / Saving account [CASA];Others;': 'CASA Account',
    'Current / Savings account [CASA];Others;''Others;Current / Savings account [CASA];' : 'CASA Account',
    'Housing Loans;Car Loans / Used Car Loans;': 'Car Loan',
    'FMCG;': 'Others',
    'Personal Loans;Housing Loans;': 'Housing Loan',
    'Current Account â\x80\x93 Saving Account/Others': 'CASA Account',
    'Housing Loans;Current / Saving account [CASA];''Personal Loans;Car Loans / Used Car Loans;MSME / SME Loans;Housing Loans;''Personal Loans;Current / Saving account [CASA];': 'Housing Loan',
    'Others;Current / Savings account [CASA];Personal Loans;': 'Personal Loan',
    'Car Loans / Used Car Loans;Housing Loans;': 'Car Loan',
    'Current / Saving account [CASA];Personal Loans;': 'Personal Loan',
    'Housing Loans;Personal Loans;MSME / SME Loans;Car Loans / Used Car Loans;': 'Housing Loan',
    'Housing Loans;MSME / SME Loans;Personal Loans;': 'Housing Loan',
    'FMCG;Current / Saving account [CASA];': 'Others',
    'Car Loans / Used Car Loans;Personal Loans;FMCG;MSME / SME Loans;': 'Car Loan',
    'Car Loans / Used Car Loans;Others;Personal Loans;': 'Car Loan',
    'Personal Loans;Others;Current / Savings account [CASA];FMCG;': 'Personal Loan',
    'Housing Loans;FMCG;': 'Housing Loan',
    'Housing Loans;MSME / SME Loans;Others;': 'Housing Loan',
    'Housing Loans;Others;Car Loans / Used Car Loans;': 'Car Loan',
    'Current / Saving account [CASA];Housing Loans;': 'Housing Loan',
    'Housing Loans;MSME / SME Loans;Personal Loans;FMCG;': 'Housing Loan',
    'Car Loans / Used Car Loans;MSME / SME Loans;': 'Car Loan',
    'MSME / SME Loans;Car Loans / Used Car Loans;': 'Car Loan',
    'FMCG;Others;': 'Others',
    'MSME / SME Loans;Personal Loans;Others;': 'MSME / SME Loan',
    'Current / Savings account [CASA];Housing Loans;MSME / SME Loans;': 'Housing Loan',
    'Current / Savings account [CASA];Housing Loans;Personal Loans;': 'Housing Loan',
    'MSME / SME Loans;Current / Saving account [CASA];': 'MSME / SME Loan',
    'MSME / SME Loans;Housing Loans;Personal Loans;': 'Housing Loan',
    'Current / Savings account [CASA];Personal Loans;Car Loans / Used Car Loans;MSME / SME Loans;': 'Personal Loan'
}

# Replace original categories with simplified categories
train['Which Products you are selling in your pervious role ?'] = train['Which Products you are selling in your pervious role ?'].replace(mapping)

# Print the frequency of each simplified category
print(train['Which Products you are selling in your pervious role ?'].value_counts())


Which Products you are selling in your pervious role ?
Housing Loan                                                                                                  314
Others                                                                                                        104
Personal Loan                                                                                                  88
MSME / SME Loan                                                                                                66
Car Loan                                                                                                       66
CASA Account                                                                                                   26
Loan Against Property/ Secured Business Loan                                                                   23
Business Loan                                                                                                  18
Car Loans / Used Car Loans;Others

In [101]:
mapping_products = {
    'Housing Loan': 'Housing Loan',
    'Others': 'Others',
    'Personal Loan': 'Personal Loan',
    'MSME / SME Loan': 'MSME / SME Loan',
    'Car Loan': 'Car Loan',
    'CASA Account': 'CASA Account',
    'Loan Against Property/ Secured Business Loan': 'Loan Against Property',    
    'Business Loan': 'Business Loan',
    'Car Loans / Used Car Loans;Others;': 'Car Loan',
    'MSME / SME Loans;Housing Loans;': 'Others',
    'Current / Savings account [CASA];Others;': 'Others',
    'Housing Loans;Current / Saving account [CASA];': 'Housing Loan',
    'Personal Loans;Current / Saving account [CASA];': 'Personal Loan',
    'Current / Savings account [CASA];Housing Loans;MSME / SME Loans;Personal Loans;Car Loans / Used Car Loans;': 'Others'
}


# Replace original categories with simplified categories
train['Which Products you are selling in your pervious role ?'] = train['Which Products you are selling in your pervious role ?'].replace(mapping_products)

# Print the frequency of each simplified category
print(train['Which Products you are selling in your pervious role ?'].value_counts())

Which Products you are selling in your pervious role ?
Housing Loan             316
Others                   120
Personal Loan             89
Car Loan                  81
MSME / SME Loan           66
CASA Account              26
Loan Against Property     23
Business Loan             18
Name: count, dtype: int64


In [102]:
# test['Which Products you are selling in your pervious role ?'] = test['Which Products you are selling in your pervious role ?'].replace(mapping)


In [103]:
mapping2 = {
    'Car Loans / Used Car Loans;Others;': 'Car Loan',
    'Car Loans / Used Car Loans;Personal Loans;': 'Car Loan',
    'Loan Against Property/ Secured Business Loan': 'Loan Against Property',
    'MSME / SME Loans;Housing Loans;': 'Others',
    'Others;Current / Savings account [CASA];': 'Others',
    'Car Loans / Used Car Loans;FMCG;': 'Car Loan',
    'Current / Savings account [CASA];Others;': 'Others',
    'Car Loans / Used Car Loans;Housing Loans;Others;': 'Car Loan',
    'Personal Loans;Car Loans / Used Car Loans;MSME / SME Loans;Housing Loans;': 'Car Loan',
    'Personal Loans;Current / Saving account [CASA];': 'Personal Loan',
    'Car Loans / Used Car Loans;Current / Savings account [CASA];Personal Loans;': 'Car Loan',
    'Current / Savings account [CASA];Personal Loans;Housing Loans;': 'Others',
    'Car Loans / Used Car Loans;Current / Saving account [CASA];': 'Car Loan',
    'Housing Loans;MSME / SME Loans;Current / Savings account [CASA];': 'Others',
    'Current Account Ì¢åÛåÒ Saving Account/Others': 'Others',
    'MSME / SME Loans;Housing Loans;Personal Loans;Car Loans / Used Car Loans;': 'Others'
}
test['Which Products you are selling in your pervious role ?'] = test['Which Products you are selling in your pervious role ?'].replace(mapping2)
print(test['Which Products you are selling in your pervious role ?'].value_counts())

Which Products you are selling in your pervious role ?
Others;                                           27
Housing Loan                                      25
Housing Loans;                                    17
Car Loan                                          10
Housing Loans;MSME / SME Loans;                    9
MSME / SME Loans;                                  8
Others                                             8
Car Loans / Used Car Loans;                        7
Personal Loans;Others;                             7
Others;FMCG;                                       6
Others;Housing Loans;                              5
Used Car Loan                                      5
Unsecure Business Loan                             5
Current / Savings account [CASA];                  5
Personal Loans;                                    5
Others;Personal Loans;                             4
Housing Loans;Others;                              4
Others;Current / Saving account [CASA];     

In [104]:
test['Which Products you are selling in your pervious role ?'] = test['Which Products you are selling in your pervious role ?'].replace(mapping)

***frequencies of different categories for each categorical column***

In [105]:
categorical_cols = train.select_dtypes(include=['object']).columns

for col in categorical_cols:
    print(f"Column: {col}")
    print(train[col].value_counts())
    print()

Column: Have you Completed your Graduation ?
Have you Completed your Graduation ?
Full Time    614
Part Time     30
Name: count, dtype: int64

Column: Highest Educational Qualification
Highest Educational Qualification
Under Graduate     454
Post Graduate      179
Diploma Holders      6
Others               5
Name: count, dtype: int64

Column: Previous Industry worked with [before joining Piramal]
Previous Industry worked with [before joining Piramal]
NBFC         380
Banking      188
Others        82
others        66
Insurance     15
Non NBFC      14
Name: count, dtype: int64

Column: Average Incentive [per month] earned in your pervious company ?
Average Incentive [per month] earned in your pervious company ?
Above 10K       301
3K-7K           209
7K-10K          120
Nil              72
Less than 3K     43
Name: count, dtype: int64

Column: How did you come to know about the role at Piramal Finance ?
How did you come to know about the role at Piramal Finance ?
Referral              

In [106]:
train['Highest Educational Qualification'] = train['Highest Educational Qualification'].replace('Diploma Holders', 'Others')

***some statistical information about the training dataset***

In [107]:
train.describe()

Unnamed: 0,Total no of years Experience [before joining Piramal],How many are earning family members ? [Other then yourself]2,How many members are dependent on you ?,Location Code,Residential Pincode,Branch Pincode,Performance,same_first_digit
count,671.0,563.0,745.0,745.0,745.0,743.0,745.0,745.0
mean,46251.7,6282.300178,2.573154,0.395973,443646.022819,441313.527591,0.395973,0.958389
std,1197978.0,48964.725585,1.447218,0.29642,188146.764093,188283.541525,0.489387,0.199832
min,0.0,0.0,0.0,0.0,101213.0,110019.0,0.0,0.0
25%,2.0,1.0,2.0,0.181818,274509.0,273001.0,0.0,1.0
50%,4.0,1.0,2.0,0.384615,500008.0,500014.0,0.0,1.0
75%,6.0,2.0,3.0,0.538462,605003.0,602552.0,1.0,1.0
max,31032020.0,600000.0,10.0,1.0,847224.0,842002.0,1.0,1.0


***Handling the missing values in the categorical columns and Droping the columns***

In [108]:

train['Have you Completed your Graduation ?'] = train['Have you Completed your Graduation ?'].fillna('Full Time')


In [109]:
test['Have you Completed your Graduation ?'] = test['Have you Completed your Graduation ?'].fillna('Full Time')

In [110]:
train['Highest Educational Qualification']=train['Highest Educational Qualification'].fillna('Under Graduate')

In [111]:
test['Highest Educational Qualification']=test['Highest Educational Qualification'].fillna('Under Graduate')

In [112]:
train.drop(['Location Code','Residential Pincode','Branch Pincode'],axis=1,inplace=True)

In [113]:
test.drop(['Location Code','Residential Pincode','Branch Pincode'],axis=1,inplace=True)

In [114]:
# Calculate the median of the column
median_value = train['Total no of years Experience [before joining Piramal]'].median()

# Replace values greater than 50 with the median
train.loc[train['Total no of years Experience [before joining Piramal]'] > 50, 'Total no of years Experience [before joining Piramal]'] = median_value

In [115]:
# Calculate the median of the column
median_value = train['Total no of years Experience [before joining Piramal]'].median()

# Replace values greater than 50 with the median
test.loc[test['Total no of years Experience [before joining Piramal]'] > 50, 'Total no of years Experience [before joining Piramal]'] = median_value

In [116]:

train['Total no of years Experience [before joining Piramal]'] = train['Total no of years Experience [before joining Piramal]'].fillna(0)


In [117]:

test['Total no of years Experience [before joining Piramal]'] = test['Total no of years Experience [before joining Piramal]'].fillna(0)


In [118]:
median_value = train['How many are earning family members ? [Other then yourself]2'].median()
# Replace values greater than 50 with the median
train.loc[train['How many are earning family members ? [Other then yourself]2'] > 5,'How many are earning family members ? [Other then yourself]2'] = median_value

In [119]:
median_value = train['How many are earning family members ? [Other then yourself]2'].median()
# Replace values greater than 50 with the median
test.loc[test['How many are earning family members ? [Other then yourself]2'] > 5,'How many are earning family members ? [Other then yourself]2'] = median_value

In [120]:
train['How many are earning family members ? [Other then yourself]2'] =train['How many are earning family members ? [Other then yourself]2'].fillna(median_value)

In [121]:
test['How many are earning family members ? [Other then yourself]2'] =test['How many are earning family members ? [Other then yourself]2'].fillna(median_value)

In [122]:
train['Which Products you are selling in your pervious role ?']=train['Which Products you are selling in your pervious role ?'].fillna('Others')

In [123]:
test['Which Products you are selling in your pervious role ?']=test['Which Products you are selling in your pervious role ?'].fillna('Others')

In [124]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 745 entries, 0 to 744
Data columns (total 14 columns):
 #   Column                                                                   Non-Null Count  Dtype  
---  ------                                                                   --------------  -----  
 0   Have you Completed your Graduation ?                                     745 non-null    object 
 1   Highest Educational Qualification                                        745 non-null    object 
 2   Total no of years Experience [before joining Piramal]                    745 non-null    float64
 3   Previous Industry worked with [before joining Piramal]                   745 non-null    object 
 4   Average Incentive [per month] earned in your pervious company ?          745 non-null    object 
 5   How did you come to know about the role at Piramal Finance ?             745 non-null    object 
 6   Which Products you are selling in your pervious role ?                   7

***Dropping the unnecessary columns from the dataset***

In [125]:
train.drop('How did you come to know about the role at Piramal Finance ?',axis=1,inplace=True)
test.drop('How did you come to know about the role at Piramal Finance ?',axis=1,inplace=True)


In [126]:

train.drop('Previous Industry worked with [before joining Piramal]',axis=1,inplace=True)
test.drop('Previous Industry worked with [before joining Piramal]',axis=1,inplace=True)

Displaying the list of categorical column names

In [127]:
train.to_csv('TRAIN_FINAL.CSV', index=True, encoding='utf-8')


In [128]:
cat_cols = []

# Iterate over each column in the DataFrame
for col in train.columns:
  
    if train[col].dtype == 'object' :
    
        cat_cols.append(col)


print("Categorical columns:", cat_cols)

Categorical columns: ['Have you Completed your Graduation ?', 'Highest Educational Qualification', 'Average Incentive [per month] earned in your pervious company ?', 'Which Products you are selling in your pervious role ?', 'What was the average ticket size handled at your end in previous role ?', 'How many members are there in your family ?', 'Department']


Generates dummy variables for categorical columns in the 'train' DataFrame, utilizing the 'cat_cols' list to specify which columns to encode. The resulting DataFrame 'train_e' contains the original columns along with the encoded categorical variables.

In [129]:
train_e = pd.get_dummies(train, columns=cat_cols)


In [130]:
test_e = pd.get_dummies(test, columns=cat_cols)

In [131]:
# Get boolean columns
boolean_cols = train_e.select_dtypes(include=bool).columns.tolist()

# Convert boolean columns to 0/1
train_e[boolean_cols] = train_e[boolean_cols].astype(int)

In [132]:
# Get boolean columns
boolean_cols = test_e.select_dtypes(include=bool).columns.tolist()

# Convert boolean columns to 0/1
test_e[boolean_cols] = test_e[boolean_cols].astype(int)

SPLIT THE DATA INTO TRAIN AND TEST FORM  THE TRAIN DATA

In [133]:
y=train_e['Performance'].copy()

In [134]:
X=train_e.drop('Performance',axis=1)

In [135]:
cat_cols = []

# Iterate over each column in the DataFrame
for col in train_e.columns:
    # Check if the column's data type is object or category
    if train_e[col].dtype == 'object' :
        # If yes, add the column name to the list of categorical columns
        cat_cols.append(col)

# Display the list of categorical column names
print("Categorical columns:", cat_cols)

Categorical columns: []


***This code implements random oversampling on the training data to address class imbalance. It generates synthetic samples for the minority class(es) to balance the class distribution.***

In [136]:
from imblearn.over_sampling import RandomOverSampler

# Instantiate the RandomOverSampler
oversampler = RandomOverSampler(random_state=42)

# Perform oversampling on the training data
X1, y1 = oversampler.fit_resample(X, y)

***split the data into training and testing sets (80% train, 20% test)***

In [137]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2)

Information about X_train Data

In [138]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 720 entries, 653 to 427
Data columns (total 39 columns):
 #   Column                                                                                     Non-Null Count  Dtype  
---  ------                                                                                     --------------  -----  
 0   Total no of years Experience [before joining Piramal]                                      720 non-null    float64
 1   How many are earning family members ? [Other then yourself]2                               720 non-null    float64
 2   How many members are dependent on you ?                                                    720 non-null    int64  
 3   same_first_digit                                                                           720 non-null    int32  
 4   Have you Completed your Graduation ?_Full Time                                             720 non-null    int32  
 5   Have you Completed your Graduation ?_Part Time       

# Find Best Parameters and the Best Score

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score


X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2)

# Define the parameter grid to search
param_distributions = {
    'n_estimators': np.arange(50, 501, 50),
    'max_features': ['auto', 'sqrt', 'log2', 0.2, 0.5],
    'max_depth': np.arange(5, 51, 5),
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create a base model
rf = RandomForestClassifier(random_state=42)

# Perform randomized search with scoring based on f1 weighted
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions, n_iter=400, cv=20, 
                                   scoring='f1_weighted', verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
random_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters:", random_search.best_params_)
print("Best weighted F1 score:", random_search.best_score_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print("Weighted F1 score on test set:", weighted_f1)


Fitting 20 folds for each of 400 candidates, totalling 8000 fits


1480 fits failed out of a total of 8000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
745 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib

Best parameters: {'n_estimators': 100, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 25, 'bootstrap': False}
Best weighted F1 score: 0.734410376492049
Weighted F1 score on test set: 0.6992577931716972


In [140]:
# Create the Random Forest model with the best parameters
best_rf_model = RandomForestClassifier(n_estimators=100,
                                       min_samples_split=2,
                                       min_samples_leaf=1,
                                       max_features='sqrt',
                                       max_depth=20,
                                       bootstrap=False,
                                       random_state=42)

# Fit the model on the training data
best_rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_test = best_rf_model.predict(test_e)

# Print the predictions
print("Predictions on test set:",y_pred_test)

Predictions on test set: [1 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0
 1 0 0 0 1 1 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 1 0 1 0 1 0 1 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0
 0 1 0 0 1 1 0 0 0 1 0 1 1 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1
 0 0]


In [141]:
c2=pd.DataFrame(y_pred_test)

In [142]:
c1

0      EMP0521
1      EMP0613
2      EMP0136
3      EMP0351
4      EMP0049
        ...   
182    EMP0401
183    EMP0408
184    EMP0248
185    EMP0148
186    EMP0422
Name: CandidateID, Length: 187, dtype: object

In [143]:
c2.columns = ['Performance']
s=pd.concat([c1, c2], axis=1)

In [145]:
s.to_csv('output7.csv', index=False)