In [91]:
# Importing the required libraries
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os
import pickle

In [92]:
# Setting the display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [93]:
df_train = pd.read_csv('dataset/train.csv')           # Reading the train dataset and store it in df_train variable
df_test = pd.read_csv('dataset/test.csv')             # Reading the test dataset and store it in df_test variable
df_holdout = pd.read_csv('dataset/holdout.csv')       # Reading the holdout dataset and store it in df_holdout variable

##### Data Preprocessing

In [94]:
# Displaying the shape of the train, test, holdout datasets
print('Initial shape of train dataset:', df_train.shape)
print('Initial shape of train dataset:', df_test.shape)
print('Initial shape of train dataset:', df_holdout.shape)

Initial shape of train dataset: (20000, 15)
Initial shape of train dataset: (5000, 15)
Initial shape of train dataset: (500, 14)


In [95]:
# Displaying the first two rows of the train dataset
df_train.head(2)

Unnamed: 0,company_id,company_name,industry,funding_rounds,total_funding,last_funding_date,job_postings_30d,employee_growth_pct,hiring_roles,industry_growth_rate,regional_employment_trend,funding_per_employee,days_since_last_funding,growth_momentum,is_hot_lead
0,COMP_024008,Moreno and Sons,Retail,1.911533,3795597.04,2021-07-14,10.995177,-8.355216,Operations,11.5,-0.2,-5930.620375,1317,45547164.48,0
1,COMP_007976,"Morales, Hubbard and Mcdonald",Healthcare,3.221005,5792195.29,2020-05-23,5.052385,20.602718,Marketing,23.9,10.5,2106.252833,1734,34753171.74,0


In [96]:
# Displaying the information of the train dataset
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   company_id                 20000 non-null  object 
 1   company_name               20000 non-null  object 
 2   industry                   20000 non-null  object 
 3   funding_rounds             20000 non-null  float64
 4   total_funding              20000 non-null  float64
 5   last_funding_date          20000 non-null  object 
 6   job_postings_30d           20000 non-null  float64
 7   employee_growth_pct        20000 non-null  float64
 8   hiring_roles               20000 non-null  object 
 9   industry_growth_rate       20000 non-null  float64
 10  regional_employment_trend  20000 non-null  float64
 11  funding_per_employee       20000 non-null  float64
 12  days_since_last_funding    20000 non-null  int64  
 13  growth_momentum            20000 non-null  flo

In [97]:
# Displaying the summary statistics of the train dataset
df_train.describe()

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,funding_rounds,total_funding,job_postings_30d,employee_growth_pct,industry_growth_rate,regional_employment_trend,funding_per_employee,days_since_last_funding,growth_momentum,is_hot_lead
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,2.987775,4171424.0,10.002013,19.954379,17.456325,5.13381,inf,909.826,41885350.0,0.14925
std,1.454144,3088695.0,3.328233,17.53204,7.232038,5.767425,,527.341786,35255970.0,0.356344
min,0.686082,100447.6,0.926065,-12.491863,5.0,-5.0,-789402.6,1.0,295727.1,0.0
25%,1.954142,1719901.0,7.640501,4.824062,11.2,0.2,318.3275,455.0,15321060.0,0.0
50%,2.869536,3533375.0,9.73312,19.865427,17.4,5.2,1181.334,905.0,32794940.0,0.0
75%,3.86876,5976700.0,12.056417,34.704816,23.7,10.2,2786.092,1363.0,58882610.0,0.0
max,13.013271,21414840.0,29.457835,64.173316,30.0,15.0,inf,1827.0,323982600.0,1.0


In [98]:
# Checking for missing values in the train dataset
df_train.isnull().sum()

company_id                   0
company_name                 0
industry                     0
funding_rounds               0
total_funding                0
last_funding_date            0
job_postings_30d             0
employee_growth_pct          0
hiring_roles                 0
industry_growth_rate         0
regional_employment_trend    0
funding_per_employee         0
days_since_last_funding      0
growth_momentum              0
is_hot_lead                  0
dtype: int64

In [99]:
# Checking for duplicate rows in the train dataset
df_train.duplicated().sum()

np.int64(0)

In [100]:
# Checking for the unique values count in each column of the train dataset
for cols in df_train.columns:
    print(cols, df_train[cols].nunique())
    print('-'*70)

company_id 20000
----------------------------------------------------------------------
company_name 16366
----------------------------------------------------------------------
industry 5
----------------------------------------------------------------------
funding_rounds 20000
----------------------------------------------------------------------
total_funding 20000
----------------------------------------------------------------------
last_funding_date 1827
----------------------------------------------------------------------
job_postings_30d 20000
----------------------------------------------------------------------
employee_growth_pct 19973
----------------------------------------------------------------------
hiring_roles 4
----------------------------------------------------------------------
industry_growth_rate 251
----------------------------------------------------------------------
regional_employment_trend 201
------------------------------------------------------------

In [101]:
# Checking for the unique values in each column of the train dataset
for cols in df_train.columns:
    print(cols, df_train[cols].unique())
    print('-'*70)

company_id ['COMP_024008' 'COMP_007976' 'COMP_021959' ... 'COMP_006482' 'COMP_024460'
 'COMP_004830']
----------------------------------------------------------------------
company_name ['Moreno and Sons' 'Morales, Hubbard and Mcdonald' 'Strickland-Wright' ...
 'Greer-Hansen' 'Howell-Jacobs' 'Barnett, Brown and Harris']
----------------------------------------------------------------------
industry ['Retail' 'Healthcare' 'Tech' 'Fintech' 'SaaS']
----------------------------------------------------------------------
funding_rounds [1.9115327  3.22100461 3.97596572 ... 1.06149696 1.78194521 0.86431344]
----------------------------------------------------------------------
total_funding [3795597.04 5792195.29  191991.   ... 2748717.89 3305299.14 2264050.12]
----------------------------------------------------------------------
last_funding_date ['2021-07-14' '2020-05-23' '2020-05-07' ... '2021-04-29' '2023-01-14'
 '2022-05-13']
-------------------------------------------------------------

In [102]:
# Checking for the columns in the train dataset
df_train.columns

Index(['company_id', 'company_name', 'industry', 'funding_rounds',
       'total_funding', 'last_funding_date', 'job_postings_30d',
       'employee_growth_pct', 'hiring_roles', 'industry_growth_rate',
       'regional_employment_trend', 'funding_per_employee',
       'days_since_last_funding', 'growth_momentum', 'is_hot_lead'],
      dtype='object')

In [103]:
# Checking for the value counts of each column in the train dataset
for cols in df_train.columns:
    print(df_train[cols].value_counts())
    print('-'*70)

company_id
COMP_010661    1
COMP_008065    1
COMP_009044    1
COMP_012470    1
COMP_016643    1
COMP_013745    1
COMP_018250    1
COMP_020642    1
COMP_022309    1
COMP_007200    1
COMP_008386    1
COMP_001382    1
COMP_023491    1
COMP_021680    1
COMP_009743    1
COMP_001335    1
COMP_004741    1
COMP_022747    1
COMP_015003    1
COMP_004449    1
COMP_020255    1
COMP_022499    1
COMP_007339    1
COMP_007652    1
COMP_010971    1
COMP_021474    1
COMP_006050    1
COMP_003702    1
COMP_015209    1
COMP_016385    1
COMP_015317    1
COMP_010872    1
COMP_002223    1
COMP_015437    1
COMP_002483    1
COMP_008538    1
COMP_013935    1
COMP_020069    1
COMP_010963    1
COMP_023121    1
COMP_012560    1
COMP_022201    1
COMP_009813    1
COMP_024429    1
COMP_010480    1
COMP_019940    1
COMP_018413    1
COMP_003756    1
COMP_006995    1
COMP_014927    1
COMP_022460    1
COMP_003076    1
COMP_024667    1
COMP_004265    1
COMP_008816    1
COMP_008919    1
COMP_012778    1
COMP_017879    1
COM

In [104]:
# Setting the company_id as the index of the train and test datasets
df_train.set_index('company_id', inplace=True)
df_test.set_index('company_id', inplace=True)

In [None]:
# Store the dataframes in a list
data_list = [df_train, df_test, df_holdout]

In [106]:
# Displaying the sum of the inf values in the 'funding_per_employee' column of the train, test, holdout dataset
for df in data_list:
    posinf_count = np.isposinf(df['funding_per_employee']).sum()
    neginf_count = np.isneginf(df['funding_per_employee']).sum()
    print(posinf_count, neginf_count)

35 0
8 0
2 0


In [107]:
# Replacing the inf values in the 'funding_per_employee' column of the train, test, holdout dataset with nan
for df in data_list:
    df['funding_per_employee'].replace([np.inf, -np.inf], np.nan, inplace=True)

# Dropping the nan values in the 'funding_per_employee' column of the train, test, holdout dataset
for df in data_list:
    df.dropna(subset=['funding_per_employee'], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['funding_per_employee'].replace([np.inf, -np.inf], np.nan, inplace=True)


In [108]:
# Displaying the shape of the train, test, holdout dataset after removing the nan values
for df in data_list:
    print('Size of the dataset after removing inf values:', df.shape)

Size of the dataset after removing inf values: (19965, 14)
Size of the dataset after removing inf values: (4992, 14)
Size of the dataset after removing inf values: (498, 14)


In [109]:
# Displaying the sum of the inf values in the 'funding_per_employee' column of the train, test, holdout dataset
for df in data_list:
    posinf_count = np.isposinf(df['funding_per_employee']).sum()
    neginf_count = np.isneginf(df['funding_per_employee']).sum()
    print(posinf_count, neginf_count)

0 0
0 0
0 0


In [110]:
# Change the datatype of the 'last_funding_date' column to datetime
for df in data_list:
    df['last_funding_date'] = pd.to_datetime(df['last_funding_date'])

In [111]:
# Extracting features from datetime column
for df in data_list:
    df['last_funding_year'] = df['last_funding_date'].dt.year
    df['last_funding_month'] = df['last_funding_date'].dt.month
    df['last_funding_day'] = df['last_funding_date'].dt.day
    df['last_funding_day_of_week'] = df['last_funding_date'].dt.dayofweek
    df['last_funding_day_of_year'] = df['last_funding_date'].dt.dayofyear
    df['last_funding_week_of_year'] = df['last_funding_date'].dt.isocalendar().week
    df['last_funding_is_weekend'] = df['last_funding_date'].dt.dayofweek >= 5
    
for df in data_list:
    if 'last_funding_date' in df.columns:
        print('yes')
        df.drop(columns=['last_funding_date'], inplace=True)


yes
yes
yes


In [115]:
# Store the cleaned train dataset in a csv file
df_train.to_csv("dataset/cleaned_train.csv", index=False)

In [None]:
# Detect outliers using IQR method
numerical_cols = df_train.select_dtypes(include=['number']).columns
for col in numerical_cols:
    Q1 = df_train[col].quantile(0.25)
    Q3 = df_train[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df_train[(df_train[col] < lower_bound) | (df_train[col] > upper_bound)]
    print('Outliers in', col, 'column:', len(outliers))

Outliers in funding_rounds column: 330
Outliers in total_funding column: 327
Outliers in job_postings_30d column: 219
Outliers in employee_growth_pct column: 0
Outliers in industry_growth_rate column: 0
Outliers in regional_employment_trend column: 0
Outliers in funding_per_employee column: 4507
Outliers in days_since_last_funding column: 0
Outliers in growth_momentum column: 680
Outliers in is_hot_lead column: 2985
Outliers in last_funding_year column: 0
Outliers in last_funding_month column: 0
Outliers in last_funding_day column: 0
Outliers in last_funding_day_of_week column: 0
Outliers in last_funding_day_of_year column: 0
Outliers in last_funding_week_of_year column: 0


In [117]:
# Detect outliers using z-score method
outlier_stats = []
for col in numerical_cols:
    z_scores = zscore(df_train[col])
    outliers = (abs(z_scores) > 3).sum()
    outlier_stats.append({'col': col,'outliers_count': outliers})
print(pd.DataFrame(outlier_stats))
    

                          col  outliers_count
0              funding_rounds             173
1               total_funding             174
2            job_postings_30d              90
3         employee_growth_pct               0
4        industry_growth_rate               0
5   regional_employment_trend               0
6        funding_per_employee             234
7     days_since_last_funding               0
8             growth_momentum             298
9                 is_hot_lead               0
10          last_funding_year               0
11         last_funding_month               0
12           last_funding_day               0
13   last_funding_day_of_week               0
14   last_funding_day_of_year               0
15  last_funding_week_of_year               0


In [None]:
# Checking the value counts of the 'is_hot_lead' column
df_train['is_hot_lead'].value_counts()  

# Data is undersampled, so need to balance the data before training the model

is_hot_lead
0    16980
1     2985
Name: count, dtype: int64

In [None]:
# Performing Label Encoding on the categorical columns
label_encoders = {}
encoded_mappings = {}

# Label encoding the categorical columns in the train dataset using fit_trainform
for col in df_train.select_dtypes(include=['object']).columns:
    le = LabelEncoder() 
    df_train[col] = le.fit_transform(df_train[col])
    label_encoders[col] = le 
    encoded_mappings[col] = dict(zip(le.classes_, le.transform(le.classes_))) 

# Label encoding the categorical columns in the test and holdout dataset using transform
for df in [df_test, df_holdout]:
    for col in df.select_dtypes(include=['object']).columns:
        if col in label_encoders: 
            df[col] = df[col].map(lambda x: label_encoders[col].transform([x])[0] if x in label_encoders[col].classes_ else -1) 


encoded_mappings

{'company_name': {'Abbott Group': np.int64(0),
  'Abbott Inc': np.int64(1),
  'Abbott LLC': np.int64(2),
  'Abbott, Bullock and Jackson': np.int64(3),
  'Abbott, Fitzgerald and Mitchell': np.int64(4),
  'Abbott, Johnson and Lee': np.int64(5),
  'Abbott, Martinez and Foster': np.int64(6),
  'Abbott, Mckee and King': np.int64(7),
  'Abbott-Boone': np.int64(8),
  'Abbott-Parks': np.int64(9),
  'Abbott-Smith': np.int64(10),
  'Acevedo Group': np.int64(11),
  'Acevedo Inc': np.int64(12),
  'Acevedo LLC': np.int64(13),
  'Acevedo Ltd': np.int64(14),
  'Acevedo, Miller and Lowe': np.int64(15),
  'Acevedo, Oliver and Gray': np.int64(16),
  'Acevedo-Bridges': np.int64(17),
  'Acevedo-Pope': np.int64(18),
  'Acevedo-Robbins': np.int64(19),
  'Acosta, Brown and Burton': np.int64(20),
  'Acosta, Campbell and Johnson': np.int64(21),
  'Acosta, Evans and Lewis': np.int64(22),
  'Acosta, Pena and Ewing': np.int64(23),
  'Acosta, Phillips and Reyes': np.int64(24),
  'Acosta, Roberts and Dodson': np.in

In [None]:
# Displaying the shape of the train, test, holdout dataset
print('holdout',df_holdout.shape)
print('test',df_test.shape)
print('train',df_train.shape)

holdout (498, 20)
test (4992, 20)
train (19965, 20)


In [None]:
# Dataset contains datatype like UInt32(1), bool(1), float64(6), int32(5), int64(6), object(1), so need to convert all columns to float64 excluding 'company_id', except 'is_hot_lead' which should be int
for df in data_list:
    for col in df.columns:
        if col == 'is_hot_lead':
            df[col] = df[col].astype(int)  # Convert 'is_hot_lead' to int
        elif col != 'company_id':
            df[col] = df[col].astype(float)  # Convert all other columns to float64


In [None]:
# Standardize the numerical features using StandardScaler (fit_transform on train, transform on test and holdout)

# Initialize StandardScaler
scaler = StandardScaler()

# Select all numerical features **except `company_id` and `is_hot_lead`**
num_features = df_train.select_dtypes(include=['float64', 'int64']).columns
num_features = [col for col in num_features if col not in ['company_id', 'is_hot_lead']]

# Fit scaler on all numerical features **at once**
df_train[num_features] = scaler.fit_transform(df_train[num_features])

# Store scaling parameters
scaling_parameters = {col: {'mean': scaler.mean_[i], 'std': scaler.scale_[i]} for i, col in enumerate(num_features)}
print("Scaling Parameters:", scaling_parameters)

# Save the fitted scaler
sub_folder = "saved_scaler"
scaler_filename = "standard_scaler.pkl"
os.makedirs(sub_folder, exist_ok=True)
scaler_file_path = os.path.join(sub_folder, scaler_filename)

with open(scaler_file_path, 'wb') as file:
    pickle.dump(scaler, file)

print("File path of stored StandardScaler reference -->", scaler_file_path)

# Transform test and holdout data using the fitted scaler
for df in [df_test, df_holdout]:
    df[num_features] = scaler.transform(df[num_features])

# Print the features used during training
train_features = scaler.feature_names_in_
print("Features used during training:", train_features)


Scaling Parameters: {'company_name': {'mean': np.float64(8217.34745805159), 'std': np.float64(4715.5403062426285)}, 'industry': {'mean': np.float64(2.009917355371901), 'std': np.float64(1.4086071570515635)}, 'funding_rounds': {'mean': np.float64(2.9880178797665873), 'std': np.float64(1.454147455594961)}, 'total_funding': {'mean': np.float64(4171966.8728034063), 'std': np.float64(3088993.538251528)}, 'job_postings_30d': {'mean': np.float64(10.001013399861531), 'std': np.float64(3.328108432423979)}, 'employee_growth_pct': {'mean': np.float64(19.99112552854933), 'std': np.float64(17.52496114969641)}, 'hiring_roles': {'mean': np.float64(1.4973703981968445), 'std': np.float64(1.119306964794637)}, 'industry_growth_rate': {'mean': np.float64(17.45732531930879), 'std': np.float64(7.232125512434146)}, 'regional_employment_trend': {'mean': np.float64(5.134295016278488), 'std': np.float64(5.767665980196943)}, 'funding_per_employee': {'mean': np.float64(1410.4208845563755), 'std': np.float64(35676

##### Model Selection

In [None]:
#Import required libraries for training the data and evaluating the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, make_scorer
from imblearn.over_sampling import SMOTE

#Initialize the classification models
classification_type_models = {
    "LogisticRegression": LogisticRegression(class_weight="balanced"),
    "SVC": SVC(kernel="rbf", probability=True, class_weight="balanced"),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42, class_weight="balanced"),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced"),
    "GradientBoostingClassifier": GradientBoostingClassifier(n_estimators=100, random_state=42),  # No class_weight
    "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=5),  # No class_weight
    "GaussianNB": GaussianNB(),  # No class_weight
    "MLPClassifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),  # No class_weight
    "XGBoostClassifier": XGBClassifier(n_estimators=100, learning_rate=0.1, use_label_encoder=False, eval_metric="logloss", random_state=42)
}


##### Splitting the data

In [None]:
# Splitting the train dataset into X_train and y_train
X_train = df_train.drop(columns=['is_hot_lead'])
y_train = df_train['is_hot_lead']

# Splitting the test dataset into X_test and y_test
X_test = df_test.drop(columns=['is_hot_lead'])
y_test = df_test['is_hot_lead']

In [None]:
df_train['is_hot_lead'].value_counts()  # Checking the value counts of the 'is_hot_lead' column before sampling

is_hot_lead
0    16980
1     2985
Name: count, dtype: int64

In [None]:
# To balance the data, we can use SMOTE (Synthetic Minority Over-sampling Technique) to oversample the minority class
smote = SMOTE(random_state=42) # Initialize SMOTE
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
y_resampled.value_counts() # Checking the value counts of the 'is_hot_lead' column after resampling

is_hot_lead
0    16980
1    16980
Name: count, dtype: int64

##### Model Training

In [None]:
# Model Training and Evaluation for Multiple Models
model_performance = {}
classification_reports = {}
confusion_matrices = {}

for model_name, model in classification_type_models.items():
    print(f"Training {model_name}...")
    model.fit(X_resampled, y_resampled)
    print(f"Training {model_name}... Done")
    
    y_pred = model.predict(X_test)
    
    print(f"Evaluating {model_name}...")
    performance = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred)
    }
    
    model_performance[model_name] = performance
    
    # Convert Classification Report to DataFrame
    class_report = classification_report(y_test, y_pred, output_dict=True)
    classification_reports[model_name] = pd.DataFrame(class_report).transpose()
    
    # Convert Confusion Matrix to DataFrame
    conf_matrix = confusion_matrix(y_test, y_pred)
    conf_matrix_df = pd.DataFrame(conf_matrix, 
                                  index=[f"Actual_{i}" for i in range(len(conf_matrix))], 
                                  columns=[f"Predicted_{i}" for i in range(len(conf_matrix))])
    confusion_matrices[model_name] = conf_matrix_df
    
    print(f"Performance of {model_name}:", performance)

# Find the best model based on F1-Score
best_model_name = max(model_performance, key=lambda x: model_performance[x]['f1_score'])
best_model = classification_type_models[best_model_name]

print(f"\nBest Model: {best_model_name}")
print("Performance:", model_performance[best_model_name])

Training LogisticRegression...
Training LogisticRegression... Done
Evaluating LogisticRegression...
Performance of LogisticRegression: {'accuracy': 0.8221153846153846, 'f1_score': np.float64(0.5877437325905293), 'precision': np.float64(0.44957386363636365), 'recall': np.float64(0.8485254691689008)}
Training SVC...
Training SVC... Done
Evaluating SVC...
Performance of SVC: {'accuracy': 0.928886217948718, 'f1_score': np.float64(0.7809993830968538), 'precision': np.float64(0.7234285714285714), 'recall': np.float64(0.8485254691689008)}
Training DecisionTreeClassifier...
Training DecisionTreeClassifier... Done
Evaluating DecisionTreeClassifier...
Performance of DecisionTreeClassifier: {'accuracy': 0.921875, 'f1_score': np.float64(0.7410358565737052), 'precision': np.float64(0.7342105263157894), 'recall': np.float64(0.7479892761394102)}
Training RandomForestClassifier...
Training RandomForestClassifier... Done
Evaluating RandomForestClassifier...
Performance of RandomForestClassifier: {'accu

Parameters: { "use_label_encoder" } are not used.



Training XGBoostClassifier... Done
Evaluating XGBoostClassifier...
Performance of XGBoostClassifier: {'accuracy': 0.9643429487179487, 'f1_score': np.float64(0.8886107634543179), 'precision': np.float64(0.8333333333333334), 'recall': np.float64(0.9517426273458445)}

Best Model: XGBoostClassifier
Performance: {'accuracy': 0.9643429487179487, 'f1_score': np.float64(0.8886107634543179), 'precision': np.float64(0.8333333333333334), 'recall': np.float64(0.9517426273458445)}


In [None]:
# Display the Classification Evaluation metrics of each model
print(pd.DataFrame(model_performance).transpose())

                            accuracy  f1_score  precision    recall
LogisticRegression          0.822115  0.587744   0.449574  0.848525
SVC                         0.928886  0.780999   0.723429  0.848525
DecisionTreeClassifier      0.921875  0.741036   0.734211  0.747989
RandomForestClassifier      0.950721  0.845088   0.796912  0.899464
GradientBoostingClassifier  0.950321  0.848596   0.779148  0.931635
KNeighborsClassifier        0.770633  0.521121   0.378723  0.835121
GaussianNB                  0.684295  0.481920   0.319251  0.982574
MLPClassifier               0.962740  0.874663   0.879404  0.869973
XGBoostClassifier           0.964343  0.888611   0.833333  0.951743


In [None]:
# Display the Classification Report and Confusion Matrix for each model
from IPython.display import display

for model_name in classification_reports:
    print(f"\n🔹 Classification Report for {model_name}:")
    display(classification_reports[model_name])  # Display Classification Report
    
    print(f"\n🔹 Confusion Matrix for {model_name}:")
    display(confusion_matrices[model_name])  # Display Confusion Matrix



🔹 Classification Report for LogisticRegression:


Unnamed: 0,precision,recall,f1-score,support
0,0.968471,0.817475,0.88659,4246.0
1,0.449574,0.848525,0.587744,746.0
accuracy,0.822115,0.822115,0.822115,0.822115
macro avg,0.709022,0.833,0.737167,4992.0
weighted avg,0.890927,0.822115,0.841931,4992.0



🔹 Confusion Matrix for LogisticRegression:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,3471,775
Actual_1,113,633



🔹 Classification Report for SVC:


Unnamed: 0,precision,recall,f1-score,support
0,0.972553,0.943005,0.957551,4246.0
1,0.723429,0.848525,0.780999,746.0
accuracy,0.928886,0.928886,0.928886,0.928886
macro avg,0.847991,0.895765,0.869275,4992.0
weighted avg,0.935324,0.928886,0.931167,4992.0



🔹 Confusion Matrix for SVC:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,4004,242
Actual_1,113,633



🔹 Classification Report for DecisionTreeClassifier:


Unnamed: 0,precision,recall,f1-score,support
0,0.955577,0.952426,0.953999,4246.0
1,0.734211,0.747989,0.741036,746.0
accuracy,0.921875,0.921875,0.921875,0.921875
macro avg,0.844894,0.850208,0.847517,4992.0
weighted avg,0.922496,0.921875,0.922174,4992.0



🔹 Confusion Matrix for DecisionTreeClassifier:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,4044,202
Actual_1,188,558



🔹 Classification Report for RandomForestClassifier:


Unnamed: 0,precision,recall,f1-score,support
0,0.981928,0.959727,0.9707,4246.0
1,0.796912,0.899464,0.845088,746.0
accuracy,0.950721,0.950721,0.950721,0.950721
macro avg,0.88942,0.929595,0.907894,4992.0
weighted avg,0.954279,0.950721,0.951929,4992.0



🔹 Confusion Matrix for RandomForestClassifier:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,4075,171
Actual_1,75,671



🔹 Classification Report for GradientBoostingClassifier:


Unnamed: 0,precision,recall,f1-score,support
0,0.987561,0.953603,0.970285,4246.0
1,0.779148,0.931635,0.848596,746.0
accuracy,0.950321,0.950321,0.950321,0.950321
macro avg,0.883354,0.942619,0.909441,4992.0
weighted avg,0.956416,0.950321,0.9521,4992.0



🔹 Confusion Matrix for GradientBoostingClassifier:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,4049,197
Actual_1,51,695



🔹 Classification Report for KNeighborsClassifier:


Unnamed: 0,precision,recall,f1-score,support
0,0.963251,0.759303,0.849203,4246.0
1,0.378723,0.835121,0.521121,746.0
accuracy,0.770633,0.770633,0.770633,0.770633
macro avg,0.670987,0.797212,0.685162,4992.0
weighted avg,0.875899,0.770633,0.800175,4992.0



🔹 Confusion Matrix for KNeighborsClassifier:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,3224,1022
Actual_1,123,623



🔹 Classification Report for GaussianNB:


Unnamed: 0,precision,recall,f1-score,support
0,0.995178,0.631889,0.772976,4246.0
1,0.319251,0.982574,0.48192,746.0
accuracy,0.684295,0.684295,0.684295,0.684295
macro avg,0.657214,0.807231,0.627448,4992.0
weighted avg,0.894168,0.684295,0.729481,4992.0



🔹 Confusion Matrix for GaussianNB:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,2683,1563
Actual_1,13,733



🔹 Classification Report for MLPClassifier:


Unnamed: 0,precision,recall,f1-score,support
0,0.977198,0.979039,0.978118,4246.0
1,0.879404,0.869973,0.874663,746.0
accuracy,0.96274,0.96274,0.96274,0.96274
macro avg,0.928301,0.924506,0.92639,4992.0
weighted avg,0.962584,0.96274,0.962657,4992.0



🔹 Confusion Matrix for MLPClassifier:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,4157,89
Actual_1,97,649



🔹 Classification Report for XGBoostClassifier:


Unnamed: 0,precision,recall,f1-score,support
0,0.991304,0.966557,0.978774,4246.0
1,0.833333,0.951743,0.888611,746.0
accuracy,0.964343,0.964343,0.964343,0.964343
macro avg,0.912319,0.95915,0.933692,4992.0
weighted avg,0.967697,0.964343,0.9653,4992.0



🔹 Confusion Matrix for XGBoostClassifier:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,4104,142
Actual_1,36,710


In [None]:
# Finding the best model based on F1-Score
model_performance_df = pd.DataFrame(model_performance).transpose()

# Sort models by F1-score and get the top 3
top_3_models = model_performance_df.sort_values(by="f1_score", ascending=False).head(3)

# Print the top 3 models
print("Top 3 Models Based on F1-score:")
print(top_3_models)

# Extract top 3 model names
top_3_model_names = top_3_models.index.tolist()
print("\nTop 3 Models for Hyperparameter Tuning:", top_3_model_names)

Top 3 Models Based on F1-score:
                            accuracy  f1_score  precision    recall
XGBoostClassifier           0.964343  0.888611   0.833333  0.951743
MLPClassifier               0.962740  0.874663   0.879404  0.869973
GradientBoostingClassifier  0.950321  0.848596   0.779148  0.931635

Top 3 Models for Hyperparameter Tuning: ['XGBoostClassifier', 'MLPClassifier', 'GradientBoostingClassifier']


In [None]:
# Hyperparameter Tuning for the best model using RandomizedSearchCV

# Import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

scorer = make_scorer(f1_score)
# Define the hyperparameter grid
xgb_params = {
    'n_estimators': [200, 250, 300, 350, 400, 450, 500, 550, 600],
    'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.09, 0.1],
    'max_depth': [3, 5, 7, 10, 11],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [1, 2, 3],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.5, 1.0, 1.5, 2.0],
    'reg_lambda': [0, 0.5, 1.0, 1.5, 2.0]
}

# Initialize the model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_params,
    scoring=scorer,
    cv=5,
    verbose=1,
    n_jobs=-1,
    n_iter=100,
    random_state=42
)

# Fit the model
random_search.fit(X_resampled, y_resampled)

# Get the best model
best_xgb_model = random_search.best_estimator_
print(f"Best parameters for XGBoost: {random_search.best_params_}")

# Predict on the test data
y_pred = best_xgb_model.predict(X_test)

# Generate the classification report
class_report = classification_report(y_test, y_pred, output_dict=True)
classification_report_df = pd.DataFrame(class_report).transpose()

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix_df = pd.DataFrame(conf_matrix,
                              index=[f"Actual_{i}" for i in range(len(conf_matrix))],
                              columns=[f"Predicted_{i}" for i in range(len(conf_matrix))])

# Model performance summary
model_performance = {
    "accuracy": random_search.best_score_,
    "f1_score": f1_score(y_test, y_pred),
    "precision": classification_report_df.iloc[0, 0],
    "recall": classification_report_df.iloc[1, 1]
}

# Convert model performance dictionary to DataFrame
model_performance_df = pd.DataFrame([model_performance])

# Output the results
print("Classification Report:")
print(classification_report_df)
print("\nConfusion Matrix:")
print(conf_matrix_df)
print("\nModel Performance:")
print(model_performance_df)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters for XGBoost: {'subsample': 0.8, 'reg_lambda': 0.5, 'reg_alpha': 1.0, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 11, 'learning_rate': 0.03, 'gamma': 0.1, 'colsample_bytree': 0.8}
Classification Report:
              precision    recall  f1-score      support
0              0.991827  0.971738  0.981680  4246.000000
1              0.855769  0.954424  0.902408   746.000000
accuracy       0.969151  0.969151  0.969151     0.969151
macro avg      0.923798  0.963081  0.942044  4992.000000
weighted avg   0.971495  0.969151  0.969833  4992.000000

Confusion Matrix:
          Predicted_0  Predicted_1
Actual_0         4126          120
Actual_1           34          712

Model Performance:
   accuracy  f1_score  precision    recall
0  0.972533  0.902408   0.991827  0.954424


In [157]:
# Save the best model in 'best_model\best_xgb_model.pkl' file path
sub_folder = "best_model"
model_filename = "best_xgb_model.pkl"

os.makedirs(sub_folder, exist_ok=True)
file_path = os.path.join(sub_folder, model_filename)
with open(file_path, 'wb') as file:
    pickle.dump(best_xgb_model, file)

file_path 


'best_model\\best_xgb_model.pkl'

In [158]:
# Make predictions (0 or 1) using the best XGBoost model
holdout_features = df_holdout.drop(columns=['company_id'])
holdout_predictions = best_xgb_model.predict(holdout_features)

# Create the submission DataFrame
submission = pd.DataFrame({
    'company_id': df_holdout['company_id'],
    'is_hot_lead': holdout_predictions
})

# Save the submission file
submission.to_csv('submission/submission.csv', index=False)
print("Holdout predictions saved successfully!")


Holdout predictions saved successfully!
