In [9]:
import pandas as pd
import numpy as np
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=ConvergenceWarning)


# Load the data


In [10]:
df = pd.read_csv("recruitment_data 2.csv")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  1500 non-null   int64  
 1   Gender               1500 non-null   int64  
 2   EducationLevel       1500 non-null   int64  
 3   ExperienceYears      1500 non-null   int64  
 4   PreviousCompanies    1500 non-null   int64  
 5   DistanceFromCompany  1500 non-null   float64
 6   InterviewScore       1500 non-null   int64  
 7   SkillScore           1500 non-null   int64  
 8   PersonalityScore     1500 non-null   int64  
 9   RecruitmentStrategy  1500 non-null   int64  
 10  HiringDecision       1500 non-null   int64  
dtypes: float64(1), int64(10)
memory usage: 129.0 KB
None


# Prepare data for modeling



In [11]:
import random
import numpy as np
import tensorflow as tf

SEED = 123
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)
x = df.drop(columns=['HiringDecision'], axis=1)
y= df["HiringDecision"]

print(x.head())
print(y.head())

   Age  Gender  EducationLevel  ExperienceYears  PreviousCompanies  \
0   26       1               2                0                  3   
1   39       1               4               12                  3   
2   48       0               2                3                  2   
3   34       1               2                5                  2   
4   30       0               1                6                  1   

   DistanceFromCompany  InterviewScore  SkillScore  PersonalityScore  \
0            26.783828              48          78                91   
1            25.862694              35          68                80   
2             9.920805              20          67                13   
3             6.407751              36          27                70   
4            43.105343              23          52                85   

   RecruitmentStrategy  
0                    1  
1                    2  
2                    2  
3                    3  
4                    

# Split the data into training and testing sets


In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=123)
print(y_train.value_counts())
print(y_test.value_counts())
x_train

HiringDecision
0    714
1    336
Name: count, dtype: int64
HiringDecision
0    321
1    129
Name: count, dtype: int64


Unnamed: 0,Age,Gender,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy
891,49,1,2,6,1,23.075196,83,53,62,3
1385,31,0,1,0,3,3.684854,21,35,59,2
1186,48,1,2,3,1,37.152856,92,94,46,2
489,41,0,3,8,5,50.195318,76,49,30,1
1236,36,1,2,12,5,26.158958,20,87,19,2
...,...,...,...,...,...,...,...,...,...,...
1041,23,0,2,1,3,14.708202,29,86,37,2
1122,32,0,2,15,4,25.677183,24,95,47,2
1346,49,1,2,10,3,34.240900,91,19,32,2
1406,47,0,2,14,4,23.018711,43,21,58,1


# Feature selection using SelectKBest and model evaluation using Logistic Regression with GridSearchCV for imbalnce.

In [13]:

k_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_dist_log_reg = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['saga', 'liblinear', 'newton-cg', 'lbfgs', 'sag'],
    'max_iter': [100, 200, 300, 500, 1000, 2000, 5000, 10000, 20000]
}

best_accuracy = 0
best_k = None
best_model = None

for k in k_values:
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_selected = selector.fit_transform(x_train, y_train)
    X_test_selected = selector.transform(x_test)

    selected_features = x.columns[selector.get_support()].tolist()

    log_reg = LogisticRegression()
    grid_search = GridSearchCV(log_reg, param_dist_log_reg, cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_selected, y_train)

    y_pred = grid_search.best_estimator_.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Selected Features for k={k}: {selected_features}")
    print(f"Accuracy for k={k}: {accuracy}")
    print(f"Best parameters for k={k}: {grid_search.best_params_}")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k
        best_model = grid_search.best_estimator_

# Calculate and display metrics for the best model
best_selector = SelectKBest(score_func=f_classif, k=best_k)
X_train_best_selected = best_selector.fit_transform(x_train, y_train)
X_test_best_selected = best_selector.transform(x_test)

y_pred_best = best_model.predict(X_test_best_selected)
precision_best = precision_score(y_test, y_pred_best, average='macro')
recall_best = recall_score(y_test, y_pred_best, average='macro')
f1_best = f1_score(y_test, y_pred_best, average='macro')

print(f"\nBest model with k={best_k}")
print(f"Highest Accuracy: {best_accuracy}")
print(f"Precision: {precision_best}")
print(f"Recall: {recall_best}")
print(f"F1-Score: {f1_best}")

Selected Features for k=1: ['RecruitmentStrategy']
Accuracy for k=1: 0.8333333333333334
Best parameters for k=1: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Selected Features for k=2: ['EducationLevel', 'RecruitmentStrategy']
Accuracy for k=2: 0.8222222222222222
Best parameters for k=2: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}




Selected Features for k=3: ['EducationLevel', 'SkillScore', 'RecruitmentStrategy']
Accuracy for k=3: 0.84
Best parameters for k=3: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}




Selected Features for k=4: ['EducationLevel', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=4: 0.8511111111111112
Best parameters for k=4: {'C': 0.1, 'max_iter': 300, 'penalty': 'l2', 'solver': 'saga'}




Selected Features for k=5: ['EducationLevel', 'ExperienceYears', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=5: 0.8577777777777778
Best parameters for k=5: {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}




Selected Features for k=6: ['EducationLevel', 'ExperienceYears', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=6: 0.8844444444444445
Best parameters for k=6: {'C': 0.1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'sag'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected Features for k=7: ['EducationLevel', 'ExperienceYears', 'PreviousCompanies', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=7: 0.8822222222222222
Best parameters for k=7: {'C': 0.1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'sag'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected Features for k=8: ['EducationLevel', 'ExperienceYears', 'PreviousCompanies', 'DistanceFromCompany', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=8: 0.8844444444444445
Best parameters for k=8: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected Features for k=9: ['Age', 'EducationLevel', 'ExperienceYears', 'PreviousCompanies', 'DistanceFromCompany', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=9: 0.8711111111111111
Best parameters for k=9: {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected Features for k=10: ['Age', 'Gender', 'EducationLevel', 'ExperienceYears', 'PreviousCompanies', 'DistanceFromCompany', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=10: 0.8711111111111111
Best parameters for k=10: {'C': 0.1, 'max_iter': 300, 'penalty': 'l2', 'solver': 'sag'}

Best model with k=6
Highest Accuracy: 0.8844444444444445
Precision: 0.8621615573953905
Recall: 0.8517713540534666
F1-Score: 0.8567019400352733


# Oversampled

In [14]:
smote = SMOTE(random_state=123)
x_sm, y_sm = smote.fit_resample(x_train, y_train)
y_sm.value_counts()

HiringDecision
1    714
0    714
Name: count, dtype: int64

# Feature selection using SelectKBest and model evaluation using Logistic Regression with GridSearchCV for Oversampled.

In [15]:

k_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
accuracies = []

param_dist_log_reg = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['saga', 'liblinear'] + (['newton-cg', 'lbfgs', 'sag'] if 'penalty' == 'l2' else []),
    'max_iter': [100, 200, 300, 500, 1000, 2000, 5000, 10000, 20000]
}

best_accuracy = 0
best_k = None
best_model = None

for k in k_values:
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_selected = selector.fit_transform(x_sm, y_sm)
    X_test_selected = selector.transform(x_test)

    selected_features = x.columns[selector.get_support()].tolist()

    log_reg = LogisticRegression()
    grid_search = GridSearchCV(log_reg, param_dist_log_reg, cv=5, n_jobs=-1, verbose=0, scoring='accuracy')
    grid_search.fit(X_train_selected, y_sm)

    y_pred = grid_search.best_estimator_.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    print(f"Selected Features for k={k}: {selected_features}")
    print(f"Accuracy for k={k}: {accuracy}")
    print(f"Best parameters for k={k}: {grid_search.best_params_}")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k
        best_model = grid_search.best_estimator_

# Calculate and display metrics for the best model
best_selector = SelectKBest(score_func=f_classif, k=best_k)
X_train_best_selected = best_selector.fit_transform(x_sm, y_sm)
X_test_best_selected = best_selector.transform(x_test)

y_pred_best = best_model.predict(X_test_best_selected)
precision_best = precision_score(y_test, y_pred_best, average='macro')
recall_best = recall_score(y_test, y_pred_best, average='macro')
f1_best = f1_score(y_test, y_pred_best, average='macro')

print(f"\nBest model with k={best_k}")
print(f"Highest Accuracy: {best_accuracy}")
print(f"Precision: {precision_best}")
print(f"Recall: {recall_best}")
print(f"F1-Score: {f1_best}")
print("Accuracies for different k values:", accuracies)

Selected Features for k=1: ['RecruitmentStrategy']
Accuracy for k=1: 0.8333333333333334
Best parameters for k=1: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}




Selected Features for k=2: ['SkillScore', 'RecruitmentStrategy']
Accuracy for k=2: 0.8088888888888889
Best parameters for k=2: {'C': 1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}




Selected Features for k=3: ['SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=3: 0.8244444444444444
Best parameters for k=3: {'C': 1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}




Selected Features for k=4: ['EducationLevel', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=4: 0.8244444444444444
Best parameters for k=4: {'C': 100, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}




Selected Features for k=5: ['EducationLevel', 'ExperienceYears', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=5: 0.8022222222222222
Best parameters for k=5: {'C': 0.1, 'max_iter': 200, 'penalty': 'l2', 'solver': 'saga'}




Selected Features for k=6: ['EducationLevel', 'ExperienceYears', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=6: 0.8266666666666667
Best parameters for k=6: {'C': 10, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}




Selected Features for k=7: ['Gender', 'EducationLevel', 'ExperienceYears', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=7: 0.8333333333333334
Best parameters for k=7: {'C': 10, 'max_iter': 2000, 'penalty': 'l1', 'solver': 'saga'}




Selected Features for k=8: ['Gender', 'EducationLevel', 'ExperienceYears', 'DistanceFromCompany', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=8: 0.8333333333333334
Best parameters for k=8: {'C': 1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}




Selected Features for k=9: ['Gender', 'EducationLevel', 'ExperienceYears', 'PreviousCompanies', 'DistanceFromCompany', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=9: 0.8333333333333334
Best parameters for k=9: {'C': 1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}




Selected Features for k=10: ['Age', 'Gender', 'EducationLevel', 'ExperienceYears', 'PreviousCompanies', 'DistanceFromCompany', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=10: 0.8333333333333334
Best parameters for k=10: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}

Best model with k=1
Highest Accuracy: 0.8333333333333334
Precision: 0.796957671957672
Recall: 0.7927624429471853
F1-Score: 0.7948016415868673
Accuracies for different k values: [0.8333333333333334, 0.8088888888888889, 0.8244444444444444, 0.8244444444444444, 0.8022222222222222, 0.8266666666666667, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334]


# undersampling

In [16]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter# RandomUnderSampler for undersampling the majority class
rus = RandomUnderSampler(random_state=123)  # Reduces the majority class to match the minority

# Now, apply RandomUnderSampler to the data oversampled by SMOTE
x_resampled, y_resampled = rus.fit_resample(x_train, y_train)

# Print class distribution after undersampling
print("After Undersampling:", Counter(y_resampled))  # Should remain balanced as SMOTE had already balanced it



After Undersampling: Counter({0: 336, 1: 336})


# # Feature selection using SelectKBest and model evaluation using Logistic Regression with GridSearchCV for Undersampling.

In [17]:
k_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
accuracies = []

param_dist_log_reg = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['saga', 'liblinear'] + (['newton-cg', 'lbfgs', 'sag'] if 'penalty' == 'l2' else []),
    'max_iter': [100, 200, 300, 500, 1000, 2000, 5000, 10000, 20000]
}

best_accuracy = 0
best_k = None
best_model = None

for k in k_values:
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_selected = selector.fit_transform(x_resampled, y_resampled)
    X_test_selected = selector.transform(x_test)

    selected_features = x.columns[selector.get_support()].tolist()

    log_reg = LogisticRegression()
    grid_search = GridSearchCV(log_reg, param_dist_log_reg, cv=5, n_jobs=-1, verbose=0, scoring='accuracy')
    grid_search.fit(X_train_selected, y_resampled)

    y_pred = grid_search.best_estimator_.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    print(f"Selected Features for k={k}: {selected_features}")
    print(f"Accuracy for k={k}: {accuracy}")
    print(f"Best parameters for k={k}: {grid_search.best_params_}")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k
        best_model = grid_search.best_estimator_

# Calculate and display metrics for the best model
best_selector = SelectKBest(score_func=f_classif, k=best_k)
X_train_best_selected = best_selector.fit_transform(x_resampled, y_resampled)
X_test_best_selected = best_selector.transform(x_test)

y_pred_best = best_model.predict(X_test_best_selected)
precision_best = precision_score(y_test, y_pred_best, average='macro')
recall_best = recall_score(y_test, y_pred_best, average='macro')
f1_best = f1_score(y_test, y_pred_best, average='macro')

print(f"\nBest model with k={best_k}")
print(f"Highest Accuracy: {best_accuracy}")
print(f"Precision: {precision_best}")
print(f"Recall: {recall_best}")
print(f"F1-Score: {f1_best}")
print("Accuracies for different k values:", accuracies)

Selected Features for k=1: ['RecruitmentStrategy']
Accuracy for k=1: 0.8333333333333334
Best parameters for k=1: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}
Selected Features for k=2: ['EducationLevel', 'RecruitmentStrategy']
Accuracy for k=2: 0.82
Best parameters for k=2: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}




Selected Features for k=3: ['EducationLevel', 'SkillScore', 'RecruitmentStrategy']
Accuracy for k=3: 0.8088888888888889
Best parameters for k=3: {'C': 100, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}




Selected Features for k=4: ['EducationLevel', 'ExperienceYears', 'SkillScore', 'RecruitmentStrategy']
Accuracy for k=4: 0.7844444444444445
Best parameters for k=4: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}




Selected Features for k=5: ['EducationLevel', 'ExperienceYears', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=5: 0.7866666666666666
Best parameters for k=5: {'C': 1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}




Selected Features for k=6: ['EducationLevel', 'ExperienceYears', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=6: 0.8088888888888889
Best parameters for k=6: {'C': 10, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}




Selected Features for k=7: ['EducationLevel', 'ExperienceYears', 'PreviousCompanies', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=7: 0.8022222222222222
Best parameters for k=7: {'C': 10, 'max_iter': 2000, 'penalty': 'l1', 'solver': 'saga'}




Selected Features for k=8: ['Gender', 'EducationLevel', 'ExperienceYears', 'PreviousCompanies', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=8: 0.8022222222222222
Best parameters for k=8: {'C': 1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}




Selected Features for k=9: ['Age', 'Gender', 'EducationLevel', 'ExperienceYears', 'PreviousCompanies', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=9: 0.7888888888888889
Best parameters for k=9: {'C': 10, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}




Selected Features for k=10: ['Age', 'Gender', 'EducationLevel', 'ExperienceYears', 'PreviousCompanies', 'DistanceFromCompany', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy']
Accuracy for k=10: 0.82
Best parameters for k=10: {'C': 10, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}

Best model with k=1
Highest Accuracy: 0.8333333333333334
Precision: 0.796957671957672
Recall: 0.7927624429471853
F1-Score: 0.7948016415868673
Accuracies for different k values: [0.8333333333333334, 0.82, 0.8088888888888889, 0.7844444444444445, 0.7866666666666666, 0.8088888888888889, 0.8022222222222222, 0.8022222222222222, 0.7888888888888889, 0.82]


In [19]:
from sklearn.pipeline import Pipeline
# Step 1: Create the pipeline
pipeline = Pipeline([
    ('feature_selection', SelectKBest(score_func=f_classif, k=8)),
    ('logistic_regression', LogisticRegression(penalty='l2', C=0.1, solver='liblinear', max_iter=100))
])


In [20]:

# Step 2: Fit the pipeline on the training data
pipeline.fit(x_train, y_train)

# Step 3: Make predictions and calculate accuracy
y_pred = pipeline.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")



Accuracy: 88.44%


In [21]:
# Save the pipeline to a file
import pickle


with open('LR_model_imb.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [22]:
import joblib
print(joblib.load("LR_model_imb.pkl"))
y_pred=joblib.load("LR_model_imb.pkl").predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")
print('Recall: ', recall_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred))
print('F1-Score: ', f1_score(y_test, y_pred))

Pipeline(steps=[('feature_selection', SelectKBest(k=8)),
                ('logistic_regression',
                 LogisticRegression(C=0.1, solver='liblinear'))])
Accuracy: 88.44%
Recall:  0.7674418604651163
Precision:  0.8181818181818182
F1-Score:  0.792
