# Análise da base de dados


In [1]:
# Mount Google Drive at /content/drive; the dataset paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -U ydata-profiling

Collecting ydata-profiling
  Downloading ydata_profiling-4.9.0-py2.py3-none-any.whl.metadata (20 kB)
Collecting visions<0.7.7,>=0.7.5 (from visions[type_image_path]<0.7.7,>=0.7.5->ydata-profiling)
  Downloading visions-0.7.6-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata-profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata-profiling)
  Downloading phik-0.12.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.1 (from ydata-profiling)
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting dacite>=1.8 (from ydata-profiling)
  Downloading dacite-1.8.1-py3-none-any.whl.metadata (15 kB)
Collecting PyWavelets (from imagehash==4.3.1->ydata-profiling)
  Downloading pywavelets-1.6

In [6]:
import pandas as pd
#from ydata_profiling import ProfileReport

In [7]:
# Load the raw recruitment dataset from the mounted Drive and preview the first rows.
recruitment_csv_path = '/content/drive/MyDrive/PROJETO_MD_DOCS/bases/recruitment_data.csv'
df_recruitment = pd.read_csv(recruitment_csv_path)
recruitment_preview = df_recruitment.head()
print(recruitment_preview)

   Age  Gender  EducationLevel  ExperienceYears  PreviousCompanies  \
0   26       1               2                0                  3   
1   39       1               4               12                  3   
2   48       0               2                3                  2   
3   34       1               2                5                  2   
4   30       0               1                6                  1   

   DistanceFromCompany  InterviewScore  SkillScore  PersonalityScore  \
0            26.783828              48          78                91   
1            25.862694              35          68                80   
2             9.920805              20          67                13   
3             6.407751              36          27                70   
4            43.105343              23          52                85   

   RecruitmentStrategy  HiringDecision  
0                    1               1  
1                    2               1  
2                    2 

In [8]:
# Split the frame into input features (X) and target (Y).
TARGET_COLUMN = "HiringDecision"

# `pop` removes the column from df_recruitment in place, so an unguarded second
# run of this cell would raise KeyError; the guard keeps the cell re-runnable.
if TARGET_COLUMN in df_recruitment.columns:
    Y = df_recruitment.pop(TARGET_COLUMN)

# Take an explicit snapshot instead of aliasing df_recruitment, so X cannot be
# changed by accident through in-place edits of df_recruitment.
# NOTE(review): the preprocessing cells rebind df_recruitment to new frames, so X
# keeps the raw (unencoded, unbinned) columns, and the modelling section trains
# on X. Confirm that training on the raw features is intended.
X = df_recruitment.copy()

In [None]:
# Imported next to its only use so this cell does not fail with NameError on a
# fresh kernel (requires the ydata-profiling package to be installed).
from ydata_profiling import ProfileReport

# Exploratory profiling report of the input features; the bare last expression
# renders the report as the cell output.
recruitment_profile = ProfileReport(X)
recruitment_profile

Output hidden; open in https://colab.research.google.com to view.

# Pré-processamento dos dados

## Tratando dados categóricos

EducationLevel e RecruitmentStrategy

In [10]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# One-hot encode the categorical features.
categorical_features = ['EducationLevel', 'RecruitmentStrategy']
# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4. drop='first' omits one level per feature.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_categorical_data = encoder.fit_transform(df_recruitment[categorical_features])

# Wrap the encoded array in a DataFrame. Reusing the source index makes the
# column-wise concat below align row by row even if the index is not 0..n-1.
encoded_categorical_df = pd.DataFrame(
    encoded_categorical_data,
    columns=encoder.get_feature_names_out(categorical_features),
    index=df_recruitment.index,
)

# Drop the original categorical columns and append the encoded ones.
df_recruitment = df_recruitment.drop(columns=categorical_features)
df_recruitment = pd.concat([df_recruitment, encoded_categorical_df], axis=1)

print(df_recruitment.head())


   Age  Gender  ExperienceYears  PreviousCompanies  DistanceFromCompany  \
0   26       1                0                  3            26.783828   
1   39       1               12                  3            25.862694   
2   48       0                3                  2             9.920805   
3   34       1                5                  2             6.407751   
4   30       0                6                  1            43.105343   

   InterviewScore  SkillScore  PersonalityScore  EducationLevel_2  \
0              48          78                91               1.0   
1              35          68                80               0.0   
2              20          67                13               1.0   
3              36          27                70               1.0   
4              23          52                85               0.0   

   EducationLevel_3  EducationLevel_4  RecruitmentStrategy_2  \
0               0.0               0.0                    0.0   
1     



## Tratando dados numéricos (inteiro e contínuo)

Age, ExperienceYears, PreviousCompanies, DistanceFromCompany, InterviewScore, SkillScore e PersonalityScore

In [11]:
# Numeric features (integer and continuous) that will be discretized into bins.
continuous_features = ['Age', 'ExperienceYears', 'PreviousCompanies', 'DistanceFromCompany', 'InterviewScore', 'SkillScore', 'PersonalityScore']

def create_bins(data, feature, num_bins=5):
    """Discretize ``data[feature]`` into ``num_bins`` equal-width, labelled bins.

    The bin edges are evenly spaced between the column's minimum and maximum.
    Bins are labelled ``"<feature>_bin_1"`` .. ``"<feature>_bin_<num_bins>"``
    and the lowest edge is inclusive, so the minimum value falls in bin 1.

    Args:
        data: DataFrame holding the column.
        feature: Name of the numeric column to discretize.
        num_bins: Number of equal-width bins.

    Returns:
        The categorical result of ``pd.cut`` for the column.
    """
    column = data[feature]
    edges = np.linspace(column.min(), column.max(), num_bins + 1)

    bin_names = []
    for bin_number in range(1, num_bins + 1):
        bin_names.append(f"{feature}_bin_{bin_number}")

    return pd.cut(column, bins=edges, labels=bin_names, include_lowest=True)

# Replace every numeric feature by its bin label.
for feature in continuous_features:
    df_recruitment[feature] = create_bins(df_recruitment, feature)

# One-hot encode the discretized features.
# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4. drop='first' omits one bin per feature.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_continuous_data = encoder.fit_transform(df_recruitment[continuous_features])

# Wrap the encoded array in a DataFrame. Reusing the source index makes the
# column-wise concat below align row by row even if the index is not 0..n-1.
encoded_continuous_df = pd.DataFrame(
    encoded_continuous_data,
    columns=encoder.get_feature_names_out(continuous_features),
    index=df_recruitment.index,
)

# Drop the binned columns and append their one-hot encoded versions.
df_recruitment = df_recruitment.drop(columns=continuous_features)
df_recruitment = pd.concat([df_recruitment, encoded_continuous_df], axis=1)

print(df_recruitment.head())

   Gender  EducationLevel_2  EducationLevel_3  EducationLevel_4  \
0       1               1.0               0.0               0.0   
1       1               0.0               0.0               1.0   
2       0               1.0               0.0               0.0   
3       1               1.0               0.0               0.0   
4       0               0.0               0.0               0.0   

   RecruitmentStrategy_2  RecruitmentStrategy_3  Age_Age_bin_2  Age_Age_bin_3  \
0                    0.0                    0.0            0.0            0.0   
1                    1.0                    0.0            0.0            0.0   
2                    1.0                    0.0            0.0            0.0   
3                    0.0                    1.0            0.0            1.0   
4                    1.0                    0.0            1.0            0.0   

   Age_Age_bin_4  Age_Age_bin_5  ...  InterviewScore_InterviewScore_bin_4  \
0            0.0            0.0  



## Base de dados pré-processada

In [12]:
# Save the preprocessed dataset to Drive as CSV (row index not written).
# NOTE(review): HiringDecision was popped into Y earlier and is not added back in
# the cells visible here, so this file appears to hold features only — confirm.
final_recruitment_data_path = '/content/drive/MyDrive/PROJETO_MD_DOCS/bases/final_recruitment_data.csv'
df_recruitment.to_csv(final_recruitment_data_path, index=False)

In [13]:
# Preview the preprocessed frame that was written to CSV in the previous cell.
print(df_recruitment.head())

   Gender  EducationLevel_2  EducationLevel_3  EducationLevel_4  \
0       1               1.0               0.0               0.0   
1       1               0.0               0.0               1.0   
2       0               1.0               0.0               0.0   
3       1               1.0               0.0               0.0   
4       0               0.0               0.0               0.0   

   RecruitmentStrategy_2  RecruitmentStrategy_3  Age_Age_bin_2  Age_Age_bin_3  \
0                    0.0                    0.0            0.0            0.0   
1                    1.0                    0.0            0.0            0.0   
2                    1.0                    0.0            0.0            0.0   
3                    0.0                    1.0            0.0            1.0   
4                    1.0                    0.0            1.0            0.0   

   Age_Age_bin_4  Age_Age_bin_5  ...  InterviewScore_InterviewScore_bin_4  \
0            0.0            0.0  

In [14]:
# Class balance of the target: number of rows per HiringDecision value.
class_counts = Y.value_counts()
print("Quantidade de padrões por classe:", class_counts, sep="\n")

Quantidade de padrões por classe:
HiringDecision
0    1035
1     465
Name: count, dtype: int64


# Modelagem

## Separando dados em treino e teste

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 30% of the rows for the final test; the seed keeps the split fixed.
# NOTE(review): X was assigned before the preprocessing cells rebound
# df_recruitment, so it holds the raw feature columns — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [19]:
# Report how many rows ended up in each split.
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print('O dataset de treino possui {} candidatos e o de teste {} candidatos.'.format(n_train_rows, n_test_rows))

O dataset de treino possui 1050 candidatos e o de teste 450 candidatos.


## Definindo métricas

In [25]:
from sklearn.metrics import confusion_matrix, classification_report

In [26]:
def _print_interval(results, mean_label, interval_label):
    """Print the mean of ``results`` and the band ``mean ± 2·std`` as percentages.

    Args:
        results: Array-like of scores exposing ``.mean()`` and ``.std()``
            (e.g. the array returned by ``cross_val_score``).
        mean_label: Text printed before the mean, e.g. ``'Recall médio'``.
        interval_label: Metric name printed after ``'Intervalo de '``.
    """
    mean = results.mean()
    dv = results.std()
    print('{}: {:.2f}%'.format(mean_label, mean*100))
    # The band is mean ± 2 standard deviations; it is not clipped to 0–100%.
    print('Intervalo de {}: [{:.2f}% ~ {:.2f}%]'
          .format(interval_label, (mean - 2*dv)*100, (mean + 2*dv)*100))


# Accuracy
def intervalo(results):
    """Print mean accuracy and its ``mean ± 2·std`` band, in percent."""
    _print_interval(results, 'Acurácia média', 'acurácia')


# Precision
def intervalo_prec(results):
    """Print mean precision and its ``mean ± 2·std`` band, in percent."""
    _print_interval(results, 'Precisão média', 'Precisão')


# Recall
def intervalo_recall(results):
    """Print mean recall and its ``mean ± 2·std`` band, in percent."""
    _print_interval(results, 'Recall médio', 'Recall')


# F1-Score
def intervalo_f1(results):
    """Print mean F1-score and its ``mean ± 2·std`` band, in percent."""
    _print_interval(results, 'F1-Score médio', 'F1-Score')


## Treinamento



### Árvore de Decisão

In [175]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [162]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with library-default hyperparameters; only the random seed is fixed.
clfTree = DecisionTreeClassifier(random_state=42)

In [163]:
# Accuracy with the default 5-fold cross-validation (cv given as an integer).
# NOTE(review): per the scikit-learn docs an integer cv with a classifier already
# uses an unshuffled StratifiedKFold — confirm; if so, this run differs from the
# shuffled splitter used afterwards by shuffling, not by stratification.
results = cross_val_score(clfTree, X_train, y_train, cv = 5, scoring = 'accuracy')
intervalo(results)

Acurácia média: 86.95%
Intervalo de acurácia: [84.81% ~ 89.09%]


In [164]:
# Cross-validation splitter that keeps class proportions per fold:
# 5 stratified folds, shuffled with a fixed seed. Reused by every model below.
cv = StratifiedKFold(n_splits = 5, random_state=42, shuffle = True)

In [165]:
# Accuracy with the shuffled stratified splitter (no `scoring` passed, so the
# estimator's default scorer is used).
results = cross_val_score(clfTree, X_train, y_train, cv = cv)
intervalo(results)

Acurácia média: 89.33%
Intervalo de acurácia: [86.11% ~ 92.55%]


Outras métricas

In [176]:
# Precision, recall and F1 of the decision tree over the same stratified folds.
# Each pair is (scikit-learn scoring name, function that prints the summary).
metric_reports = (
    ('precision', intervalo_prec),
    ('recall', intervalo_recall),
    ('f1', intervalo_f1),
)
for scoring_name, show_interval in metric_reports:
    results = cross_val_score(clfTree, X_train, y_train, cv = cv, scoring = scoring_name)
    show_interval(results)
    print()

# Confusion matrix built from the cross-validated predictions on the training split.
y_pred = cross_val_predict(clfTree, X_train, y_train, cv = cv)
conf_matrix = confusion_matrix(y_train, y_pred)
print("MATRIZ DE CONFUSÃO:\n",conf_matrix)

Precisão média: 83.47%
Intervalo de Precisão: [74.26% ~ 92.68%]

Recall médio: 83.64%
Intervalo de Recall: [76.93% ~ 90.35%]

F1-Score médio: 83.42%
Intervalo de F1-Score: [79.04% ~ 87.79%]

MATRIZ DE CONFUSÃO:
 [[657  57]
 [ 55 281]]


### Regressão Logística

In [169]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with library-default hyperparameters; only the random seed is fixed.
clfLog = LogisticRegression(random_state=42)

In [177]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Cross-validated accuracy, precision, recall and F1 of the logistic regression.
# Each pair is (scikit-learn scoring name, function that prints the summary);
# None selects the estimator's default scorer, as in a call without `scoring`.
metric_reports = (
    (None, intervalo),
    ('precision', intervalo_prec),
    ('recall', intervalo_recall),
    ('f1', intervalo_f1),
)

# Silence warnings only inside this cell, and only convergence warnings, instead
# of disabling every warning for the rest of the notebook.
# NOTE(review): presumably the warning being hidden is the solver's
# ConvergenceWarning — confirm. If so, consider scaling the features or raising
# max_iter rather than hiding it.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)

    for scoring_name, show_interval in metric_reports:
        results = cross_val_score(clfLog, X_train, y_train, cv = cv, scoring = scoring_name)
        show_interval(results)
        print()

    # Confusion matrix built from the cross-validated predictions on the training split.
    y_pred = cross_val_predict(clfLog, X_train, y_train, cv = cv)
    conf_matrix = confusion_matrix(y_train, y_pred)
    print("MATRIZ DE CONFUSÃO:\n",conf_matrix)

Acurácia média: 84.29%
Intervalo de acurácia: [81.88% ~ 86.70%]

Precisão média: 77.79%
Intervalo de Precisão: [73.24% ~ 82.35%]

Recall médio: 71.43%
Intervalo de Recall: [60.55% ~ 82.31%]

F1-Score médio: 74.32%
Intervalo de F1-Score: [68.88% ~ 79.77%]

MATRIZ DE CONFUSÃO:
 [[645  69]
 [ 96 240]]


### KNN

In [171]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours with library-default hyperparameters (no arguments passed).
knn = KNeighborsClassifier()

In [178]:
# Cross-validated accuracy, precision, recall and F1 of KNN.
# Each pair is (scikit-learn scoring name, function that prints the summary);
# None selects the estimator's default scorer, as in a call without `scoring`.
metric_reports = (
    (None, intervalo),
    ('precision', intervalo_prec),
    ('recall', intervalo_recall),
    ('f1', intervalo_f1),
)
for scoring_name, show_interval in metric_reports:
    results = cross_val_score(knn, X_train, y_train, cv = cv, scoring = scoring_name)
    show_interval(results)
    print()

# Confusion matrix built from the cross-validated predictions on the training split.
y_pred = cross_val_predict(knn, X_train, y_train, cv = cv)
conf_matrix = confusion_matrix(y_train, y_pred)
print("MATRIZ DE CONFUSÃO:\n",conf_matrix)

Acurácia média: 67.90%
Intervalo de acurácia: [62.95% ~ 72.86%]

Precisão média: 50.10%
Intervalo de Precisão: [39.66% ~ 60.53%]

Recall médio: 36.61%
Intervalo de Recall: [28.41% ~ 44.81%]

F1-Score médio: 42.17%
Intervalo de F1-Score: [34.52% ~ 49.82%]

MATRIZ DE CONFUSÃO:
 [[590 124]
 [213 123]]


### Random Forest

In [174]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with library-default hyperparameters; only the random seed is fixed.
clfForest = RandomForestClassifier(random_state=42)

In [179]:
# Cross-validated accuracy, precision, recall and F1 of the random forest.
# Each pair is (scikit-learn scoring name, function that prints the summary);
# None selects the estimator's default scorer, as in a call without `scoring`.
metric_reports = (
    (None, intervalo),
    ('precision', intervalo_prec),
    ('recall', intervalo_recall),
    ('f1', intervalo_f1),
)
for scoring_name, show_interval in metric_reports:
    results = cross_val_score(clfForest, X_train, y_train, cv = cv, scoring = scoring_name)
    show_interval(results)
    print()

# Confusion matrix built from the cross-validated predictions on the training split.
y_pred = cross_val_predict(clfForest, X_train, y_train, cv = cv)
conf_matrix = confusion_matrix(y_train, y_pred)
print("MATRIZ DE CONFUSÃO:\n",conf_matrix)

Acurácia média: 91.71%
Intervalo de acurácia: [89.85% ~ 93.58%]

Precisão média: 90.51%
Intervalo de Precisão: [83.23% ~ 97.78%]

Recall médio: 83.02%
Intervalo de Recall: [77.71% ~ 88.34%]

F1-Score médio: 86.52%
Intervalo de F1-Score: [83.64% ~ 89.39%]

MATRIZ DE CONFUSÃO:
 [[684  30]
 [ 57 279]]


### XGBoost

In [180]:
from xgboost import XGBClassifier
# XGBoost classifier with library-default hyperparameters; only the random seed is fixed.
xgb = XGBClassifier(random_state=42)

In [181]:
# Cross-validated accuracy, precision, recall and F1 of XGBoost.
# Each pair is (scikit-learn scoring name, function that prints the summary);
# None selects the estimator's default scorer, as in a call without `scoring`.
metric_reports = (
    (None, intervalo),
    ('precision', intervalo_prec),
    ('recall', intervalo_recall),
    ('f1', intervalo_f1),
)
for scoring_name, show_interval in metric_reports:
    results = cross_val_score(xgb, X_train, y_train, cv = cv, scoring = scoring_name)
    show_interval(results)
    print()

# Confusion matrix built from the cross-validated predictions on the training split.
y_pred = cross_val_predict(xgb, X_train, y_train, cv = cv)
conf_matrix = confusion_matrix(y_train, y_pred)
print("MATRIZ DE CONFUSÃO:\n",conf_matrix)

Acurácia média: 92.00%
Intervalo de acurácia: [90.15% ~ 93.85%]

Precisão média: 90.35%
Intervalo de Precisão: [82.86% ~ 97.85%]

Recall médio: 84.22%
Intervalo de Recall: [80.07% ~ 88.36%]

F1-Score médio: 87.09%
Intervalo de F1-Score: [84.62% ~ 89.57%]

MATRIZ DE CONFUSÃO:
 [[683  31]
 [ 53 283]]


## Teste

### Árvore de Decisão

In [182]:
# Fresh decision tree (same settings as the cross-validated one) for the final test.
final_clfTree = DecisionTreeClassifier(random_state=42)

In [183]:
# Fit on the whole training split, then predict on both splits.
# NOTE(review): y_pred (training-set predictions) is not read by any cell
# visible here — confirm it is still needed.
final_clfTree.fit(X_train, y_train)
y_pred = final_clfTree.predict(X_train)
y_pred_test = final_clfTree.predict(X_test)

In [184]:
# Held-out test evaluation of the decision tree: confusion matrix plus the
# per-class precision/recall/F1 report.
class_names = ['Não aptos', 'Aptos']
conf_matrix = confusion_matrix(y_test, y_pred_test)
report_text = classification_report(y_test, y_pred_test, target_names=class_names)

print("MATRIZ DE CONFUSÃO:\n", conf_matrix)
print("\nRELATÓRIO DE CLASSIFICAÇÃO ÁRVORE DE DECISÃO:")
print(report_text)

MATRIZ DE CONFUSÃO:
 [[278  43]
 [ 28 101]]

RELATÓRIO DE CLASSIFICAÇÃO ÁRVORE DE DECISÃO:
              precision    recall  f1-score   support

   Não aptos       0.91      0.87      0.89       321
       Aptos       0.70      0.78      0.74       129

    accuracy                           0.84       450
   macro avg       0.80      0.82      0.81       450
weighted avg       0.85      0.84      0.84       450



### Regressão Logística

In [185]:
# Fresh logistic regression for the final test: fit on the whole training
# split, then predict on both splits.
# NOTE(review): y_pred (training-set predictions) is not read by any cell
# visible here — confirm it is still needed.
final_clfLog = LogisticRegression(random_state=42)
final_clfLog.fit(X_train, y_train)
y_pred = final_clfLog.predict(X_train)
y_pred_test = final_clfLog.predict(X_test)

In [186]:
# Held-out test evaluation of the logistic regression: confusion matrix plus
# the per-class precision/recall/F1 report.
conf_matrix = confusion_matrix(y_test,y_pred_test)
print("MATRIZ DE CONFUSÃO:\n",conf_matrix)
# Header typo fixed ("LOSÍSTICA" -> "LOGÍSTICA").
print("\nRELATÓRIO DE CLASSIFICAÇÃO REGRESSÃO LOGÍSTICA:")
print(classification_report(y_test, y_pred_test, target_names=['Não aptos', 'Aptos']))

MATRIZ DE CONFUSÃO:
 [[297  24]
 [ 31  98]]

RELATÓRIO DE CLASSIFICAÇÃO REGRESSÃO LOSÍSTICA:
              precision    recall  f1-score   support

   Não aptos       0.91      0.93      0.92       321
       Aptos       0.80      0.76      0.78       129

    accuracy                           0.88       450
   macro avg       0.85      0.84      0.85       450
weighted avg       0.88      0.88      0.88       450



### KNN

In [187]:
# Fresh KNN for the final test: fit on the whole training split, then predict
# on both splits.
# NOTE(review): y_pred (training-set predictions) is not read by any cell
# visible here — confirm it is still needed.
final_knn = KNeighborsClassifier()
final_knn.fit(X_train, y_train)
y_pred = final_knn.predict(X_train)
y_pred_test = final_knn.predict(X_test)

In [188]:
# Held-out test evaluation of KNN: confusion matrix plus the per-class
# precision/recall/F1 report.
class_names = ['Não aptos', 'Aptos']
conf_matrix = confusion_matrix(y_test, y_pred_test)
report_text = classification_report(y_test, y_pred_test, target_names=class_names)

print("MATRIZ DE CONFUSÃO:\n", conf_matrix)
print("\nRELATÓRIO DE CLASSIFICAÇÃO KNN:")
print(report_text)

MATRIZ DE CONFUSÃO:
 [[256  65]
 [ 79  50]]

RELATÓRIO DE CLASSIFICAÇÃO KNN:
              precision    recall  f1-score   support

   Não aptos       0.76      0.80      0.78       321
       Aptos       0.43      0.39      0.41       129

    accuracy                           0.68       450
   macro avg       0.60      0.59      0.60       450
weighted avg       0.67      0.68      0.67       450



### Random Forest

In [189]:
# Fresh random forest for the final test: fit on the whole training split,
# then predict on both splits.
# NOTE(review): y_pred (training-set predictions) is not read by any cell
# visible here — confirm it is still needed.
final_clfForest = RandomForestClassifier(random_state=42)
final_clfForest.fit(X_train, y_train)
y_pred = final_clfForest.predict(X_train)
y_pred_test = final_clfForest.predict(X_test)

In [190]:
# Held-out test evaluation of the random forest: confusion matrix plus the
# per-class precision/recall/F1 report.
conf_matrix = confusion_matrix(y_test,y_pred_test)
print("MATRIZ DE CONFUSÃO:\n",conf_matrix)
# Header corrected: this report belongs to the random forest, not KNN.
print("\nRELATÓRIO DE CLASSIFICAÇÃO RANDOM FOREST:")
print(classification_report(y_test, y_pred_test, target_names=['Não aptos', 'Aptos']))

MATRIZ DE CONFUSÃO:
 [[313   8]
 [ 25 104]]

RELATÓRIO DE CLASSIFICAÇÃO KNN:
              precision    recall  f1-score   support

   Não aptos       0.93      0.98      0.95       321
       Aptos       0.93      0.81      0.86       129

    accuracy                           0.93       450
   macro avg       0.93      0.89      0.91       450
weighted avg       0.93      0.93      0.93       450



### XGBoost

In [191]:
# Fresh XGBoost classifier for the final test: fit on the whole training split,
# then predict on both splits.
# NOTE(review): y_pred (training-set predictions) is not read by any cell
# visible here — confirm it is still needed.
final_xgb = XGBClassifier(random_state=42)
final_xgb.fit(X_train, y_train)
y_pred = final_xgb.predict(X_train)
y_pred_test = final_xgb.predict(X_test)

In [192]:
# Held-out test evaluation of XGBoost: confusion matrix plus the per-class
# precision/recall/F1 report.
conf_matrix = confusion_matrix(y_test,y_pred_test)
print("MATRIZ DE CONFUSÃO:\n",conf_matrix)
# Header corrected: this report belongs to XGBoost, not KNN.
print("\nRELATÓRIO DE CLASSIFICAÇÃO XGBOOST:")
print(classification_report(y_test, y_pred_test, target_names=['Não aptos', 'Aptos']))

MATRIZ DE CONFUSÃO:
 [[311  10]
 [ 22 107]]

RELATÓRIO DE CLASSIFICAÇÃO KNN:
              precision    recall  f1-score   support

   Não aptos       0.93      0.97      0.95       321
       Aptos       0.91      0.83      0.87       129

    accuracy                           0.93       450
   macro avg       0.92      0.90      0.91       450
weighted avg       0.93      0.93      0.93       450

