Importando libs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns

from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.svm import SVC

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder, PolynomialFeatures, PowerTransformer, MinMaxScaler

from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay

from sklearn.neural_network import MLPClassifier

from sklearn.decomposition import PCA

from utils import *

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from skopt.space import Real, Integer, Categorical


Carregando datasets

In [2]:
# Read the training and production CSV files (relative paths).
train, prod = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/production.csv')
)

# Preview the first rows of the training data.
train.head()

Unnamed: 0,cgpa,internships,projects,workshops_certifications,aptitude_test_score,soft_skills_rating,extra_curricular_activities,placement_training,ssc_marks,hsc_marks,n_job_applications,personality_type,placement_status,student_id
0,8.4,2,2,0,74,4.4,Yes,No,79,85,4,2,0,4085
1,7.6,1,2,1,75,4.3,Yes,Yes,80,67,7,4,1,2992
2,7.6,1,2,1,61,3.2,No,No,61,57,7,2,0,8623
3,7.5,1,2,0,78,4.4,Yes,Yes,61,71,7,2,0,8811
4,8.45,2,3,2,82,4.89,Yes,Yes,75,93,23,3,1,10072


Separando features de target e removendo id dos estudantes

In [3]:
# The target is the placement status; every other column except the student
# identifier is kept as a feature.
y = train['placement_status']
X = train.drop(columns=['student_id', 'placement_status'])

# Optional feature-engineering step from utils (currently disabled).
#X = featureExtractor(X)

# Encode the Yes/No flags as 1/0; any value other than 'Yes' becomes 0.
for flag_col in ('placement_training', 'extra_curricular_activities'):
    X[flag_col] = (X[flag_col] == 'Yes').astype('int64')

display(X.head())
display(y.head())

Unnamed: 0,cgpa,internships,projects,workshops_certifications,aptitude_test_score,soft_skills_rating,extra_curricular_activities,placement_training,ssc_marks,hsc_marks,n_job_applications,personality_type
0,8.4,2,2,0,74,4.4,1,0,79,85,4,2
1,7.6,1,2,1,75,4.3,1,1,80,67,7,4
2,7.6,1,2,1,61,3.2,0,0,61,57,7,2
3,7.5,1,2,0,78,4.4,1,1,61,71,7,2
4,8.45,2,3,2,82,4.89,1,1,75,93,23,3


0    0
1    1
2    0
3    0
4    1
Name: placement_status, dtype: int64

Separando em treino e teste

In [4]:
# Hold out 20% of the rows as a test set, stratified on the target so both
# splits keep the same class proportions.
# random_state is fixed so the split -- and every metric computed on it -- is
# reproducible after a kernel restart; without it each run evaluates the models
# on a different hold-out set and the scores are not comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

print(f'train --> features shape: {X_train.shape}, target shape: {y_train.shape}')
print(f'test  --> features shape: {X_test.shape}, target shape: {y_test.shape}')

train --> features shape: (7360, 12), target shape: (7360,)
test  --> features shape: (1840, 12), target shape: (1840,)


Separando variáveis categóricas das numéricas

In [5]:
# Column groups used by the ColumnTransformer.
#categorical_cols = ['extra_curricular_activities', 'placement_training','personality_type']
categorical_cols = ['personality_type']

# Every feature that is not listed as categorical is treated as numeric,
# keeping the original column order of X.
numeric_cols = [col for col in X.columns if col not in categorical_cols]

print('Features numericas:', numeric_cols)
print('Features categoricas: ', categorical_cols)

Features numericas: ['cgpa', 'internships', 'projects', 'workshops_certifications', 'aptitude_test_score', 'soft_skills_rating', 'extra_curricular_activities', 'placement_training', 'ssc_marks', 'hsc_marks', 'n_job_applications']
Features categoricas:  ['personality_type']


Definindo modelos

In [6]:
# Candidate classifiers keyed by display name. Insertion order matters: it is
# the order in which the models are cross-validated and reported.
models = {
    'LogisticRegression': LogisticRegression(
        class_weight='balanced',
        C=1,
    ),
    # Disabled candidates, kept for reference:
    #'Knn': KNeighborsClassifier(),
    #'DecisionTreeClassifier': DecisionTreeClassifier(class_weight='balanced'),
    'RandomForestClassifier': RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced',
        random_state=0,
    ),
    'XGBClassifier': XGBClassifier(
        n_estimators=800,
        learning_rate=1e-2,
        random_state=42,
    ),
    'ExtraTreesClassifier': ExtraTreesClassifier(random_state=42),
    # Small two-layer network; early stopping is off and the iteration cap is
    # set very high, so training runs until the optimizer's own stopping rule.
    'MLP': MLPClassifier(
        random_state=42,
        activation='relu',
        hidden_layer_sizes=(4, 4),
        n_iter_no_change=100,
        max_iter=100_000_000,
        early_stopping=False,
    ),
}

Definindo pipeline de pré-processamento

In [7]:
# One-hot encode the categorical features.
# handle_unknown='ignore' makes a category that was not present at fit time
# encode as an all-zero row instead of raising at transform time, which would
# otherwise break a cross-validation fold or production scoring as soon as a
# new category value shows up.
category_processing = Pipeline(
    steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))]
)

# Standardize the numeric features (zero mean, unit variance).
numeric_processing = Pipeline(steps=[('scaler', StandardScaler())])

# Route each column group through its own sub-pipeline; any column that is in
# neither group is passed through unchanged.
preprocessing = ColumnTransformer(
    [
        ('category_preprocessing', category_processing, categorical_cols),
        ('numeric_preprocessing', numeric_processing, numeric_cols),
    ],
    remainder='passthrough',
)


In [8]:
# Cross-validate every candidate model on the training split and collect one
# row of metrics per model.
results = []
for name, model in models.items():
    print(f'Métricas para {name}')

    clf = Pipeline(steps=[('preprocessing', preprocessing), ('model', model)])
    # NOTE(review): the last argument (5) is presumably the number of folds --
    # confirm against utils.validacaoCruzada.
    metrics = validacaoCruzada(X_train, y_train, clf, 5)

    row = {'model': name}
    row.update(metrics)
    results.append(row)

# Rank the models from lowest to highest log loss.
pd.DataFrame(results).sort_values(by='logloss')

Métricas para LogisticRegression
Métricas para RandomForestClassifier
Métricas para XGBClassifier
Métricas para ExtraTreesClassifier
Métricas para MLP


Unnamed: 0,model,logloss,f1,precision,recall,accuracy
2,XGBClassifier,0.050237,0.970441,0.968107,0.972838,0.975136
4,MLP,0.054648,0.97013,0.967809,0.972487,0.974864
1,RandomForestClassifier,0.060986,0.967129,0.978509,0.956134,0.97269
3,ExtraTreesClassifier,0.066903,0.966537,0.979046,0.95447,0.972283
0,LogisticRegression,0.069042,0.9612,0.952287,0.970311,0.96712


In [9]:
# Baseline: fit the untuned XGBoost configuration and evaluate it on the
# held-out test split.
xgb_baseline = XGBClassifier(n_estimators=800, learning_rate=1e-2, random_state=42)
clf = Pipeline(steps=[('preprocessing', preprocessing), ('model', xgb_baseline)])

avaliacao_sem_otimizacao = avaliarModelo(X_train, X_test, y_train, y_test, clf)

# NOTE(review): element 1 looks like the test log loss -- confirm against
# utils.avaliarModelo.
avaliacao_sem_otimizacao[1]

0.05376041758500899

In [10]:
# Pipeline to tune. tree_method is pinned to 'hist' on the estimator instead of
# being searched: it is a hardware/algorithm switch rather than a model
# hyperparameter, and the 'gpu_hist' / 'gpu_predictor' options previously in the
# search space fail on machines without a CUDA build and are deprecated or
# removed in XGBoost 2.x. 'hist' also keeps grow_policy / max_leaves applicable.
clf = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('model', XGBClassifier(random_state=42, tree_method='hist')),
    ]
)


params = {
    # --- Global model control ---
    # Log-uniform prior favours small values. The upper bound is 1.0 because
    # XGBoost documents eta/learning_rate as a value in [0, 1].
    'model__learning_rate': Real(0.001, 1.0, prior='log-uniform'),
    'model__n_estimators': Integer(50, 1200),  # from a handful of trees to very large ensembles

    # --- Tree complexity ---
    'model__max_depth': Integer(2, 100),
    'model__max_leaves': Integer(0, 100),  # mainly relevant when grow_policy='lossguide'
    'model__grow_policy': Categorical(['depthwise', 'lossguide']),  # tree growth strategy
    'model__min_child_weight': Real(1e-3, 10, prior='log-uniform'),  # allows very small splits
    'model__gamma': Real(0, 5, transform='identity'),  # minimum loss reduction required to split

    # --- Row / feature sampling ---
    'model__subsample': Real(0.4, 1.0),  # allows aggressive row subsampling
    'model__colsample_bytree': Real(0.3, 1.0),
    'model__colsample_bylevel': Real(0.5, 1.0),  # features sampled per tree level
    'model__colsample_bynode': Real(0.5, 1.0),  # features sampled per node

    # --- Regularization ---
    'model__reg_alpha': Real(1e-3, 100, prior='log-uniform'),  # L1
    'model__reg_lambda': Real(1e-3, 100, prior='log-uniform'),  # L2
    'model__max_delta_step': Integer(0, 10),  # can help with imbalanced classes
}


# Search the space above on the training split only; the test split is used
# just once below, for the final evaluation of the tuned pipeline.
clf_optimized , best_score = otimizarModelo(X_train, y_train, clf, params, n_iter = 20)

print(f'best_score after fine tunning: {best_score}')


avaliacao_otimizada = avaliarModelo(X_train, X_test, y_train, y_test,clf_optimized)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


KeyboardInterrupt: 

In [None]:
# Compare element 1 of each evaluation result: baseline vs. tuned pipeline.
# NOTE(review): avaliacao_otimizada is only defined once the tuning cell has run
# to completion; re-run that cell before trusting this comparison.
(avaliacao_sem_otimizacao[1], avaliacao_otimizada[1])

(0.05906293676921654, 0.05749430886250974)