In [1]:
import pandas as pd
import numpy as np

In [None]:
# Load the TOI export, drop the two bookkeeping columns, and show a first
# overview of the data (shape, preview, dtypes, statistics, missing values).
print('Loading dataframe...')
# Lines starting with '#' are treated as comments; malformed rows are skipped.
df = pd.read_csv('data/TOI_2025.10.04_05.29.58.csv', comment='#', on_bad_lines='skip')
df = df.drop(columns=['toi_created', 'rowupdate'])
print('Dataframe loaded successfully!', end='\n\n')

print('Dataframe shape:', df.shape, end='\n\n')

print('Dataframe preview:')
display(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() would only add a stray "None" to the cell output.
df.info()
display(df.describe())  # Statistical summary of numerical columns
display(df.isnull().sum())  # Count of missing values per column



In [None]:
# Discard every row that has a missing value in any column.
# Reassigning (instead of inplace=True) keeps the step explicit and chainable.
# NOTE(review): because all incomplete rows are removed here, the median
# imputer in the modeling pipeline has nothing left to impute -- confirm that
# dropping these rows (rather than imputing them) is intended.
df = df.dropna()
print('Dataframe shape after dropping missing values:', df.shape, end='\n\n')

# Classificação com SVM usando scikit-learn
Vamos treinar uma SVM para prever `tfopwg_disp` (disposição TFOPWG). O pipeline a seguir inclui:
- imputação de valores faltantes (mediana),
- padronização (StandardScaler),
- SVC (com busca em grade para hiperparâmetros).

In [None]:
# Imports for the modeling part of the notebook.
# NOTE(review): the %pip line below installs matplotlib without a version pin
# every time this cell runs -- consider pinning it or moving the install to
# environment setup.
%pip install matplotlib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import joblib
# numpy is already imported in the first cell; this re-import is harmless.
import numpy as np
print('Imports carregados')

In [None]:
# Work on an independent copy of the first 3000 rows only.
df = df.head(3000).copy()

# Treat the target column as categorical and show how many rows each class has.
target_col = 'tfopwg_disp'
df[target_col] = df[target_col].astype('category')
print('Contagem por classe (tfopwg_disp):')
class_counts = df[target_col].value_counts()
display(class_counts)

# Keep only the rows whose target is present.
has_target = df[target_col].notna()
df = df.loc[has_target]
print('Shape após filtrar target:', df.shape)

In [None]:
# Feature selection: every numeric column is a candidate feature, except the
# transit-midpoint column 'pl_tranmid', which is a timestamp.
numeric_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col != 'pl_tranmid'
]

# Build the feature matrix X and the target vector y as independent copies.
X = df.loc[:, numeric_cols].copy()
y = df['tfopwg_disp'].copy()

print('Features usadas (numéricas):', numeric_cols)
print('Missing values por feature:')
missing_per_feature = X.isna().sum()
display(missing_per_feature)

In [None]:
# Stratified train/test split and an SVM pipeline tuned with a grid search.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Median imputation -> standardization -> SVC with class re-weighting.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('svc', SVC(class_weight='balanced'))
])

c_values = [0.1, 1, 10, 100]
gamma_values = ['scale', 'auto', 0.001, 0.01, 0.1]
coef0_values = [0.0, 0.1, 0.5, 1.0]
degree_values = [2, 3, 4]

# One sub-grid per kernel, each listing only the hyperparameters that kernel
# actually uses: gamma applies to rbf/poly/sigmoid, coef0 to poly/sigmoid and
# degree to poly only. A single crossed grid would refit identical models many
# times (960 candidates instead of the 344 distinct ones searched here).
# The polynomial kernel is spelled 'poly' in scikit-learn; 'polynomial' is not
# an accepted value and makes every fit that uses it fail.
param_grid = [
    {
        'svc__kernel': ['linear'],
        'svc__C': c_values,
    },
    {
        'svc__kernel': ['rbf'],
        'svc__C': c_values,
        'svc__gamma': gamma_values,
    },
    {
        'svc__kernel': ['sigmoid'],
        'svc__C': c_values,
        'svc__gamma': gamma_values,
        'svc__coef0': coef0_values,
    },
    {
        'svc__kernel': ['poly'],
        'svc__C': c_values,
        'svc__gamma': gamma_values,
        'svc__degree': degree_values,
        'svc__coef0': coef0_values,
    },
]

# 5-fold cross-validation on the training split, scored by macro-averaged F1,
# using all available cores.
grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='f1_macro')
print('Iniciando GridSearchCV (pode demorar) ...')
grid.fit(X_train, y_train)
print('Melhores parâmetros:', grid.best_params_)
print('Melhor CV score:', grid.best_score_)

In [None]:
# Evaluate the best pipeline found by the grid search on the held-out test set.
best = grid.best_estimator_
class_labels = best.classes_
y_pred = best.predict(X_test)

# Per-class precision / recall / F1 as text.
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix, with rows and columns in the estimator's class order.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(ax=ax, cmap='Blues')
plt.title('Matriz de confusão')
plt.show()

In [None]:
# Persist the best fitted pipeline to disk so it can be reloaded later.
fitted_pipeline = grid.best_estimator_
joblib.dump(fitted_pipeline, 'svm_tfopwg_disp_pipeline.joblib')
print('Modelo salvo em svm_tfopwg_disp_pipeline.joblib')