In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Carregar datasets
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Ver dimensões
print("Train:", train.shape)
print("Test:", test.shape)

# Olhar primeiras linhas
print(train.head())

# Ver estatísticas gerais
print(train.describe(include="all"))
print(test.describe(include="all"))

Train: (646, 33)
Test: (277, 32)
    id  age_first_funding_year  age_last_funding_year  \
0  719                   10.42                  13.09   
1  429                    3.79                   3.79   
2  178                    0.71                   2.28   
3  197                    3.00                   5.00   
4  444                    0.66                   5.88   

   age_first_milestone_year  age_last_milestone_year  relationships  \
0                      8.98                    12.72              4   
1                       NaN                      NaN             21   
2                      1.95                     2.28              5   
3                      9.62                    10.39             16   
4                      6.21                     8.61             29   

   funding_rounds  funding_total_usd  milestones  is_CA  ...  is_consulting  \
0               3            4087500           3      1  ...              0   
1               1           45000000   

In [65]:
# ANÁLISE EXPLORATÓRIA INICIAL
# Verificar informações gerais dos dados
print("=== INFORMAÇÕES GERAIS ===")
print("Train info:")
print(train.info())
print("\nTest info:")
print(test.info())

# Verificar valores ausentes
print("\n=== VALORES AUSENTES ===")
print("Train - valores nulos:")
print(train.isnull().sum())
print("\nTest - valores nulos:")
print(test.isnull().sum())

# Verificar duplicatas
print("\n=== DUPLICATAS ===")
print(f"Train duplicatas: {train.duplicated().sum()}")
print(f"Test duplicatas: {test.duplicated().sum()}")


=== INFORMAÇÕES GERAIS ===
Train info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 646 entries, 0 to 645
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        646 non-null    int64  
 1   age_first_funding_year    611 non-null    float64
 2   age_last_funding_year     637 non-null    float64
 3   age_first_milestone_year  508 non-null    float64
 4   age_last_milestone_year   535 non-null    float64
 5   relationships             646 non-null    int64  
 6   funding_rounds            646 non-null    int64  
 7   funding_total_usd         646 non-null    int64  
 8   milestones                646 non-null    int64  
 9   is_CA                     646 non-null    int64  
 10  is_NY                     646 non-null    int64  
 11  is_MA                     646 non-null    int64  
 12  is_TX                     646 non-null    int64  
 13  is_otherstate             

In [66]:
categorial_cols = train.select_dtypes(include=['object']).columns
numerical_cols = train.select_dtypes(include=['number']).columns
print("\n=== COLUNAS CATEGÓRICAS ===")
print(categorial_cols) 
print("\n=== COLUNAS NUMÉRICAS ===")
print(numerical_cols)


=== COLUNAS CATEGÓRICAS ===
Index(['category_code'], dtype='object')

=== COLUNAS NUMÉRICAS ===
Index(['id', 'age_first_funding_year', 'age_last_funding_year',
       'age_first_milestone_year', 'age_last_milestone_year', 'relationships',
       'funding_rounds', 'funding_total_usd', 'milestones', 'is_CA', 'is_NY',
       'is_MA', 'is_TX', 'is_otherstate', 'is_software', 'is_web', 'is_mobile',
       'is_enterprise', 'is_advertising', 'is_gamesvideo', 'is_ecommerce',
       'is_biotech', 'is_consulting', 'is_othercategory', 'has_VC',
       'has_angel', 'has_roundA', 'has_roundB', 'has_roundC', 'has_roundD',
       'avg_participants', 'labels'],
      dtype='object')


In [67]:
def tratar_valores_ausentes(df):
    for col in df.columns:
        if col in categorial_cols:
            df[col].fillna(df[col].mode()[0], inplace=True)
        else:
            df[col].fillna(df[col].median(), inplace=True)
    return df

train = tratar_valores_ausentes(train)
test = tratar_valores_ausentes(test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [68]:
def identificar_outliers_iqr(df):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    return outliers

for col in numerical_cols:
    outliers = identificar_outliers_iqr(train)
    print(f"Coluna {col} - Outliers: {len(outliers)}")

# Tratamento de outliers 

Coluna id - Outliers: 0
Coluna age_first_funding_year - Outliers: 20
Coluna age_last_funding_year - Outliers: 11
Coluna age_first_milestone_year - Outliers: 41
Coluna age_last_milestone_year - Outliers: 22
Coluna relationships - Outliers: 47
Coluna funding_rounds - Outliers: 10
Coluna funding_total_usd - Outliers: 50
Coluna milestones - Outliers: 0
Coluna is_CA - Outliers: 0
Coluna is_NY - Outliers: 71
Coluna is_MA - Outliers: 61
Coluna is_TX - Outliers: 24
Coluna is_otherstate - Outliers: 136
Coluna is_software - Outliers: 105
Coluna is_web - Outliers: 97
Coluna is_mobile - Outliers: 65
Coluna is_enterprise - Outliers: 53
Coluna is_advertising - Outliers: 45
Coluna is_gamesvideo - Outliers: 37
Coluna is_ecommerce - Outliers: 20
Coluna is_biotech - Outliers: 25
Coluna is_consulting - Outliers: 2
Coluna is_othercategory - Outliers: 0
Coluna has_VC - Outliers: 0
Coluna has_angel - Outliers: 0
Coluna has_roundA - Outliers: 0
Coluna has_roundB - Outliers: 0
Coluna has_roundC - Outliers: 15

In [69]:
#Colunas categóricas
from sklearn.preprocessing import OneHotEncoder

def processar_colunas_categoricas(df):
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    df_encoded = pd.DataFrame(encoder.fit_transform(df[categorial_cols]))
    df_encoded.columns = encoder.get_feature_names_out(categorial_cols)
    df = df.drop(categorial_cols, axis=1)
    df = pd.concat([df, df_encoded], axis=1)
    return df

train_encoded = processar_colunas_categoricas(train)
test_encoded = processar_colunas_categoricas(test)


In [70]:
# Normalização das colunas numéricas - VERSÃO MELHORADA
from sklearn.preprocessing import StandardScaler

def normalizar_colunas_numericas(df, excluir_colunas=None):
    """Normaliza todas as colunas numéricas exceto binárias e de ID"""
    df_normalized = df.copy()
    
    if excluir_colunas is None:
        excluir_colunas = []
    
    # Identificar colunas para normalizar
    colunas_para_normalizar = []
    
    for col in df_normalized.columns:
        # Pular se estiver na lista de exclusão
        if col in excluir_colunas:
            continue
            
        # Pular colunas de ID (que contenham 'id' no nome)
        if 'id' in col.lower():
            print(f"⚠️ Pulando coluna de ID: {col}")
            continue
            
        # Verificar se é coluna numérica
        if df_normalized[col].dtype in ['int64', 'float64', 'int32', 'float32']:
            # Verificar se é coluna binária (só tem valores 0 e 1)
            valores_unicos = df_normalized[col].dropna().unique()
            
            # Se só tem 2 valores únicos e são 0 e 1, é binária
            if len(valores_unicos) == 2 and set(valores_unicos) == {0, 1}:
                print(f"⚠️ Pulando coluna binária: {col}")
                continue
            
            # Se só tem 2 valores únicos mas não são 0 e 1, ainda pode ser categórica
            elif len(valores_unicos) == 2:
                print(f"⚠️ Pulando coluna com 2 valores únicos: {col} (valores: {valores_unicos})")
                continue
                
            # Se tem mais variação, normalizar
            else:
                colunas_para_normalizar.append(col)
    
    # Aplicar normalização
    if colunas_para_normalizar:
        scaler = StandardScaler()
        df_normalized[colunas_para_normalizar] = scaler.fit_transform(df_normalized[colunas_para_normalizar])
        print(f"✅ Normalizadas {len(colunas_para_normalizar)} colunas:")
        for col in colunas_para_normalizar:
            print(f"   - {col}")
    else:
        print("⚠️ Nenhuma coluna encontrada para normalizar")
    
    return df_normalized

# Aplicar normalização
# Para train: excluir a coluna target 'labels'
train_features = train_encoded.drop('labels', axis=1, errors='ignore')
train_normalized = normalizar_colunas_numericas(train_features, excluir_colunas=['labels'])

# Para test: sem exclusões específicas (não tem labels)
test_normalized = normalizar_colunas_numericas(test_encoded)

# Adicionar de volta a coluna 'labels' ao train
if 'labels' in train_encoded.columns:
    train_normalized['labels'] = train_encoded['labels']

print(f"\n📊 Shapes após normalização:")
print(f"   Train: {train_normalized.shape}")
print(f"   Test: {test_normalized.shape}")

# Verificar tipos de dados finais
print(f"\n📋 Tipos de dados após normalização:")
print("Train:")
print(train_normalized.dtypes.value_counts())
print("\nTest:")
print(test_normalized.dtypes.value_counts())

⚠️ Pulando coluna de ID: id
⚠️ Pulando coluna binária: is_CA
⚠️ Pulando coluna binária: is_NY
⚠️ Pulando coluna binária: is_MA
⚠️ Pulando coluna binária: is_TX
⚠️ Pulando coluna binária: is_otherstate
⚠️ Pulando coluna binária: is_software
⚠️ Pulando coluna binária: is_web
⚠️ Pulando coluna binária: is_mobile
⚠️ Pulando coluna binária: is_enterprise
⚠️ Pulando coluna binária: is_advertising
⚠️ Pulando coluna de ID: is_gamesvideo
⚠️ Pulando coluna binária: is_ecommerce
⚠️ Pulando coluna binária: is_biotech
⚠️ Pulando coluna binária: is_consulting
⚠️ Pulando coluna binária: is_othercategory
⚠️ Pulando coluna binária: has_VC
⚠️ Pulando coluna binária: has_angel
⚠️ Pulando coluna binária: has_roundA
⚠️ Pulando coluna binária: has_roundB
⚠️ Pulando coluna binária: has_roundC
⚠️ Pulando coluna binária: has_roundD
⚠️ Pulando coluna binária: category_code_advertising
⚠️ Pulando coluna binária: category_code_analytics
⚠️ Pulando coluna binária: category_code_automotive
⚠️ Pulando coluna binária

In [76]:
# MODELAGEM - MÚLTIPLOS ALGORITMOS
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

target_col = 'labels'
train_final = train_normalized.copy()
test_final = test_normalized.copy()

X_train = train_final.drop(target_col, axis=1)
y_train = train_final[target_col]
X_test = test_final.copy()

# 1. Dividir dados para validação
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

modelos = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

resultados = {}

for nome, modelo in modelos.items():
    print(f"\n=== Treinando modelo: {nome} ===")

    cv_scores = cross_val_score(modelo, X_train_split, y_train_split, cv=5, scoring='roc_auc')
    print(f"Cross-validation ROC AUC scores: {cv_scores}")

    #Treinar modelo
    modelo.fit(X_train_split, y_train_split)

    #Prever no conjunto de validação
    y_pred = modelo.predict(X_val)

    #Métricas para avaliar modelo
    resultados[nome] = {
        'modelo': modelo,
        'report': classification_report(y_val, y_pred, output_dict=True),
        'confusion_matrix': confusion_matrix(y_val, y_pred),
        'roc_auc': roc_auc_score(y_val, modelo.predict_proba(X_val)[:, 1])
    }
    print("Classification Report:")
    print(classification_report(y_val, y_pred))
    print(f"ROC AUC: {resultados[nome]['roc_auc']}")


=== Treinando modelo: Random Forest ===
Cross-validation ROC AUC scores: [0.75820975 0.85619703 0.86179957 0.76158405 0.68561422]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.59      0.66        68
           1       0.80      0.89      0.84       126

    accuracy                           0.78       194
   macro avg       0.77      0.74      0.75       194
weighted avg       0.78      0.78      0.78       194

ROC AUC: 0.8265056022408963

=== Treinando modelo: Gradient Boosting ===
Cross-validation ROC AUC scores: [0.75158898 0.82944915 0.82273707 0.75538793 0.6799569 ]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.57      0.65        68
           1       0.80      0.90      0.84       126

    accuracy                           0.78       194
   macro avg       0.77      0.74      0.75       194
weighted avg       0.78      0.78      0.78       194

ROC AU