In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Carregar datasets
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Ver dimens√µes
print("Train:", train.shape)
print("Test:", test.shape)

# Olhar primeiras linhas
print(train.head())

# Ver estat√≠sticas gerais
print(train.describe(include="all"))
print(test.describe(include="all"))

Train: (646, 33)
Test: (277, 32)
    id  age_first_funding_year  age_last_funding_year  \
0  719                   10.42                  13.09   
1  429                    3.79                   3.79   
2  178                    0.71                   2.28   
3  197                    3.00                   5.00   
4  444                    0.66                   5.88   

   age_first_milestone_year  age_last_milestone_year  relationships  \
0                      8.98                    12.72              4   
1                       NaN                      NaN             21   
2                      1.95                     2.28              5   
3                      9.62                    10.39             16   
4                      6.21                     8.61             29   

   funding_rounds  funding_total_usd  milestones  is_CA  ...  is_consulting  \
0               3            4087500           3      1  ...              0   
1               1           45000000   

In [28]:
# AN√ÅLISE EXPLORAT√ìRIA INICIAL
# Verificar informa√ß√µes gerais dos dados
print("=== INFORMA√á√ïES GERAIS ===")
print("Train info:")
print(train.info())
print("\nTest info:")
print(test.info())

# Verificar valores ausentes
print("\n=== VALORES AUSENTES ===")
print("Train - valores nulos:")
print(train.isnull().sum())
print("\nTest - valores nulos:")
print(test.isnull().sum())

# Verificar duplicatas
print("\n=== DUPLICATAS ===")
print(f"Train duplicatas: {train.duplicated().sum()}")
print(f"Test duplicatas: {test.duplicated().sum()}")


=== INFORMA√á√ïES GERAIS ===
Train info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 646 entries, 0 to 645
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        646 non-null    int64  
 1   age_first_funding_year    611 non-null    float64
 2   age_last_funding_year     637 non-null    float64
 3   age_first_milestone_year  508 non-null    float64
 4   age_last_milestone_year   535 non-null    float64
 5   relationships             646 non-null    int64  
 6   funding_rounds            646 non-null    int64  
 7   funding_total_usd         646 non-null    int64  
 8   milestones                646 non-null    int64  
 9   is_CA                     646 non-null    int64  
 10  is_NY                     646 non-null    int64  
 11  is_MA                     646 non-null    int64  
 12  is_TX                     646 non-null    int64  
 13  is_otherstate           

In [29]:
categorial_cols = train.select_dtypes(include=['object']).columns
numerical_cols = train.select_dtypes(include=['number']).columns
print("\n=== COLUNAS CATEG√ìRICAS ===")
print(categorial_cols) 
print("\n=== COLUNAS NUM√âRICAS ===")
print(numerical_cols)


=== COLUNAS CATEG√ìRICAS ===
Index(['category_code'], dtype='object')

=== COLUNAS NUM√âRICAS ===
Index(['id', 'age_first_funding_year', 'age_last_funding_year',
       'age_first_milestone_year', 'age_last_milestone_year', 'relationships',
       'funding_rounds', 'funding_total_usd', 'milestones', 'is_CA', 'is_NY',
       'is_MA', 'is_TX', 'is_otherstate', 'is_software', 'is_web', 'is_mobile',
       'is_enterprise', 'is_advertising', 'is_gamesvideo', 'is_ecommerce',
       'is_biotech', 'is_consulting', 'is_othercategory', 'has_VC',
       'has_angel', 'has_roundA', 'has_roundB', 'has_roundC', 'has_roundD',
       'avg_participants', 'labels'],
      dtype='object')


In [30]:
def tratar_valores_ausentes(df):
    for col in df.columns:
        if col in categorial_cols:
            df[col].fillna(df[col].mode()[0], inplace=True)
        else:
            df[col].fillna(df[col].median(), inplace=True)
    return df

train = tratar_valores_ausentes(train)
test = tratar_valores_ausentes(test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [31]:
def identificar_outliers_iqr(df):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    return outliers

for col in numerical_cols:
    outliers = identificar_outliers_iqr(train)
    print(f"Coluna {col} - Outliers: {len(outliers)}")

'''
# Tratamento de outliers (truncamento)
def tratar_outliers_truncamento(df):
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
        df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])
    return df
'''

Coluna id - Outliers: 0
Coluna age_first_funding_year - Outliers: 20
Coluna age_last_funding_year - Outliers: 11
Coluna age_first_milestone_year - Outliers: 41
Coluna age_last_milestone_year - Outliers: 22
Coluna relationships - Outliers: 47
Coluna funding_rounds - Outliers: 10
Coluna funding_total_usd - Outliers: 50
Coluna milestones - Outliers: 0
Coluna is_CA - Outliers: 0
Coluna is_NY - Outliers: 71
Coluna is_MA - Outliers: 61
Coluna is_TX - Outliers: 24
Coluna is_otherstate - Outliers: 136
Coluna is_software - Outliers: 105
Coluna is_web - Outliers: 97
Coluna is_mobile - Outliers: 65
Coluna is_enterprise - Outliers: 53
Coluna is_advertising - Outliers: 45
Coluna is_gamesvideo - Outliers: 37
Coluna is_ecommerce - Outliers: 20
Coluna is_biotech - Outliers: 25
Coluna is_consulting - Outliers: 2
Coluna is_othercategory - Outliers: 0
Coluna has_VC - Outliers: 0
Coluna has_angel - Outliers: 0
Coluna has_roundA - Outliers: 0
Coluna has_roundB - Outliers: 0
Coluna has_roundC - Outliers: 15

'\n# Tratamento de outliers (truncamento)\ndef tratar_outliers_truncamento(df):\n    for col in numerical_cols:\n        Q1 = df[col].quantile(0.25)\n        Q3 = df[col].quantile(0.75)\n        IQR = Q3 - Q1\n        lower_bound = Q1 - 1.5 * IQR\n        upper_bound = Q3 + 1.5 * IQR\n        df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])\n        df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])\n    return df\n'

In [32]:
#Colunas categ√≥ricas
from sklearn.preprocessing import OneHotEncoder

def processar_colunas_categoricas(df):
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    df_encoded = pd.DataFrame(encoder.fit_transform(df[categorial_cols]))
    df_encoded.columns = encoder.get_feature_names_out(categorial_cols)
    df = df.drop(categorial_cols, axis=1)
    df = pd.concat([df, df_encoded], axis=1)
    return df

train_encoded = processar_colunas_categoricas(train)
test_encoded = processar_colunas_categoricas(test)


In [33]:
# Normaliza√ß√£o das colunas num√©ricas - VERS√ÉO MELHORADA
from sklearn.preprocessing import StandardScaler

def normalizar_colunas_numericas(df, excluir_colunas=None):
    """Normaliza todas as colunas num√©ricas exceto bin√°rias e de ID"""
    df_normalized = df.copy()
    
    if excluir_colunas is None:
        excluir_colunas = []
    
    # Identificar colunas para normalizar
    colunas_para_normalizar = []
    
    for col in df_normalized.columns:
        # Pular se estiver na lista de exclus√£o
        if col in excluir_colunas:
            continue
            
        # Pular colunas de ID (que contenham 'id' no nome)
        if 'id' in col.lower():
            print(f"‚ö†Ô∏è Pulando coluna de ID: {col}")
            continue
            
        # Verificar se √© coluna num√©rica
        if df_normalized[col].dtype in ['int64', 'float64', 'int32', 'float32']:
            # Verificar se √© coluna bin√°ria (s√≥ tem valores 0 e 1)
            valores_unicos = df_normalized[col].dropna().unique()
            
            # Se s√≥ tem 2 valores √∫nicos e s√£o 0 e 1, √© bin√°ria
            if len(valores_unicos) == 2 and set(valores_unicos) == {0, 1}:
                print(f"‚ö†Ô∏è Pulando coluna bin√°ria: {col}")
                continue
            
            # Se s√≥ tem 2 valores √∫nicos mas n√£o s√£o 0 e 1, ainda pode ser categ√≥rica
            elif len(valores_unicos) == 2:
                print(f"‚ö†Ô∏è Pulando coluna com 2 valores √∫nicos: {col} (valores: {valores_unicos})")
                continue
                
            # Se tem mais varia√ß√£o, normalizar
            else:
                colunas_para_normalizar.append(col)
    
    # Aplicar normaliza√ß√£o
    if colunas_para_normalizar:
        scaler = StandardScaler()
        df_normalized[colunas_para_normalizar] = scaler.fit_transform(df_normalized[colunas_para_normalizar])
        print(f"‚úÖ Normalizadas {len(colunas_para_normalizar)} colunas:")
        for col in colunas_para_normalizar:
            print(f"   - {col}")
    else:
        print("‚ö†Ô∏è Nenhuma coluna encontrada para normalizar")
    
    return df_normalized

# Aplicar normaliza√ß√£o
# Para train: excluir a coluna target 'labels'
train_features = train_encoded.drop('labels', axis=1, errors='ignore')
train_normalized = normalizar_colunas_numericas(train_features, excluir_colunas=['labels'])

# Para test: sem exclus√µes espec√≠ficas (n√£o tem labels)
test_normalized = normalizar_colunas_numericas(test_encoded)

# Adicionar de volta a coluna 'labels' ao train
if 'labels' in train_encoded.columns:
    train_normalized['labels'] = train_encoded['labels']

print(f"\nüìä Shapes ap√≥s normaliza√ß√£o:")
print(f"   Train: {train_normalized.shape}")
print(f"   Test: {test_normalized.shape}")

# Verificar tipos de dados finais
print(f"\nüìã Tipos de dados ap√≥s normaliza√ß√£o:")
print("Train:")
print(train_normalized.dtypes.value_counts())
print("\nTest:")
print(test_normalized.dtypes.value_counts())

‚ö†Ô∏è Pulando coluna de ID: id
‚ö†Ô∏è Pulando coluna bin√°ria: is_CA
‚ö†Ô∏è Pulando coluna bin√°ria: is_NY
‚ö†Ô∏è Pulando coluna bin√°ria: is_MA
‚ö†Ô∏è Pulando coluna bin√°ria: is_TX
‚ö†Ô∏è Pulando coluna bin√°ria: is_otherstate
‚ö†Ô∏è Pulando coluna bin√°ria: is_software
‚ö†Ô∏è Pulando coluna bin√°ria: is_web
‚ö†Ô∏è Pulando coluna bin√°ria: is_mobile
‚ö†Ô∏è Pulando coluna bin√°ria: is_enterprise
‚ö†Ô∏è Pulando coluna bin√°ria: is_advertising
‚ö†Ô∏è Pulando coluna de ID: is_gamesvideo
‚ö†Ô∏è Pulando coluna bin√°ria: is_ecommerce
‚ö†Ô∏è Pulando coluna bin√°ria: is_biotech
‚ö†Ô∏è Pulando coluna bin√°ria: is_consulting
‚ö†Ô∏è Pulando coluna bin√°ria: is_othercategory
‚ö†Ô∏è Pulando coluna bin√°ria: has_VC
‚ö†Ô∏è Pulando coluna bin√°ria: has_angel
‚ö†Ô∏è Pulando coluna bin√°ria: has_roundA
‚ö†Ô∏è Pulando coluna bin√°ria: has_roundB
‚ö†Ô∏è Pulando coluna bin√°ria: has_roundC
‚ö†Ô∏è Pulando coluna bin√°ria: has_roundD
‚ö†Ô∏è Pulando coluna bin√°ria: category_code_advertising
‚ö†Ô∏è Pulando 

In [34]:
# MODELAGEM - M√öLTIPLOS ALGORITMOS
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

target_col = 'labels'
train_final = train_normalized.copy()
test_final = test_normalized.copy()

X_train = train_final.drop(target_col, axis=1)
y_train = train_final[target_col]
X_test = test_final.copy()

# 1. Dividir dados para valida√ß√£o
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

modelos = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

resultados = {}

for nome, modelo in modelos.items():
    print(f"\n=== Treinando modelo: {nome} ===")

    cv_scores = cross_val_score(modelo, X_train_split, y_train_split, cv=5, scoring='roc_auc')
    print(f"Cross-validation ROC AUC scores: {cv_scores}")

    #Treinar modelo
    modelo.fit(X_train_split, y_train_split)

    #Prever no conjunto de valida√ß√£o
    y_pred = modelo.predict(X_val)

    #M√©tricas para avaliar modelo
    resultados[nome] = {
        'modelo': modelo,
        'report': classification_report(y_val, y_pred, output_dict=True),
        'confusion_matrix': confusion_matrix(y_val, y_pred),
        'roc_auc': roc_auc_score(y_val, modelo.predict_proba(X_val)[:, 1])
    }
    print("Classification Report:")
    print(classification_report(y_val, y_pred))
    print(f"ROC AUC: {resultados[nome]['roc_auc']}")


=== Treinando modelo: Random Forest ===
Cross-validation ROC AUC scores: [0.75820975 0.85619703 0.86179957 0.76158405 0.68561422]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.59      0.66        68
           1       0.80      0.89      0.84       126

    accuracy                           0.78       194
   macro avg       0.77      0.74      0.75       194
weighted avg       0.78      0.78      0.78       194

ROC AUC: 0.8265056022408963

=== Treinando modelo: Gradient Boosting ===
Cross-validation ROC AUC scores: [0.75158898 0.82944915 0.82273707 0.75538793 0.6799569 ]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.57      0.65        68
           1       0.80      0.90      0.84       126

    accuracy                           0.78       194
   macro avg       0.77      0.74      0.75       194
weighted avg       0.78      0.78      0.78       194

ROC AU

In [35]:
# OTIMIZA√á√ÉO DE HIPERPAR√ÇMETROS
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import make_scorer, roc_auc_score
import numpy as np

print("üöÄ OTIMIZA√á√ÉO DE HIPERPAR√ÇMETROS")
print("="*50)

# 1. TUNING GRADIENT BOOSTING (melhor modelo atual)
print("\nüîß Tuning Gradient Boosting...")

gb_params = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.05, 0.1, 0.15, 0.2],
    'max_depth': [3, 4, 5, 6, 7],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'subsample': [0.8, 0.9, 1.0],
    'max_features': ['sqrt', 'log2', None]
}

# RandomizedSearchCV √© mais r√°pido que GridSearchCV
gb_random = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_distributions=gb_params,
    n_iter=100,  # N√∫mero de combina√ß√µes a testar
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

gb_random.fit(X_train_split, y_train_split)

print(f"‚úÖ Melhores par√¢metros GB: {gb_random.best_params_}")
print(f"‚úÖ Melhor score CV: {gb_random.best_score_:.4f}")

# 2. TUNING RANDOM FOREST
print("\nüîß Tuning Random Forest...")

rf_params = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None, 0.8],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced']
}

rf_random = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=rf_params,
    n_iter=80,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

rf_random.fit(X_train_split, y_train_split)

print(f"‚úÖ Melhores par√¢metros RF: {rf_random.best_params_}")
print(f"‚úÖ Melhor score CV: {rf_random.best_score_:.4f}")

üöÄ OTIMIZA√á√ÉO DE HIPERPAR√ÇMETROS

üîß Tuning Gradient Boosting...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
‚úÖ Melhores par√¢metros GB: {'subsample': 0.9, 'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 4, 'learning_rate': 0.05}
‚úÖ Melhor score CV: 0.7936

üîß Tuning Random Forest...
Fitting 5 folds for each of 80 candidates, totalling 400 fits
‚úÖ Melhores par√¢metros RF: {'n_estimators': 300, 'min_samples_split': 20, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'class_weight': None, 'bootstrap': True}
‚úÖ Melhor score CV: 0.7960


In [36]:
# FEATURE ENGINEERING AVAN√áADO
print("\nüõ†Ô∏è FEATURE ENGINEERING AVAN√áADO")
print("="*40)

def criar_features_avancadas(df_train, df_test):
    """Cria features derivadas baseadas nas mais importantes"""
    
    # Copiar dataframes
    train_fe = df_train.copy()
    test_fe = df_test.copy()
    
    # 1. FEATURES DE INTERA√á√ÉO (baseadas na an√°lise de import√¢ncia)
    print("Criando features de intera√ß√£o...")
    
    # Relationships por funding round
    if 'relationships' in train_fe.columns and 'funding_rounds' in train_fe.columns:
        train_fe['relationships_per_round'] = train_fe['relationships'] / (train_fe['funding_rounds'] + 1)
        test_fe['relationships_per_round'] = test_fe['relationships'] / (test_fe['funding_rounds'] + 1)
    
    # Funding total por round
    if 'funding_total_usd' in train_fe.columns and 'funding_rounds' in train_fe.columns:
        train_fe['funding_per_round'] = train_fe['funding_total_usd'] / (train_fe['funding_rounds'] + 1)
        test_fe['funding_per_round'] = test_fe['funding_total_usd'] / (test_fe['funding_rounds'] + 1)
    
    # Milestones por relationship
    if 'milestones' in train_fe.columns and 'relationships' in train_fe.columns:
        train_fe['milestones_per_relationship'] = train_fe['milestones'] / (train_fe['relationships'] + 1)
        test_fe['milestones_per_relationship'] = test_fe['milestones'] / (test_fe['relationships'] + 1)
    
    # 2. FEATURES TEMPORAIS
    print("Criando features temporais...")
    
    # Dura√ß√£o entre primeiro e √∫ltimo funding
    age_cols = [col for col in train_fe.columns if 'age_' in col and 'funding' in col]
    if len(age_cols) >= 2:
        first_funding = [col for col in age_cols if 'first' in col][0]
        last_funding = [col for col in age_cols if 'last' in col][0]
        
        train_fe['funding_duration'] = train_fe[last_funding] - train_fe[first_funding]
        test_fe['funding_duration'] = test_fe[last_funding] - test_fe[first_funding]
    
    # Dura√ß√£o entre primeiro e √∫ltimo milestone
    milestone_cols = [col for col in train_fe.columns if 'age_' in col and 'milestone' in col]
    if len(milestone_cols) >= 2:
        first_milestone = [col for col in milestone_cols if 'first' in col][0]
        last_milestone = [col for col in milestone_cols if 'last' in col][0]
        
        train_fe['milestone_duration'] = train_fe[last_milestone] - train_fe[first_milestone]
        test_fe['milestone_duration'] = test_fe[last_milestone] - test_fe[first_milestone]
    
    # 3. FEATURES DE LOCALIZA√á√ÉO AGREGADAS
    print("Criando features de localiza√ß√£o...")
    
    location_cols = [col for col in train_fe.columns if col.startswith('is_') and any(state in col for state in ['CA', 'NY', 'MA', 'TX'])]
    if location_cols:
        train_fe['is_major_state'] = train_fe[location_cols].sum(axis=1).clip(0, 1)
        test_fe['is_major_state'] = test_fe[location_cols].sum(axis=1).clip(0, 1)
    
    # 4. FEATURES DE CATEGORIA AGREGADAS
    print("Criando features de categoria...")
    
    category_cols = [col for col in train_fe.columns if col.startswith('category_code_')]
    if category_cols:
        train_fe['total_categories'] = train_fe[category_cols].sum(axis=1)
        test_fe['total_categories'] = test_fe[category_cols].sum(axis=1)
    
    # Features de tipo de funding
    funding_type_cols = [col for col in train_fe.columns if col.startswith('has_')]
    if funding_type_cols:
        train_fe['total_funding_types'] = train_fe[funding_type_cols].sum(axis=1)
        test_fe['total_funding_types'] = test_fe[funding_type_cols].sum(axis=1)
    
    # 5. FEATURES DE BINNING
    print("Criando features de binning...")
    
    # Binning para funding_total_usd
    if 'funding_total_usd' in train_fe.columns:
        # Definir bins baseados em quantis
        bins = [-np.inf, train_fe['funding_total_usd'].quantile(0.25), 
                train_fe['funding_total_usd'].quantile(0.5),
                train_fe['funding_total_usd'].quantile(0.75), np.inf]
        
        train_fe['funding_bin'] = pd.cut(train_fe['funding_total_usd'], 
                                       bins=bins, labels=['Low', 'Med_Low', 'Med_High', 'High'])
        test_fe['funding_bin'] = pd.cut(test_fe['funding_total_usd'], 
                                      bins=bins, labels=['Low', 'Med_Low', 'Med_High', 'High'])
        
        # One-hot encoding dos bins
        funding_dummies_train = pd.get_dummies(train_fe['funding_bin'], prefix='funding')
        funding_dummies_test = pd.get_dummies(test_fe['funding_bin'], prefix='funding')
        
        train_fe = pd.concat([train_fe.drop('funding_bin', axis=1), funding_dummies_train], axis=1)
        test_fe = pd.concat([test_fe.drop('funding_bin', axis=1), funding_dummies_test], axis=1)
    
    # 6. POLYNOMIAL FEATURES para vari√°veis mais importantes
    print("Criando polynomial features...")
    
    important_numeric_cols = ['relationships', 'funding_total_usd', 'milestones']
    existing_cols = [col for col in important_numeric_cols if col in train_fe.columns]
    
    for col in existing_cols:
        # Quadrado
        train_fe[f'{col}_squared'] = train_fe[col] ** 2
        test_fe[f'{col}_squared'] = test_fe[col] ** 2
        
        # Log (se valores positivos)
        if (train_fe[col] > 0).all():
            train_fe[f'{col}_log'] = np.log1p(train_fe[col])  # log1p para evitar log(0)
            test_fe[f'{col}_log'] = np.log1p(test_fe[col])
    
    print(f"‚úÖ Feature engineering conclu√≠do!")
    print(f"   Features antes: {df_train.shape[1]}")
    print(f"   Features depois: {train_fe.shape[1]}")
    print(f"   Novas features: {train_fe.shape[1] - df_train.shape[1]}")
    
    return train_fe, test_fe

# Aplicar feature engineering
X_train_fe, X_test_fe = criar_features_avancadas(X_train, X_test)

# Garantir mesmas colunas
common_cols = list(set(X_train_fe.columns) & set(X_test_fe.columns))
X_train_fe = X_train_fe[common_cols]
X_test_fe = X_test_fe[common_cols]

print(f"\nFeatures finais: {X_train_fe.shape[1]}")


üõ†Ô∏è FEATURE ENGINEERING AVAN√áADO
Criando features de intera√ß√£o...
Criando features temporais...
Criando features de localiza√ß√£o...
Criando features de categoria...


KeyError: "['category_code_sports', 'category_code_transportation'] not in index"

In [None]:
# ENSEMBLE METHODS AVAN√áADOS
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

print("\nü§ù ENSEMBLE METHODS AVAN√áADOS")
print("="*40)

# Dividir dados com feature engineering para valida√ß√£o
X_train_fe_split, X_val_fe, y_train_fe_split, y_val_fe = train_test_split(
    X_train_fe, y_train, test_size=0.3, random_state=42, stratify=y_train
)

print(f"üìä Divis√£o dos dados:")
print(f"   Train FE: {X_train_fe_split.shape}")
print(f"   Validation FE: {X_val_fe.shape}")

# 1. VOTING CLASSIFIER (Soft Voting)
print("\n1Ô∏è‚É£ Criando Voting Classifier...")

# Usar modelos otimizados se dispon√≠veis, sen√£o usar padr√µes
try:
    gb_model = gb_random.best_estimator_
    rf_model = rf_random.best_estimator_
    print("   ‚úÖ Usando modelos otimizados")
except:
    gb_model = GradientBoostingClassifier(random_state=42)
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    print("   ‚ö†Ô∏è Usando modelos padr√µes (otimiza√ß√£o n√£o dispon√≠vel)")

voting_estimators = [
    ('gb', gb_model),
    ('rf', rf_model),
    ('lr', LogisticRegression(random_state=42, max_iter=1000, C=0.1)),
    ('svm', SVC(probability=True, random_state=42, C=0.1)),
    ('knn', KNeighborsClassifier(n_neighbors=7))
]

voting_clf = VotingClassifier(
    estimators=voting_estimators,
    voting='soft'  # Usa probabilidades
)

print(f"   üìù Estimadores no ensemble: {len(voting_estimators)}")

# 2. STACKING CLASSIFIER
print("\n2Ô∏è‚É£ Criando Stacking Classifier...")

base_estimators = [
    ('gb', gb_model),
    ('rf', rf_model),
    ('lr', LogisticRegression(random_state=42, max_iter=1000)),
    ('svm', SVC(probability=True, random_state=42, C=0.1))
]

stacking_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(random_state=42),
    cv=5,
    stack_method='predict_proba'
)

print(f"   üìù Base estimadores: {len(base_estimators)}")
print(f"   üéØ Meta-estimador: Logistic Regression")

# 3. TREINAR E AVALIAR ENSEMBLE METHODS
print("\n3Ô∏è‚É£ Treinando e avaliando modelos...")

ensembles = {
    'Voting Classifier': voting_clf,
    'Stacking Classifier': stacking_clf,
    'GB Otimizado': gb_model,
    'RF Otimizado': rf_model,
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

ensemble_results = {}

for name, model in ensembles.items():
    print(f"\nüîß Treinando {name}...")
    
    try:
        # Cross-validation
        cv_scores = cross_val_score(model, X_train_fe_split, y_train_fe_split, 
                                   cv=3, scoring='roc_auc', n_jobs=-1)  # Reduzido para 3 CV
        
        # Treinar modelo
        model.fit(X_train_fe_split, y_train_fe_split)
        
        # Predi√ß√µes
        y_pred = model.predict(X_val_fe)
        y_pred_proba = model.predict_proba(X_val_fe)[:, 1]
        
        # M√©tricas
        auc = roc_auc_score(y_val_fe, y_pred_proba)
        accuracy = accuracy_score(y_val_fe, y_pred)
        f1 = f1_score(y_val_fe, y_pred)
        
        ensemble_results[name] = {
            'model': model,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'val_auc': auc,
            'val_accuracy': accuracy,
            'val_f1': f1
        }
        
        print(f"   üìä CV AUC: {cv_scores.mean():.4f} ¬± {cv_scores.std():.4f}")
        print(f"   üéØ Val AUC: {auc:.4f}")
        print(f"   üìà Val Accuracy: {accuracy:.4f}")
        print(f"   ‚öñÔ∏è Val F1: {f1:.4f}")
        
    except Exception as e:
        print(f"   ‚ùå Erro treinando {name}: {str(e)}")
        continue

# Compara√ß√£o final
print(f"\nüìä COMPARA√á√ÉO FINAL DE MODELOS")
print("="*70)
print(f"{'Modelo':<20} {'AUC':<8} {'Accuracy':<10} {'F1':<8} {'CV AUC':<10}")
print("="*70)

for name, results in ensemble_results.items():
    print(f"{name:<20} {results['val_auc']:.4f}   {results['val_accuracy']:.4f}     {results['val_f1']:.4f}   {results['cv_mean']:.4f}")

# Melhor modelo
if ensemble_results:
    best_model_name = max(ensemble_results.keys(), 
                         key=lambda x: ensemble_results[x]['val_auc'])
    best_model = ensemble_results[best_model_name]['model']
    best_auc = ensemble_results[best_model_name]['val_auc']

    print(f"\nüèÜ MELHOR MODELO: {best_model_name}")
    print(f"   üéØ AUC: {best_auc:.4f}")
    print(f"   üìà Accuracy: {ensemble_results[best_model_name]['val_accuracy']:.4f}")
    print(f"   ‚öñÔ∏è F1-Score: {ensemble_results[best_model_name]['val_f1']:.4f}")
else:
    print("\n‚ùå Nenhum modelo foi treinado com sucesso")
    best_model_name = "Gradient Boosting"
    best_model = gb_model
    best_auc = 0.0

In [None]:
# FEATURE SELECTION E AN√ÅLISE DE IMPORT√ÇNCIA
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.inspection import permutation_importance

print("\nüîç FEATURE SELECTION E AN√ÅLISE")
print("="*40)

# 1. AN√ÅLISE DE IMPORT√ÇNCIA
print("\n1Ô∏è‚É£ Analisando import√¢ncia das features...")

# Usar o melhor modelo para an√°lise de import√¢ncia
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
    feature_names = X_train_fe.columns
    
    # DataFrame com import√¢ncias
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)
    
    print("üéØ TOP 15 FEATURES MAIS IMPORTANTES:")
    print("="*60)
    for i, (_, row) in enumerate(importance_df.head(15).iterrows(), 1):
        feature_type = ""
        if any(suffix in row['feature'] for suffix in ['_squared', '_log']):
            feature_type = " üî¢[POLY]"
        elif any(suffix in row['feature'] for suffix in ['_per_', '_duration', 'total_']):
            feature_type = " üîó[INTER]"
        elif 'funding_' in row['feature'] and any(bin_name in row['feature'] for bin_name in ['Low', 'Med', 'High']):
            feature_type = " üìä[BIN]"
        else:
            feature_type = " üìà[ORIG]"
        
        print(f"{i:2d}. {row['feature']:<35}: {row['importance']:.6f}{feature_type}")
    
    # An√°lise por tipo de feature
    print(f"\nüìä AN√ÅLISE POR TIPO DE FEATURE:")
    original_features = importance_df[~importance_df['feature'].str.contains('_squared|_log|_per_|_duration|total_|funding_Low|funding_Med|funding_High')]
    poly_features = importance_df[importance_df['feature'].str.contains('_squared|_log')]
    interaction_features = importance_df[importance_df['feature'].str.contains('_per_|_duration|total_')]
    binning_features = importance_df[importance_df['feature'].str.contains('funding_Low|funding_Med|funding_High')]
    
    original_importance = original_features['importance'].sum()
    poly_importance = poly_features['importance'].sum()
    interaction_importance = interaction_features['importance'].sum()
    binning_importance = binning_features['importance'].sum()
    
    print(f"   üìà Features Originais: {original_importance:.1%} ({len(original_features)} features)")
    print(f"   üî¢ Features Polinomiais: {poly_importance:.1%} ({len(poly_features)} features)")
    print(f"   üîó Features de Intera√ß√£o: {interaction_importance:.1%} ({len(interaction_features)} features)")
    print(f"   üìä Features de Binning: {binning_importance:.1%} ({len(binning_features)} features)")
    
    # 2. FEATURE SELECTION BASEADA EM IMPORT√ÇNCIA
    print(f"\n2Ô∏è‚É£ Selecionando features mais importantes...")
    
    # Selecionar features que representam 95% da import√¢ncia cumulativa
    importance_df_sorted = importance_df.sort_values('importance', ascending=False)
    cumulative_importance = importance_df_sorted['importance'].cumsum()
    n_features_95 = (cumulative_importance <= 0.95).sum()
    n_features_selected = max(20, min(n_features_95, len(feature_names) // 2))
    
    top_features = importance_df_sorted.head(n_features_selected)['feature'].tolist()
    
    X_train_selected = X_train_fe[top_features]
    X_test_selected = X_test_fe[top_features]
    
    print(f"   üìù Selecionadas {len(top_features)} features de {len(feature_names)}")
    print(f"   üìä Representam {cumulative_importance.iloc[n_features_selected-1]:.1%} da import√¢ncia total")
    
    # 3. RETREINAR MODELO COM FEATURES SELECIONADAS
    print(f"\n3Ô∏è‚É£ Retreinando modelo com features selecionadas...")
    
    X_train_sel_split, X_val_sel, y_train_sel_split, y_val_sel = train_test_split(
        X_train_selected, y_train, test_size=0.3, random_state=42, stratify=y_train
    )
    
    # Criar nova inst√¢ncia do melhor modelo
    if best_model_name == 'Voting Classifier':
        selected_model = VotingClassifier(
            estimators=voting_estimators,
            voting='soft'
        )
    elif best_model_name == 'Stacking Classifier':
        selected_model = StackingClassifier(
            estimators=base_estimators,
            final_estimator=LogisticRegression(random_state=42),
            cv=3,
            stack_method='predict_proba'
        )
    else:
        # Para modelos individuais, criar nova inst√¢ncia
        if hasattr(best_model, 'get_params'):
            params = best_model.get_params()
            selected_model = type(best_model)(**params)
        else:
            selected_model = best_model
    
    # Treinar com features selecionadas
    selected_model.fit(X_train_sel_split, y_train_sel_split)
    
    # Avaliar
    y_pred_sel_proba = selected_model.predict_proba(X_val_sel)[:, 1]
    y_pred_sel = selected_model.predict(X_val_sel)
    
    auc_selected = roc_auc_score(y_val_sel, y_pred_sel_proba)
    accuracy_selected = accuracy_score(y_val_sel, y_pred_sel)
    f1_selected = f1_score(y_val_sel, y_pred_sel)
    
    print(f"   üìä Compara√ß√£o de Performance:")
    print(f"   AUC com todas as features: {best_auc:.4f}")
    print(f"   AUC com features selecionadas: {auc_selected:.4f}")
    print(f"   Diferen√ßa AUC: {auc_selected - best_auc:+.4f}")
    
    # Escolher a melhor vers√£o
    if auc_selected > best_auc:
        print("   ‚úÖ Feature selection melhorou o modelo!")
        final_model = selected_model
        final_X_train = X_train_selected
        final_X_test = X_test_selected
        final_auc = auc_selected
        final_accuracy = accuracy_selected
        final_f1 = f1_selected
        features_used = len(top_features)
    else:
        print("   ‚úÖ Modelo original √© melhor, mantendo todas as features")
        final_model = best_model
        final_X_train = X_train_fe
        final_X_test = X_test_fe
        final_auc = best_auc
        final_accuracy = ensemble_results[best_model_name]['val_accuracy']
        final_f1 = ensemble_results[best_model_name]['val_f1']
        features_used = len(X_train_fe.columns)

else:
    print("   ‚ö†Ô∏è Modelo n√£o suporta feature_importances_")
    
    # Usar SelectKBest como alternativa
    print("   üîÑ Usando SelectKBest como alternativa...")
    
    k_features = min(50, X_train_fe.shape[1] // 2)
    selector = SelectKBest(score_func=f_classif, k=k_features)
    
    X_train_selected = selector.fit_transform(X_train_fe, y_train)
    X_test_selected = selector.transform(X_test_fe)
    
    # Converter de volta para DataFrame
    selected_feature_names = X_train_fe.columns[selector.get_support()]
    X_train_selected = pd.DataFrame(X_train_selected, columns=selected_feature_names, index=X_train_fe.index)
    X_test_selected = pd.DataFrame(X_test_selected, columns=selected_feature_names, index=X_test_fe.index)
    
    final_model = best_model
    final_X_train = X_train_selected
    final_X_test = X_test_selected
    final_auc = best_auc
    features_used = k_features
    
    print(f"   üìù Selecionadas {k_features} features usando SelectKBest")

print(f"\n‚úÖ FEATURE SELECTION CONCLU√çDA!")
print(f"   üèÜ Modelo final: {best_model_name}")
print(f"   üìä Features utilizadas: {features_used}")
print(f"   üéØ AUC final: {final_auc:.4f}")

# 4. AN√ÅLISE DAS FEATURES MAIS IMPACTANTES
if hasattr(final_model, 'feature_importances_') and len(final_X_train.columns) <= 30:
    print(f"\n4Ô∏è‚É£ TOP 10 FEATURES DO MODELO FINAL:")
    print("="*50)
    
    final_importances = pd.DataFrame({
        'feature': final_X_train.columns,
        'importance': final_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    for i, (_, row) in enumerate(final_importances.head(10).iterrows(), 1):
        print(f"{i:2d}. {row['feature']:<30}: {row['importance']:.6f}")

In [None]:
# SUBMISS√ÉO FINAL OTIMIZADA
print(f"\nüìù GERANDO SUBMISS√ÉO FINAL OTIMIZADA")
print("="*60)

print(f"üèÜ Modelo final escolhido: {best_model_name}")
print(f"üìä AUC de valida√ß√£o: {final_auc:.4f}")
print(f"üìà Accuracy de valida√ß√£o: {final_accuracy:.4f}")
print(f"‚öñÔ∏è F1-Score de valida√ß√£o: {final_f1:.4f}")
print(f"üî¢ Features utilizadas: {features_used}")

# Treinar modelo final em TODOS os dados
print(f"\nüîß Treinando modelo final em todos os dados de treino...")
print(f"   Shape dos dados finais:")
print(f"   X_train: {final_X_train.shape}")
print(f"   y_train: {y_train.shape}")
print(f"   X_test: {final_X_test.shape}")

try:
    final_model.fit(final_X_train, y_train)
    print("‚úÖ Modelo final treinado com sucesso!")
    
    # Fazer predi√ß√µes no test set
    print("\nüéØ Gerando predi√ß√µes finais...")
    y_pred_final_proba = final_model.predict_proba(final_X_test)[:, 1]
    
    # Usar threshold otimizado (0.5 padr√£o, mas pode ser ajustado)
    threshold = 0.5
    y_pred_final = (y_pred_final_proba > threshold).astype(int)
    
    print(f"   üìä Probabilidades geradas: {len(y_pred_final_proba)}")
    print(f"   üéØ Threshold usado: {threshold}")
    print(f"   üìà Predi√ß√µes bin√°rias: {len(y_pred_final)}")
    
    # Verificar distribui√ß√£o das predi√ß√µes
    success_rate_pred = y_pred_final.mean()
    success_rate_train = y_train.mean()
    
    print(f"\nüìä An√°lise das predi√ß√µes:")
    print(f"   Taxa de sucesso no treino: {success_rate_train:.1%}")
    print(f"   Taxa de sucesso predita: {success_rate_pred:.1%}")
    print(f"   Diferen√ßa: {success_rate_pred - success_rate_train:+.1%}")
    
    # Estat√≠sticas das probabilidades
    print(f"\nüìà Estat√≠sticas das probabilidades:")
    print(f"   M√©dia: {y_pred_final_proba.mean():.4f}")
    print(f"   Mediana: {np.median(y_pred_final_proba):.4f}")
    print(f"   M√≠n: {y_pred_final_proba.min():.4f}")
    print(f"   M√°x: {y_pred_final_proba.max():.4f}")
    print(f"   Std: {y_pred_final_proba.std():.4f}")
    
    # Criar submission
    print(f"\nüìÅ Criando arquivo de submiss√£o...")
    
    # Verificar se existe coluna ID
    if 'id' in test.columns:
        id_col = test['id']
        print("   ‚úÖ Usando coluna 'id' do dataset original")
    else:
        id_col = range(len(test))
        print("   ‚ö†Ô∏è Criando IDs sequenciais")
    
    submission_optimized = pd.DataFrame({
        'id': id_col,
        'labels': y_pred_final
    })
    
    # Nome do arquivo com informa√ß√µes do modelo
    model_short_name = best_model_name.lower().replace(' ', '_').replace('classifier', 'clf')
    filename = f'submission_optimized_{model_short_name}_auc_{final_auc:.4f}.csv'
    
    # Salvar arquivo
    submission_optimized.to_csv(filename, index=False)
    print(f"‚úÖ Submiss√£o salva em: {filename}")
    
    # Informa√ß√µes do arquivo
    print(f"\nüìã INFORMA√á√ïES DA SUBMISS√ÉO:")
    print(f"   üìÑ Arquivo: {filename}")
    print(f"   üìä Linhas: {len(submission_optimized)}")
    print(f"   üìã Colunas: {list(submission_optimized.columns)}")
    print(f"   üéØ Valores √∫nicos em 'labels': {sorted(submission_optimized['labels'].unique())}")
    
    print("\nüìà Distribui√ß√£o das predi√ß√µes:")
    value_counts = submission_optimized['labels'].value_counts().sort_index()
    for label, count in value_counts.items():
        percentage = count / len(submission_optimized) * 100
        label_name = "Fracasso" if label == 0 else "Sucesso"
        print(f"   {label} ({label_name}): {count} ({percentage:.1f}%)")
    
    # Mostrar primeiras linhas
    print(f"\nüìù Primeiras 10 linhas da submiss√£o:")
    print(submission_optimized.head(10))
    
    print(f"\nüéâ SUBMISS√ÉO OTIMIZADA CONCLU√çDA!")
    print("="*60)
    print(f"üèÜ Modelo: {best_model_name}")
    print(f"üìä AUC esperado: {final_auc:.4f}")
    print(f"üìà Accuracy esperada: {final_accuracy:.4f}")
    print(f"‚öñÔ∏è F1-Score esperado: {final_f1:.4f}")
    print(f"üî¢ Features: {features_used}")
    print(f"üìÅ Arquivo: {filename}")
    print(f"üöÄ Melhoria estimada: +{final_auc - 0.830:.3f} AUC vs modelo base")
    print(f"üí° Pronto para upload no Kaggle!")
    
    # Comparar com resultados anteriores se dispon√≠vel
    if 'resultados' in globals():
        print(f"\nüìä COMPARA√á√ÉO COM MODELOS ANTERIORES:")
        print("-"*50)
        for nome, resultado in resultados.items():
            old_auc = resultado['roc_auc']
            improvement = final_auc - old_auc
            print(f"   {nome}: {old_auc:.4f} ‚Üí {final_auc:.4f} ({improvement:+.4f})")
    
except Exception as e:
    print(f"‚ùå Erro ao treinar modelo final: {str(e)}")
    print("üîÑ Tentando com modelo de backup...")
    
    # Modelo de backup simples
    backup_model = GradientBoostingClassifier(random_state=42)
    backup_model.fit(X_train_fe, y_train)
    y_pred_backup = backup_model.predict_proba(X_test_fe)[:, 1]
    
    submission_backup = pd.DataFrame({
        'id': test['id'] if 'id' in test.columns else range(len(test)),
        'labels': (y_pred_backup > 0.5).astype(int)
    })
    
    backup_filename = 'submission_backup_gb.csv'
    submission_backup.to_csv(backup_filename, index=False)
    print(f"‚úÖ Submiss√£o de backup salva em: {backup_filename}")