In [None]:
# Mount Google Drive (Colab only); the data and submission paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Weights & Biases client, used for the hyperparameter sweep further down.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install wandb



In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, classification_report
import wandb
import random
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Fix the seed of every random number generator used below (PyTorch, NumPy, Python).
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Pick the compute device once: GPU when one is visible, CPU otherwise.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)  # seed all visible GPUs as well
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# 1. Data Loading & Augmentation

In [None]:
# Load data
# Both CSVs are read from the mounted Google Drive folder.
train = pd.read_csv('/content/drive/MyDrive/BDML-2024/P-Set2/data/imputed_no_missings/train_ready.csv')
test = pd.read_csv('/content/drive/MyDrive/BDML-2024/P-Set2/data/imputed_no_missings/test_ready.csv')

# Display (rows, columns) of each frame.
train.shape, test.shape

((164960, 49), (66168, 42))

In [None]:
# List the available columns; the feature lists in the next cell are chosen from these.
train.columns

Index(['id', 'Clase', 'Dominio', 'P5000', 'P5010', 'P5090', 'Nper', 'Npersug',
       'Ingtotug', 'Ingtotugarr', 'Ingpcug', 'Li', 'Lp', 'Pobre', 'Indigente',
       'Npobres', 'Nindigentes', 'Fex_c', 'Depto', 'Fex_dpto',
       'mean_age_household', 'has_social_program', 'educ_attainment',
       'P6240_someone_works', 'P6240_unemployment_rate',
       'P6240_main_household_activity', 'P6240_activity_diversity',
       'has_food_subsidy', 'has_transport_subsidy', 'has_family_subsidy',
       'has_school_subsidy', 'total_subsidies', 'has_pension_contributor',
       'has_pensioner', 'prop_pension_contributors', 'prop_pensioners',
       'pension_status', 'household_size', 'dependency_ratio',
       'is_female_headed', 'has_health_insurance', 'main_insurance_type',
       'insurance_coverage_rate', 'num_individuals', 'num_under_18',
       'num_over_65', 'has_over_65', 'has_university_education',
       'receives_food_payment'],
      dtype='object')

In [None]:
# Columns that create_preprocessor() imputes with a constant and one-hot encodes.
categorical_features = [
    'Dominio',
    'P6240_someone_works',
    'pension_status',
    'is_female_headed',
    'has_health_insurance',
    'Clase',
    'has_social_program',
    'educ_attainment',
    'P6240_main_household_activity',
    'has_food_subsidy',
    'has_transport_subsidy',
    'has_family_subsidy',
    'has_school_subsidy',
    'has_pension_contributor',
    'has_pensioner',
    'main_insurance_type',
    'has_university_education',
    'receives_food_payment',
]

# Columns that create_preprocessor() median-imputes and standardises.
# NOTE(review): 'Depto', 'P5090' and 'has_over_65' look like codes/flags rather than
# quantities -- confirm that treating them as numeric is intended.
numeric_features = [
    'P5000',
    'P5010',
    'P5090',
    'Nper',
    'Npersug',
    'Li',
    'Lp',
    'Fex_c',
    'Depto',
    'Fex_dpto',
    'mean_age_household',
    'P6240_unemployment_rate',
    'P6240_activity_diversity',
    'total_subsidies',
    'prop_pension_contributors',
    'prop_pensioners',
    'household_size',
    'dependency_ratio',
    'insurance_coverage_rate',
    'num_individuals',
    'num_under_18',
    'num_over_65',
    'has_over_65',
]

In [None]:
def create_preprocessor(numeric_features, categorical_features):
    """Build the (unfitted) column-wise preprocessing transformer.

    Args:
        numeric_features: column names to median-impute and standardise.
        categorical_features: column names to impute with the constant
            'missing' and one-hot encode (unseen categories are ignored
            at transform time).

    Returns:
        A ColumnTransformer with a 'num' and a 'cat' branch. Columns that
        are in neither list are not passed to any branch.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), numeric_features),
            ('cat', Pipeline(steps=categorical_steps), categorical_features),
        ])

In [None]:
from scipy.sparse import issparse, csr_matrix

class SparseDataset(torch.utils.data.Dataset):
    """Row-wise dataset over a dense array or a SciPy sparse matrix.

    Sparse rows are densified one at a time in ``__getitem__`` so the whole
    matrix never has to be materialised as a dense tensor.

    Args:
        features: 2-D NumPy array or SciPy sparse matrix, one sample per row.
        labels: optional sequence of integer class labels (list, array or
            pandas Series), one per row. When omitted, items are features only.
    """

    def __init__(self, features, labels=None):
        # CSR supports row indexing; some other sparse formats (e.g. COO) do not.
        self.features = features.tocsr() if issparse(features) else features
        # Store labels as a plain array so lookups are positional. Indexing a
        # pandas Series with an int is a *label* lookup and fails (or returns the
        # wrong row) when the index is not 0..n-1, e.g. after train_test_split.
        self.labels = None if labels is None else np.asarray(labels)

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        row = self.features[idx]
        if issparse(row):
            # ravel() keeps a length-1 feature axis; squeeze() would collapse a
            # single-column row to a 0-d tensor.
            row = row.toarray().ravel()
        x = torch.tensor(np.asarray(row), dtype=torch.float32)
        if self.labels is not None:
            y = torch.tensor(int(self.labels[idx]), dtype=torch.long)
            return x, y
        return x

# 2. Dataset & Model Definition

In [None]:
class HouseholdDataset(Dataset):
    """In-memory dataset holding dense features (and optionally labels) as tensors.

    Args:
        features: array-like of floats, one sample per row; converted to a
            float32 tensor up front.
        labels: optional array-like of integer class labels; converted to an
            int64 tensor. When omitted, items are features only.
    """

    def __init__(self, features, labels=None):
        self.features = torch.FloatTensor(features)
        self.labels = torch.LongTensor(labels) if labels is not None else None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        # Unlabeled datasets (inference) yield the feature row alone.
        if self.labels is None:
            return self.features[idx]
        return self.features[idx], self.labels[idx]

class PovertyPredictor(nn.Module):
    """Three-hidden-layer MLP that outputs two class logits.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout; the final
    Linear layer maps ``hidden_size3`` to 2 outputs.

    Args:
        input_size: number of input features.
        hidden_size1, hidden_size2, hidden_size3: widths of the hidden layers.
        dropout: dropout probability applied after every hidden activation.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, dropout):
        super().__init__()
        widths = [input_size, hidden_size1, hidden_size2, hidden_size3]

        stack = []
        # Walk consecutive width pairs: (input, h1), (h1, h2), (h2, h3).
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.BatchNorm1d(fan_out))
            stack.append(nn.ReLU())
            stack.append(nn.Dropout(dropout))
        stack.append(nn.Linear(hidden_size3, 2))

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return raw logits of shape (batch, 2) for a (batch, input_size) input."""
        return self.layers(x)

# 3. Training & Eval Functions

In [None]:
def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=100, patience=10):
    """Train ``model`` with early stopping on validation F1.

    Args:
        model: network to optimise (already on ``device``).
        train_loader: iterable of (features, labels) training batches.
        val_loader: iterable of (features, labels) batches scored with
            ``evaluate_model`` after every epoch.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimiser over ``model``'s parameters.
        device: device the batches are moved to.
        num_epochs: maximum number of epochs.
        patience: stop after this many consecutive epochs without a new best F1.

    Returns:
        ``(model, best_val_f1)`` where ``model`` carries the weights of the
        best-scoring epoch. If validation F1 never rises above 0 the model is
        returned with its last-epoch weights.
    """
    import copy  # local import keeps this cell self-contained

    best_state = None
    best_val_f1 = 0
    epochs_no_improve = 0

    for epoch in range(num_epochs):
        model.train()
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Validation
        val_f1 = evaluate_model(model, val_loader, device)
        print(f"Epoch {epoch + 1}/{num_epochs}, Validation F1: {val_f1:.4f}")

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            epochs_no_improve = 0
            # state_dict() returns references to the live parameter tensors, so a
            # snapshot must be deep-copied or later epochs silently overwrite it.
            best_state = copy.deepcopy(model.state_dict())
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping!")
                break

    # Restore the best epoch; nothing to restore if F1 never improved.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_val_f1

def make_predictions(model, X, device):
    """Predict class indices for already-preprocessed features.

    NOTE: later cells in this notebook redefine ``make_predictions`` with
    different signatures; whichever cell ran last wins.

    Args:
        model: trained network; switched to eval mode here.
        X: preprocessed feature matrix accepted by ``SparseDataset``.
        device: device the batches are moved to.

    Returns:
        NumPy array with the arg-max class of every row, in row order.
    """
    model.eval()
    loader = DataLoader(SparseDataset(X), batch_size=32)

    collected = []
    with torch.no_grad():
        for batch in loader:
            logits = model(batch.to(device))
            predicted = torch.max(logits, 1)[1]
            collected.extend(predicted.cpu().numpy())

    return np.array(collected)

In [None]:
def evaluate_model(model, data_loader, device):
    """Return the F1 score of ``model`` over every batch in ``data_loader``.

    The model is put in eval mode and gradients are disabled. Predictions are
    the arg-max over the logits; ``f1_score`` is called with its defaults.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch_x, batch_y in data_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_x)
            batch_pred = torch.max(logits, 1)[1]
            predicted.extend(batch_pred.cpu().numpy())
            expected.extend(batch_y.cpu().numpy())

    return f1_score(expected, predicted)

# 4. Base Model Training
(Untuned)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

def train_base_model(features, labels, model_params, device):
    """Fit the preprocessing pipeline and one network on an 80/20 stratified split.

    Steps: stratified train/validation split, fit the preprocessor on the
    training part only, oversample the training part with SMOTE, train with
    early stopping, then print a classification report for the validation part.

    Args:
        features: DataFrame containing (at least) the columns named in the
            module-level ``numeric_features`` and ``categorical_features``.
        labels: pandas Series of binary targets aligned with ``features``.
        model_params: dict with keys 'batch_size', 'hidden_size1',
            'hidden_size2', 'hidden_size3', 'dropout', 'learning_rate',
            'weight_decay', 'num_epochs' and 'patience'.
        device: torch device used for training and evaluation.

    Returns:
        ``(trained_model, fitted_preprocessor, best_val_f1)``.
    """
    print(f"Using {len(numeric_features)} numeric features: {numeric_features}")
    print(f"Using {len(categorical_features)} categorical features: {categorical_features}")

    preprocessor = create_preprocessor(numeric_features, categorical_features)

    # Split data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=42)

    # Preprocess features (fit on the training part only to avoid leakage)
    X_train_processed = preprocessor.fit_transform(X_train)
    X_val_processed = preprocessor.transform(X_val)

    print("Original training set shape:", Counter(y_train))

    # Apply SMOTE to the training set only; validation keeps its real class balance.
    smote = SMOTE(random_state=42)
    if issparse(X_train_processed):
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed.toarray(), y_train)
        X_train_resampled = csr_matrix(X_train_resampled)
    else:
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

    print("Resampled training set shape:", Counter(y_train_resampled))

    # Create datasets and dataloaders. Labels are passed as plain arrays so the
    # dataset indexes them by position, never by pandas index label.
    train_dataset = SparseDataset(X_train_resampled, np.asarray(y_train_resampled))
    val_dataset = SparseDataset(X_val_processed, y_val.values)

    train_loader = DataLoader(train_dataset, batch_size=model_params['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=model_params['batch_size'])

    # Initialize model, loss, and optimizer
    input_size = X_train_resampled.shape[1]
    model = PovertyPredictor(
        input_size=input_size,
        hidden_size1=model_params['hidden_size1'],
        hidden_size2=model_params['hidden_size2'],
        hidden_size3=model_params['hidden_size3'],
        dropout=model_params['dropout']
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=model_params['learning_rate'], weight_decay=model_params['weight_decay'])

    # Train and evaluate
    trained_model, best_val_f1 = train_model(model, train_loader, val_loader, criterion, optimizer, device,
                                             num_epochs=model_params['num_epochs'], patience=model_params['patience'])

    print(f"Best Validation F1: {best_val_f1:.4f}")

    # Detailed evaluation on validation set
    final_val_f1 = evaluate_model(trained_model, val_loader, device)
    print(f"\nFinal Validation F1 Score: {final_val_f1:.4f}")

    # Predict straight from val_loader (unshuffled, so rows line up with y_val).
    # This avoids make_predictions, which is redefined several times in this
    # notebook with incompatible signatures.
    trained_model.eval()
    val_preds = []
    with torch.no_grad():
        for batch_features, _ in val_loader:
            logits = trained_model(batch_features.to(device))
            val_preds.extend(torch.max(logits, 1)[1].cpu().numpy())
    y_pred = np.array(val_preds)

    print("\nClassification Report:")
    print(classification_report(y_val, y_pred, target_names=['Not Poor', 'Poor']))

    return trained_model, preprocessor, best_val_f1

# Modify the make_predictions function to work with sparse data
def make_predictions(model, X, preprocessor, device):
    """Preprocess raw features and predict a class index for every row.

    Args:
        model: trained network; switched to eval mode here.
        X: raw feature frame accepted by ``preprocessor.transform``.
        preprocessor: fitted transformer returned by training.
        device: device the batches are moved to.

    Returns:
        NumPy array with the arg-max class of every row, in row order.
    """
    model.eval()

    # Apply the already-fitted preprocessing, then stream rows in batches of 32.
    loader = DataLoader(SparseDataset(preprocessor.transform(X)), batch_size=32)

    collected = []
    with torch.no_grad():
        for batch in loader:
            logits = model(batch.to(device))
            predicted = torch.max(logits, 1)[1]
            collected.extend(predicted.cpu().numpy())

    return np.array(collected)

def format_predictions(test_df, predictions):
    """Pair each test ``id`` with its prediction.

    Args:
        test_df: DataFrame with an 'id' column.
        predictions: one predicted label per row of ``test_df``.

    Returns:
        New DataFrame with exactly two columns: 'id' and 'pobre'.
    """
    columns = {'id': test_df['id'], 'pobre': predictions}
    return pd.DataFrame(columns)

In [None]:
# Hand-picked (untuned) hyperparameters for the baseline network.
model_params = dict(
    batch_size=32,
    hidden_size1=128,
    hidden_size2=64,
    hidden_size3=32,
    dropout=0.3,
    learning_rate=1e-4,
    weight_decay=1e-5,
    num_epochs=100,
    patience=10,
)

# Everything except the target and the row identifier goes in as candidate features.
labels = train['Pobre']
features = train.drop(columns=['Pobre', 'id'])
best_model, best_preprocessor, best_f1 = train_base_model(features, labels, model_params, device)
print(f"Best Overall Validation F1 Score: {best_f1:.4f}")

Using 23 numeric features: ['P5000', 'P5010', 'P5090', 'Nper', 'Npersug', 'Li', 'Lp', 'Fex_c', 'Depto', 'Fex_dpto', 'mean_age_household', 'P6240_unemployment_rate', 'P6240_activity_diversity', 'total_subsidies', 'prop_pension_contributors', 'prop_pensioners', 'household_size', 'dependency_ratio', 'insurance_coverage_rate', 'num_individuals', 'num_under_18', 'num_over_65', 'has_over_65']
Using 18 categorical features: ['Dominio', 'P6240_someone_works', 'pension_status', 'is_female_headed', 'has_health_insurance', 'Clase', 'has_social_program', 'educ_attainment', 'P6240_main_household_activity', 'has_food_subsidy', 'has_transport_subsidy', 'has_family_subsidy', 'has_school_subsidy', 'has_pension_contributor', 'has_pensioner', 'main_insurance_type', 'has_university_education', 'receives_food_payment']
Original training set shape: Counter({0: 105549, 1: 26419})
Resampled training set shape: Counter({1: 105549, 0: 105549})
Epoch 1/100, Validation F1: 0.6290
Epoch 2/100, Validation F1: 0.633

# 6. Model Tuning – Weights & Biases

In [None]:
def wandb_tuning(features, labels, sweep_config, project_name, count=75):
    """Run a Weights & Biases hyperparameter sweep over ``PovertyPredictor``.

    The train/validation split, preprocessing and SMOTE oversampling are all
    seeded with 42 and do not depend on the swept hyperparameters, so they are
    computed once here and shared by every run instead of being redone per run.

    Args:
        features: DataFrame containing (at least) the columns named in the
            module-level ``numeric_features`` and ``categorical_features``.
        labels: pandas Series of binary targets aligned with ``features``.
        sweep_config: W&B sweep configuration dict. Each run's config must
            provide batch_size, hidden_size1..3, dropout, learning_rate,
            weight_decay, num_epochs and patience.
        project_name: W&B project the sweep is created in.
        count: number of runs the agent executes (default 75).
    """
    print(f"Using {len(numeric_features)} numeric features: {numeric_features}")
    print(f"Using {len(categorical_features)} categorical features: {categorical_features}")

    preprocessor = create_preprocessor(numeric_features, categorical_features)

    # Split data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=42)

    # Preprocess features (fit on the training part only)
    X_train_processed = preprocessor.fit_transform(X_train)
    X_val_processed = preprocessor.transform(X_val)

    print("Original training set shape:", Counter(y_train))

    # Apply SMOTE to the training set
    smote = SMOTE(random_state=42)
    if issparse(X_train_processed):
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed.toarray(), y_train)
        X_train_resampled = csr_matrix(X_train_resampled)
    else:
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

    print("Resampled training set shape:", Counter(y_train_resampled))

    # Labels are passed as plain arrays so the dataset indexes them by position.
    train_dataset = SparseDataset(X_train_resampled, np.asarray(y_train_resampled))
    val_dataset = SparseDataset(X_val_processed, y_val.values)
    input_size = X_train_resampled.shape[1]

    # Name of the metric the sweep optimises; it must be logged by every run.
    sweep_metric = sweep_config.get('metric', {}).get('name')

    def train():
        """Train and score one configuration chosen by the sweep agent."""
        with wandb.init():
            config = wandb.config

            train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=config.batch_size)

            # Initialize model, loss, and optimizer
            model = PovertyPredictor(
                input_size=input_size,
                hidden_size1=config.hidden_size1,
                hidden_size2=config.hidden_size2,
                hidden_size3=config.hidden_size3,
                dropout=config.dropout
            ).to(device)

            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)

            # Train and evaluate
            trained_model, best_val_f1 = train_model(model, train_loader, val_loader, criterion, optimizer, device,
                                                     num_epochs=config.num_epochs, patience=config.patience)

            print(f"Best Validation F1: {best_val_f1:.4f}")

            # Detailed evaluation on validation set
            final_val_f1 = evaluate_model(trained_model, val_loader, device)
            print(f"\nFinal Validation F1 Score: {final_val_f1:.4f}")

            # Predict batch by batch from val_loader (unshuffled, so rows line up
            # with y_val). Feeding the sparse validation matrix to a helper that
            # builds one dense tensor raised "sparse array length is ambiguous".
            trained_model.eval()
            val_preds = []
            with torch.no_grad():
                for batch_features, _ in val_loader:
                    logits = trained_model(batch_features.to(device))
                    val_preds.extend(torch.max(logits, 1)[1].cpu().numpy())
            y_pred = np.array(val_preds)

            print("\nClassification Report:")
            print(classification_report(y_val, y_pred, target_names=['Not Poor', 'Poor']))

            # Log metrics
            metrics = {
                'best_val_f1': best_val_f1,
                'final_val_f1': final_val_f1
            }
            # Also report the score under the sweep's own metric name so the
            # search strategy has something to optimise.
            if sweep_metric and sweep_metric not in metrics:
                metrics[sweep_metric] = best_val_f1
            wandb.log(metrics)

            return best_val_f1

    sweep_id = wandb.sweep(sweep_config, project=project_name)
    wandb.agent(sweep_id, function=train, count=count)

# Helper function for making predictions
def make_predictions(model, features, device, *legacy_args, batch_size=1024):
    """Predict the arg-max class for every row of ``features``.

    Accepts dense array-likes and SciPy sparse matrices; sparse input is
    densified one batch at a time rather than all at once.

    This definition shadows the earlier ``make_predictions`` cells, so it also
    accepts their 4-argument form
    ``make_predictions(model, X, preprocessor, device)``: the third positional
    argument is then treated as a fitted preprocessor and applied to ``X``.

    Args:
        model: trained network; switched to eval mode here.
        features: preprocessed feature matrix (or raw features in the
            4-argument form).
        device: device the batches are moved to.
        *legacy_args: at most one extra positional argument, the device of the
            4-argument form.
        batch_size: number of rows per forward pass.

    Returns:
        1-D NumPy integer array of predicted class indices, in row order.

    Raises:
        TypeError: if more than four positional arguments are given.
    """
    if len(legacy_args) > 1:
        raise TypeError("make_predictions() takes at most 4 positional arguments")
    if legacy_args:
        # 4-argument form: (model, X, preprocessor, device).
        preprocessor, device = device, legacy_args[0]
        features = preprocessor.transform(features)

    if issparse(features):
        features = features.tocsr()  # row slicing needs CSR
    else:
        features = np.asarray(features, dtype=np.float32)

    model.eval()
    batch_preds = []
    with torch.no_grad():
        for start in range(0, features.shape[0], batch_size):
            chunk = features[start:start + batch_size]
            if issparse(chunk):
                chunk = chunk.toarray()
            batch = torch.tensor(chunk, dtype=torch.float32).to(device)
            outputs = model(batch)
            _, preds = torch.max(outputs, 1)
            batch_preds.append(preds.cpu())

    if not batch_preds:
        return np.empty(0, dtype=np.int64)
    return torch.cat(batch_preds).numpy()

In [None]:
sweep_config = {
    'method': 'bayes',
    # The Bayesian search optimises this key, so it has to match a key that each
    # run passes to wandb.log(); wandb_tuning logs 'best_val_f1' and 'final_val_f1'.
    'metric': {'name': 'best_val_f1', 'goal': 'maximize'},
    'parameters': {
        'learning_rate': {'min': 3e-4, 'max': 1e-2},
        'batch_size': {'values': [32, 64]},
        'hidden_size1': {'values': [128, 256]},
        'hidden_size2': {'values': [32, 64, 128]},
        'hidden_size3': {'values': [16, 32, 64]},
        'dropout': {'min': 0.1, 'max': 0.5},
        'weight_decay': {'min': 7e-6, 'max': 1e-4},
        # Fixed (not swept) training-loop settings.
        'num_epochs': {'value': 100},
        'patience': {'value': 10}
    }
}

features = train.drop(['Pobre', 'id'], axis=1)
labels = train['Pobre']
wandb_tuning(features, labels, sweep_config, "poverty-prediction-nn")

# xxxx

Create sweep with ID: 0xwq0uy6
Sweep URL: https://wandb.ai/edmundo-research/poverty-prediction-nn/sweeps/0xwq0uy6


[34m[1mwandb[0m: Agent Starting Run: l80llau6 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.35489683219666635
[34m[1mwandb[0m: 	hidden_size1: 128
[34m[1mwandb[0m: 	hidden_size2: 128
[34m[1mwandb[0m: 	hidden_size3: 16
[34m[1mwandb[0m: 	learning_rate: 0.002369832839948444
[34m[1mwandb[0m: 	num_epochs: 100
[34m[1mwandb[0m: 	patience: 10
[34m[1mwandb[0m: 	weight_decay: 3.286923723509476e-05


Using 23 numeric features: ['P5000', 'P5010', 'P5090', 'Nper', 'Npersug', 'Li', 'Lp', 'Fex_c', 'Depto', 'Fex_dpto', 'mean_age_household', 'P6240_unemployment_rate', 'P6240_activity_diversity', 'total_subsidies', 'prop_pension_contributors', 'prop_pensioners', 'household_size', 'dependency_ratio', 'insurance_coverage_rate', 'num_individuals', 'num_under_18', 'num_over_65', 'has_over_65']
Using 18 categorical features: ['Dominio', 'P6240_someone_works', 'pension_status', 'is_female_headed', 'has_health_insurance', 'Clase', 'has_social_program', 'educ_attainment', 'P6240_main_household_activity', 'has_food_subsidy', 'has_transport_subsidy', 'has_family_subsidy', 'has_school_subsidy', 'has_pension_contributor', 'has_pensioner', 'main_insurance_type', 'has_university_education', 'receives_food_payment']
Original training set shape: Counter({0: 105549, 1: 26419})
Resampled training set shape: Counter({1: 105549, 0: 105549})
Epoch 1/100, Validation F1: 0.6393
Epoch 2/100, Validation F1: 0.647

Traceback (most recent call last):
  File "<ipython-input-22-98f8402f65ca>", line 61, in train
    y_pred = make_predictions(trained_model, X_val_processed, device)
  File "<ipython-input-22-98f8402f65ca>", line 79, in make_predictions
    features_tensor = torch.FloatTensor(features).to(device)
  File "/usr/local/lib/python3.10/dist-packages/scipy/sparse/_base.py", line 404, in __len__
    raise TypeError("sparse array length is ambiguous; use getnnz()"
TypeError: sparse array length is ambiguous; use getnnz() or shape[0]


VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run l80llau6 errored:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
    self._function()
  File "<ipython-input-22-98f8402f65ca>", line 61, in train
    y_pred = make_predictions(trained_model, X_val_processed, device)
  File "<ipython-input-22-98f8402f65ca>", line 79, in make_predictions
    features_tensor = torch.FloatTensor(features).to(device)
  File "/usr/local/lib/python3.10/dist-packages/scipy/sparse/_base.py", line 404, in __len__
    raise TypeError("sparse array length is ambiguous; use getnnz()"
TypeError: sparse array length is ambiguous; use getnnz() or shape[0]

[34m[1mwandb[0m: [32m[41mERROR[0m Run l80llau6 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.10/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function

Using 23 numeric features: ['P5000', 'P5010', 'P5090', 'Nper', 'Npersug', 'Li', 'Lp', 'Fex_c', 'Depto', 'Fex_dpto', 'mean_age_household', 'P6240_unemployment_rate', 'P6240_activity_diversity', 'total_subsidies', 'prop_pension_contributors', 'prop_pensioners', 'household_size', 'dependency_ratio', 'insurance_coverage_rate', 'num_individuals', 'num_under_18', 'num_over_65', 'has_over_65']
Using 18 categorical features: ['Dominio', 'P6240_someone_works', 'pension_status', 'is_female_headed', 'has_health_insurance', 'Clase', 'has_social_program', 'educ_attainment', 'P6240_main_household_activity', 'has_food_subsidy', 'has_transport_subsidy', 'has_family_subsidy', 'has_school_subsidy', 'has_pension_contributor', 'has_pensioner', 'main_insurance_type', 'has_university_education', 'receives_food_payment']
Original training set shape: Counter({0: 105549, 1: 26419})
Resampled training set shape: Counter({1: 105549, 0: 105549})
Epoch 1/100, Validation F1: 0.6376
Epoch 2/100, Validation F1: 0.604

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


# 7. Final Training & Prediction

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def calculate_f1(model, data_loader, device):
    """Return the F1 score of ``model`` over ``data_loader``.

    Kept as an alias for existing callers: the original body was a verbatim
    copy of ``evaluate_model``, so it now delegates to it.
    """
    return evaluate_model(model, data_loader, device)

def train_final_model(features, labels, best_params):
    """Train a class-weighted network on the numeric columns of ``features``.

    Unlike ``train_base_model`` this path uses only int64/float64 columns,
    standardises them with a ``StandardScaler`` and handles class imbalance
    through loss weights rather than oversampling.

    NOTE(review): every numeric column of ``features`` is used as an input. If
    the caller passes the training frame minus only 'Pobre' and 'id', columns
    derived from income or poverty status would be included -- presumably these
    leak the target; confirm and drop them before calling.

    Args:
        features: DataFrame of candidate features.
        labels: pandas Series of binary targets aligned with ``features``.
        best_params: dict with keys 'batch_size', 'hidden_size1',
            'hidden_size2', 'hidden_size3', 'dropout', 'learning_rate',
            'weight_decay' and 'num_epochs'; optional 'patience' (default 10).

    Returns:
        ``(model, scaler)``: the network restored to its best validation-F1
        epoch and the fitted ``StandardScaler``.
    """
    import copy  # local import keeps this cell self-contained

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Select only numeric columns
    numeric_columns = features.select_dtypes(include=['int64', 'float64']).columns
    features_numeric = features[numeric_columns].values

    print(f"Selected {len(numeric_columns)} numeric features: {numeric_columns.tolist()}")

    # Split data into train and validation sets; stratified like the other
    # training paths so both parts keep the original class ratio.
    X_train, X_val, y_train, y_val = train_test_split(
        features_numeric, labels, test_size=0.2, stratify=labels, random_state=42)

    # Initialize the model with best parameters
    input_size = X_train.shape[1]
    model = PovertyPredictor(
        input_size=input_size,
        hidden_size1=best_params['hidden_size1'],
        hidden_size2=best_params['hidden_size2'],
        hidden_size3=best_params['hidden_size3'],
        dropout=best_params['dropout']
    ).to(device)

    # Prepare the data (scaler is fit on the training part only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    train_dataset = HouseholdDataset(X_train_scaled, y_train.values)
    val_dataset = HouseholdDataset(X_val_scaled, y_val.values)

    train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=best_params['batch_size'], shuffle=False)

    # Calculate class weights
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
    class_weights = torch.FloatTensor(class_weights).to(device)
    print(f"Class weights: {class_weights}")

    # Set up optimizer and criterion
    optimizer = optim.Adam(model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    # Early stopping parameters; honour the tuned patience when it is provided.
    patience = best_params.get('patience', 10)
    best_f1 = 0
    counter = 0
    best_model_state = None

    # Train the model
    for epoch in range(best_params['num_epochs']):
        model.train()
        for batch_features, batch_labels in train_loader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            optimizer.zero_grad()
            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

        # Calculate F1 score on validation set
        val_f1 = calculate_f1(model, val_loader, device)

        # Early stopping logic
        if val_f1 > best_f1:
            best_f1 = val_f1
            counter = 0
            # state_dict() returns references to the live parameter tensors, so a
            # snapshot must be deep-copied or later epochs silently overwrite it.
            best_model_state = copy.deepcopy(model.state_dict())
        else:
            counter += 1
            if counter >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

        # Print epoch F1 score
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{best_params['num_epochs']}], Validation F1: {val_f1:.4f}")

    # Load the best model state
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model, scaler

In [None]:
# Usage in the main script
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Hyperparameters of the best sweep run -- silvery-sweep-19 [best_model_n16922w7]
best_params = dict(
    batch_size=32,
    hidden_size1=256,
    hidden_size2=128,
    hidden_size3=32,
    dropout=0.15898988565127214,
    learning_rate=0.0006540550101305035,
    weight_decay=0.00001273061095884153,
    num_epochs=100,
    patience=10,
)

labels = train['Pobre']
features = train.drop(columns=['Pobre', 'id'])
# Same training routine as the baseline, now fed 'best_params' instead of 'model_params'.
best_model, best_preprocessor, best_f1 = train_base_model(features, labels, best_params, device)
print(f"Best Overall Validation F1 Score: {best_f1:.4f}")

Using 23 numeric features: ['P5000', 'P5010', 'P5090', 'Nper', 'Npersug', 'Li', 'Lp', 'Fex_c', 'Depto', 'Fex_dpto', 'mean_age_household', 'P6240_unemployment_rate', 'P6240_activity_diversity', 'total_subsidies', 'prop_pension_contributors', 'prop_pensioners', 'household_size', 'dependency_ratio', 'insurance_coverage_rate', 'num_individuals', 'num_under_18', 'num_over_65', 'has_over_65']
Using 18 categorical features: ['Dominio', 'P6240_someone_works', 'pension_status', 'is_female_headed', 'has_health_insurance', 'Clase', 'has_social_program', 'educ_attainment', 'P6240_main_household_activity', 'has_food_subsidy', 'has_transport_subsidy', 'has_family_subsidy', 'has_school_subsidy', 'has_pension_contributor', 'has_pensioner', 'main_insurance_type', 'has_university_education', 'receives_food_payment']
Original training set shape: Counter({0: 105549, 1: 26419})
Resampled training set shape: Counter({1: 105549, 0: 105549})
Epoch 1/100, Validation F1: 0.6369
Epoch 2/100, Validation F1: 0.645

In [None]:
# Predict on the test set: drop the identifier, then let the fitted preprocessor
# and the trained model produce one label per row.
test_features = test.drop(columns=['id'])
predictions = make_predictions(best_model, test_features, best_preprocessor, device)

# Pair every test id with its predicted label.
result_df = format_predictions(test, predictions)

In [None]:
# `submission` is another name for result_df (no copy is made).
submission = result_df
# Display (rows, columns) of the submission frame.
submission.shape

(66168, 2)

In [None]:
# store
# Write the submission to Drive without the DataFrame index column.
submission.to_csv('/content/drive/MyDrive/BDML-2024/P-Set2/submissions/nn_final.csv', index=False)

In [None]:
# Display (rows, columns) of the test frame.
test.shape

(66168, 30)

# 8. Submission

In [None]:
# Upload the stored CSV to the competition through the Kaggle CLI (-f file, -m submission message).
!kaggle competitions submit -c uniandes-bdml-2024-20-ps-2 -f /content/drive/MyDrive/BDML-2024/P-Set2/submissions/nn_final.csv -m "neural network tuned FINAL "

100% 1.70M/1.70M [00:02<00:00, 728kB/s]
400 - Bad Request - Submission not allowed:  Your team has used its daily Submission allowance (3) today, please try again tomorrow UTC (22 hours from now).


In [None]:
import os
from getpass import getpass

def set_kaggle_credentials():
    """Prompt for Kaggle credentials and export them as environment variables.

    The username is read with ``input`` and the API key with ``getpass`` (so it
    is not echoed); they are stored in KAGGLE_USERNAME and KAGGLE_KEY for the
    current process.
    """
    credentials = {
        'KAGGLE_USERNAME': input("Enter your Kaggle username: "),
        'KAGGLE_KEY': getpass("Enter your Kaggle API key: "),
    }
    os.environ.update(credentials)
    print("Kaggle credentials set as environment variables.")

In [None]:
# Create Submission File
# Folder on the mounted Drive that receives submission CSVs; created if it does not exist yet.
submission_dir = '/content/drive/MyDrive/BDML-2024/P-Set2/submissions/'
os.makedirs(submission_dir, exist_ok=True)

def create_submission(test_data, predictions, directory, filename='nn_final.csv'):
    """Write an ``id``/``pobre`` submission CSV and return its path.

    Args:
        test_data: DataFrame with an 'id' column.
        predictions: one predicted label per row of ``test_data`` (list, array
            or Series); matched to ids by position.
        directory: target folder; created if it does not exist.
        filename: name of the CSV inside ``directory``.

    Returns:
        Full path of the written file.

    Raises:
        ValueError: if ``predictions`` and ``test_data`` differ in length.
    """
    if len(predictions) != len(test_data):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(test_data)} test rows")

    # Do not rely on the caller having created the folder beforehand.
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, filename)

    # Plain arrays on both sides: pandas would otherwise align a Series of
    # predictions with the ids by index label and could introduce NaNs.
    submission = pd.DataFrame({
        'id': test_data['id'].to_numpy(),
        'pobre': np.asarray(predictions),
    })
    submission.to_csv(filepath, index=False)
    print(f"Submission saved to {filepath}")
    return filepath  # Return the full file path

In [None]:
# 3. Submit to Kaggle
def submit_to_kaggle(file_path, message):
    """Submit ``file_path`` to the competition through the Kaggle CLI.

    The CLI is invoked with an argument list instead of a shell string, so
    paths and messages containing spaces or quotes are passed through intact.
    The CLI's output is captured and printed so it shows up in the notebook.

    Args:
        file_path: path of the CSV to upload.
        message: submission description.

    Returns:
        True if the CLI exited with status 0, False otherwise (including when
        the ``kaggle`` executable cannot be found).
    """
    import subprocess  # local import keeps this cell self-contained

    competition = "uniandes-bdml-2024-20-ps-2"
    command = f"kaggle competitions submit -c {competition} -f {file_path} -m \"{message}\""
    print(f"Submitting to Kaggle with command: {command}")

    argv = ["kaggle", "competitions", "submit",
            "-c", competition, "-f", str(file_path), "-m", str(message)]
    try:
        completed = subprocess.run(argv, capture_output=True, text=True)
    except FileNotFoundError:
        print("Kaggle CLI not found on PATH; is the 'kaggle' package installed?")
        return False

    # Surface whatever the CLI reported (e.g. quota or authentication errors).
    for stream in (completed.stdout, completed.stderr):
        if stream and stream.strip():
            print(stream.strip())

    if completed.returncode != 0:
        print(f"Kaggle submission failed (exit code {completed.returncode}).")
        return False
    return True

In [None]:
# 4. Main Execution Flow
# Write the predictions computed above to the submissions folder.
submission_filename = 'nn_final.csv'
submission_file_path = create_submission(test, predictions, submission_dir, submission_filename)
print(f"Submission file path: {submission_file_path}")

# Set credentials and submit to Kaggle
# (credentials are prompted for interactively, never stored in the notebook)
set_kaggle_credentials()
submit_to_kaggle(submission_file_path, "neural network with tuned hyperparameters FINAL")

# NOTE: printed unconditionally -- it does not indicate that Kaggle accepted the submission.
print("Submission process completed!")

Submission saved to /content/drive/MyDrive/BDML-2024/P-Set2/submissions/nn_final.csv
Submission file path: /content/drive/MyDrive/BDML-2024/P-Set2/submissions/nn_final.csv
Enter your Kaggle username: edmundoariasdeabreu
Enter your Kaggle API key: ··········
Kaggle credentials set as environment variables.
Submitting to Kaggle with command: kaggle competitions submit -c uniandes-bdml-2024-20-ps-2 -f /content/drive/MyDrive/BDML-2024/P-Set2/submissions/nn_final.csv -m "neural network with tuned hyperparameters FINAL"
Submission process completed!
