# HW_5

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the datasets.
# Only the two reads sit inside `try`, so the FileNotFoundError handler cannot
# swallow an unrelated failure raised by the diagnostics in the `else` branch.
try:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
except FileNotFoundError:
    print("Error: train.csv or test.csv not found. Make sure the files are in the correct directory.")
    # Fallback: build random stand-in frames (9 feature columns plus a binary
    # 'target') so the rest of the cell can still execute.
    # NOTE(review): everything computed downstream from these frames, including
    # the answers file written later in this cell, is meaningless noise.
    train_df = pd.DataFrame(np.random.rand(100, 10), columns=[f'feature_{i}' for i in range(9)] + ['target'])
    train_df['target'] = np.random.randint(0, 2, 100)
    test_df = pd.DataFrame(np.random.rand(50, 9), columns=[f'feature_{i}' for i in range(9)])
    print("\n--- Created dummy dataframes for demonstration ---")
else:
    print("Data loaded successfully.")

    # Basic structure of both frames: dtypes/non-null counts, then a preview.
    print("\nTraining data info:")
    train_df.info()
    print("\nFirst 5 rows of training data:")
    print(train_df.head())

    print("\nTest data info:")
    test_df.info()
    print("\nFirst 5 rows of test data:")
    print(test_df.head())

    # Total number of missing cells per frame (summed over all columns).
    print("\nMissing values in training data:")
    print(train_df.isnull().sum().sum())
    print("\nMissing values in test data:")
    print(test_df.isnull().sum().sum())
    
# Prepare the data, evaluate a random forest with cross-validation, fit it on
# all training rows and write the test predictions to answers.csv.
# The training frame must carry a 'target' column; every other column is
# treated as a feature, and the test frame must expose the same columns in the
# same order.

if 'target' not in train_df.columns:
    print("\nError: 'target' column not found in train.csv.")
else:
    X_train_full = train_df.drop(columns='target')
    y_train_full = train_df['target']
    X_test = test_df

    if list(X_train_full.columns) != list(X_test.columns):
        # Report both column lists so the mismatch can be located by eye.
        print("\nError: Training and test feature columns do not match.")
        print(f"Training columns: {list(X_train_full.columns)}")
        print(f"Test columns: {list(X_test.columns)}")
    else:
        print("\nTrain and test columns match.")

        # --- Preprocessing ---
        # Standardise every feature (all are assumed numeric; categorical
        # columns would need encoding first). The scaler is fitted on the
        # complete training frame and then reused for the test frame.
        scaler = StandardScaler().fit(X_train_full)
        X_train_scaled = scaler.transform(X_train_full)
        X_test_scaled = scaler.transform(X_test)

        print(f"\nData scaled. Training features shape: {X_train_scaled.shape}, Test features shape: {X_test_scaled.shape}")

        # --- Model ---
        # Random forest with capped depth and a minimum leaf size to curb
        # overfitting; n_jobs=-1 uses every available core.
        model = RandomForestClassifier(
            n_estimators=150,
            max_depth=20,
            min_samples_leaf=3,
            n_jobs=-1,
            random_state=42,
        )

        # --- Cross-validation ---
        # 5-fold accuracy on the training data as a generalisation estimate.
        print("\nPerforming cross-validation...")
        cv_scores = cross_val_score(
            model, X_train_scaled, y_train_full,
            scoring='accuracy', cv=5, n_jobs=-1,
        )
        print(f"Cross-validation accuracy scores: {cv_scores}")
        mean_cv_accuracy = cv_scores.mean()
        print(f"Mean cross-validation accuracy: {mean_cv_accuracy:.4f}")

        # --- Final fit on every training row ---
        print("\nTraining the final model on the full training data...")
        model.fit(X_train_scaled, y_train_full)
        print("Model training complete.")

        # --- Prediction on the scaled test features ---
        print("\nMaking predictions on the test data...")
        predictions = model.predict(X_test_scaled)
        print("Predictions made.")
        print(f"Example predictions: {predictions[:10]}")

        # --- Output ---
        # One prediction per line, no header row and no index column.
        output_filename = 'answers.csv'
        output_df = pd.DataFrame(predictions)
        try:
            output_df.to_csv(output_filename, header=False, index=False)
            print(f"\nPredictions saved to {output_filename}")

            # Feedback relative to the 86.5% - 91.5% accuracy band.
            print(f"\nEstimated model accuracy (from CV): {mean_cv_accuracy:.4f}")
            if mean_cv_accuracy < 0.865:
                print("CV Accuracy is below the minimum threshold (86.5%). Consider model tuning or different models.")
            elif 0.865 <= mean_cv_accuracy <= 0.915:
                print("CV Accuracy is within the target range (86.5% - 91.5%). Good!")
            else:
                # The assignment text says the model's target accuracy "must be
                # below 91.5%", which is unusual since higher accuracy is
                # normally better. Both the literal reading and the likelier
                # intended one ("at least 91.5% for full score") are reported.
                print(f"CV Accuracy ({mean_cv_accuracy:.4f}) exceeds the specified upper limit of 91.5%.")
                print("Based on the literal requirement ('below 91.5%'), this might be too high.")
                print("However, it's more likely the goal is *at least* 91.5% for full score. Check task clarification if possible.")
                print("If the goal truly is <91.5%, you might need to simplify the model (e.g., reduce n_estimators/max_depth) or use more regularization.")
        except Exception as e:
            print(f"\nError saving predictions to {output_filename}: {e}")

Data loaded successfully.

Training data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1288 entries, 0 to 1287
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   area           1288 non-null   int64  
 1   perimeter      1288 non-null   float64
 2   major_axis     1288 non-null   float64
 3   minor_axis     1288 non-null   float64
 4   eccentricity   1288 non-null   float64
 5   eqdiasq        1288 non-null   float64
 6   solidity       1288 non-null   float64
 7   convex_area    1288 non-null   int64  
 8   extent         1288 non-null   float64
 9   aspect_ratio   1288 non-null   float64
 10  roundness      1288 non-null   float64
 11  compactness    1288 non-null   float64
 12  shapefactor_1  1288 non-null   float64
 13  shapefactor_2  1288 non-null   float64
 14  shapefactor_3  1288 non-null   float64
 15  shapefactor_4  1288 non-null   float64
 16  target         1288 non-null   int64  
dtypes: fl

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid search over random-forest hyper-parameters.
# Relies on X_train_scaled, y_train_full and X_test_scaled left in the session
# by the previous cell.
param_grid = dict(
    n_estimators=[100, 150, 200, 250],
    max_depth=[10, 20, 30, None],          # None: grow until leaves are pure
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 3, 5],
    max_features=['sqrt', 'log2', None],   # None: consider every feature
)

# Base estimator whose parameters the search overrides per candidate.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# 5-fold cross-validated search optimising accuracy; verbose=2 reports
# progress and n_jobs=-1 spreads the candidate fits over all cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Running every candidate is slow.
print("\nStarting GridSearchCV for RandomForestClassifier...")
grid_search.fit(X_train_scaled, y_train_full)

# Report the winning configuration and its mean cross-validated accuracy.
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# With the default refit=True the search has already retrained the best
# configuration on everything passed to .fit(), so no extra fit is needed.
best_model = grid_search.best_estimator_

# --- Prediction ---
print("\nMaking predictions with the tuned model...")
predictions = best_model.predict(X_test_scaled)

# --- Output Generation ---
# Written under a separate name; rename to answers.csv before submitting.
output_filename = 'answers_tuned.csv'
output_df = pd.DataFrame(predictions)
output_df.to_csv(output_filename, header=False, index=False)
print(f"\nTuned predictions saved to {output_filename}")


Starting GridSearchCV for RandomForestClassifier...
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best parameters found: {'max_depth': 20, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 250}
Best cross-validation accuracy: 0.8696

Making predictions with the tuned model...

Tuned predictions saved to answers_tuned.csv


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import xgboost as xgb # Import XGBoost
import numpy as np
import warnings 
warnings.filterwarnings('ignore') # Suppress warnings for cleaner output

# --- Load Data ---
# Reload both CSVs so this cell does not depend on frames left in the session
# by earlier cells.
try:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
except FileNotFoundError:
    print("Error: train.csv or test.csv not found.")
    # `exit()` is the interactive helper provided by `site`; it is not meant
    # for program code and is not guaranteed to halt a notebook cell, which
    # would let the code below run on stale or undefined frames. Raising
    # SystemExit stops execution here and reports a non-zero status.
    raise SystemExit(1)
else:
    print("Data loaded successfully.")

# --- Prepare Data, tune XGBoost, predict and save ---
# Requires a 'target' column in the training frame and every test column to
# be present in the training frame.
if 'target' not in train_df.columns or not all(col in train_df.columns for col in test_df.columns):
    print("\nError: 'target' column not found in train.csv or columns mismatch after reloading.")
else:
    X_train_full = train_df.drop(columns='target')
    y_train_full = train_df['target']
    X_test = test_df

    # Feature columns must agree in both name and order.
    if list(X_train_full.columns) != list(X_test.columns):
        print("\nError: Training and test feature columns do not match after reloading.")
    else:
        print("Train and test columns match.")

        # --- Preprocessing (Scaling) ---
        # Fit the scaler on the training features and reuse it for the test set.
        scaler = StandardScaler().fit(X_train_full)
        X_train_scaled = scaler.transform(X_train_full)
        X_test_scaled = scaler.transform(X_test)
        print(f"Data scaled. Training features shape: {X_train_scaled.shape}, Test features shape: {X_test_scaled.shape}")

        # --- XGBoost Model Tuning ---
        # Search space (3 values per parameter); widen it for a more thorough
        # search at the cost of runtime.
        param_grid_xgb = dict(
            n_estimators=[100, 200, 300],        # boosting rounds
            learning_rate=[0.01, 0.1, 0.2],      # step size shrinkage
            max_depth=[3, 5, 7],                 # maximum tree depth
            subsample=[0.7, 0.8, 1.0],           # row fraction per tree
            colsample_bytree=[0.7, 0.8, 1.0],    # feature fraction per tree
        )

        # Binary-logistic objective with log-loss as the training-time metric.
        # NOTE(review): `use_label_encoder` was added to silence a deprecation
        # warning; confirm it is still accepted by the installed XGBoost.
        xgb_model = xgb.XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            use_label_encoder=False,
            random_state=42,
        )

        # 5-fold cross-validated grid search optimising accuracy on all cores,
        # with progress output.
        grid_search_xgb = GridSearchCV(
            estimator=xgb_model,
            param_grid=param_grid_xgb,
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
            verbose=2,
        )

        # This may take a while.
        print("\nStarting GridSearchCV for XGBoostClassifier...")
        grid_search_xgb.fit(X_train_scaled, y_train_full)

        # Winning configuration and its mean cross-validated accuracy.
        print(f"Best parameters found for XGBoost: {grid_search_xgb.best_params_}")
        best_xgb_score = grid_search_xgb.best_score_
        print(f"Best cross-validation accuracy (XGBoost): {best_xgb_score:.4f}")

        best_xgb_model = grid_search_xgb.best_estimator_

        # --- Prediction ---
        # `.predict()` on the classifier is expected to return hard 0/1 labels.
        # Should probabilities be wanted instead, threshold them explicitly:
        # (best_xgb_model.predict_proba(X_test_scaled)[:, 1] >= 0.5).astype(int)
        print("\nMaking predictions with the tuned XGBoost model...")
        predictions_xgb = best_xgb_model.predict(X_test_scaled)

        # --- Output Generation ---
        # Single column, no header row and no index column.
        output_filename_xgb = 'answers_xgboost.csv'
        output_df_xgb = pd.DataFrame(predictions_xgb)
        try:
            output_df_xgb.to_csv(output_filename_xgb, header=False, index=False)
            print(f"\nXGBoost predictions saved to {output_filename_xgb}")

            # --- Feedback relative to the 86.5% - 91.5% band ---
            print(f"\nEstimated model accuracy (XGBoost CV): {best_xgb_score:.4f}")
            if best_xgb_score < 0.865:
                print("XGBoost CV Accuracy is below the minimum threshold (86.5%).")
            elif 0.865 <= best_xgb_score <= 0.915:
                print("XGBoost CV Accuracy is within the target range (86.5% - 91.5%). Good!")
            else:
                print(f"XGBoost CV Accuracy ({best_xgb_score:.4f}) meets or exceeds the 91.5% target. Excellent!")
                print("This model might achieve the full score.")
        except Exception as e:
            print(f"\nError saving XGBoost predictions to {output_filename_xgb}: {e}")



Data loaded successfully.
Train and test columns match.
Data scaled. Training features shape: (1288, 16), Test features shape: (430, 16)

Starting GridSearchCV for XGBoostClassifier...
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters found for XGBoost: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Best cross-validation accuracy (XGBoost): 0.8734

Making predictions with the tuned XGBoost model...

XGBoost predictions saved to answers_xgboost.csv

Estimated model accuracy (XGBoost CV): 0.8734
XGBoost CV Accuracy is within the target range (86.5% - 91.5%). Good!


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import warnings 
warnings.filterwarnings('ignore') # Suppress warnings for cleaner output

# --- Configuration ---
# Training hyper-parameters for the neural-network run.
BATCH_SIZE = 64
EPOCHS = 100           # fixed epoch budget; no early stopping in this cell
LEARNING_RATE = 0.001
TEST_SPLIT_SIZE = 0.2  # share of the training rows held out for validation
RANDOM_SEED = 42

# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

# --- Set Seed for Reproducibility ---
# Seed NumPy and PyTorch, plus every CUDA device when running on the GPU.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if DEVICE.type == "cuda":
    torch.cuda.manual_seed_all(RANDOM_SEED)

# --- Load Data ---
# Read the training and test CSVs from the working directory.
try:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
except FileNotFoundError:
    print("Error: train.csv or test.csv not found.")
    # `exit()` is the interactive helper provided by `site`; it is not meant
    # for program code and is not guaranteed to halt a notebook cell, which
    # would let the code below run on stale or undefined frames. Raising
    # SystemExit stops execution here and reports a non-zero status.
    raise SystemExit(1)
else:
    print("Data loaded successfully.")

# --- Prepare Data ---
# Proceed only when the training frame has a 'target' column and every test
# column is also present in the training frame.
if 'target' in train_df.columns and all(col in train_df.columns for col in test_df.columns):
    # Features are every column except 'target'; the label is 'target' itself.
    X = train_df.drop('target', axis=1)
    y = train_df['target']
    # The test frame is used as-is (it carries no label column to drop).
    X_test_final = test_df

    # Require identical feature names in identical order before scaling.
    if list(X.columns) == list(X_test_final.columns):
        print("Train and test columns match.")
        
        # --- Train/Validation Split ---
        # Split BEFORE scaling so the validation rows never influence the
        # scaler's mean/variance; fitting the scaler on all rows first would
        # leak validation statistics into preprocessing and make the
        # validation accuracy optimistic. Same seed and stratification as
        # before, so the row partition itself is unchanged.
        X_train_raw, X_val_raw, y_train, y_val = train_test_split(
            X, y.values, test_size=TEST_SPLIT_SIZE, random_state=RANDOM_SEED, stratify=y
        )

        # --- Preprocessing (Scaling) ---
        # Fit on the training rows only, then apply the same transform to the
        # validation rows and the final test set.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train_raw)
        X_val = scaler.transform(X_val_raw)
        X_test_scaled = scaler.transform(X_test_final)
        # Full scaled training matrix, kept under its previous name for any
        # code that still refers to it.
        X_scaled = scaler.transform(X)

        print(f"Data scaled and split.")
        print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test_scaled.shape}")

        # --- PyTorch Dataset ---
        class DroneDataset(Dataset):
            """Tensor-backed dataset of feature rows with optional labels.

            Features are stored as a float32 tensor. When labels are given
            they are stored as a float32 tensor with a trailing dimension of
            size 1 (the layout BCELoss is fed in this notebook); otherwise
            ``labels`` is None and items are bare feature rows.
            """

            def __init__(self, features, labels=None):
                self.features = torch.tensor(features, dtype=torch.float32)
                if labels is None:
                    # Unlabelled data, e.g. the final test set.
                    self.labels = None
                else:
                    label_tensor = torch.tensor(labels, dtype=torch.float32)
                    # Add the trailing dimension: [n_samples] -> [n_samples, 1].
                    self.labels = label_tensor.unsqueeze(1)

            def __len__(self):
                """Return the number of stored feature rows."""
                return len(self.features)

            def __getitem__(self, idx):
                """Return ``(features, label)`` when labelled, else ``features``."""
                sample = self.features[idx]
                if self.labels is None:
                    return sample
                return sample, self.labels[idx]

        # Build a dataset and a matching loader for each split.
        # Only the training loader shuffles; validation and test keep their
        # row order, and the final test set is built without labels.
        train_dataset = DroneDataset(X_train, y_train)
        train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)

        val_dataset = DroneDataset(X_val, y_val)
        val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)

        test_dataset = DroneDataset(X_test_scaled)
        test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

        # --- Neural Network Model ---
        class SimpleNN(nn.Module):
            """Fully connected binary classifier: input_dim -> 128 -> 256 -> 64 -> 1.

            Each hidden layer is followed by ReLU and dropout (0.3, 0.3, 0.2);
            the single output unit goes through a sigmoid, so ``forward``
            returns values in (0, 1) with shape ``[batch, 1]``.
            """

            def __init__(self, input_dim):
                super().__init__()
                # Hidden block 1
                self.layer_1 = nn.Linear(input_dim, 128)
                self.relu_1 = nn.ReLU()
                self.dropout_1 = nn.Dropout(0.3)
                # Hidden block 2
                self.layer_2 = nn.Linear(128, 256)
                self.relu_2 = nn.ReLU()
                self.dropout_2 = nn.Dropout(0.3)
                # Hidden block 3 (attribute names skip "3" and use the suffix 4)
                self.layer_4 = nn.Linear(256, 64)
                self.relu_4 = nn.ReLU()
                self.dropout_4 = nn.Dropout(0.2)
                # Output: one unit squashed to a probability
                self.output_layer = nn.Linear(64, 1)
                self.sigmoid = nn.Sigmoid()

            def forward(self, x):
                """Run ``x`` through the three hidden blocks and the sigmoid head."""
                hidden = self.dropout_1(self.relu_1(self.layer_1(x)))
                hidden = self.dropout_2(self.relu_2(self.layer_2(hidden)))
                hidden = self.dropout_4(self.relu_4(self.layer_4(hidden)))
                return self.sigmoid(self.output_layer(hidden))

        # Instantiate the model; its input width is the number of feature columns.
        input_dimension = X_train.shape[1]
        model = SimpleNN(input_dimension).to(DEVICE)
        print("\nModel Architecture:")
        print(model)

        # --- Loss Function and Optimizer ---
        # The model already ends in a sigmoid, so plain BCELoss is applied to
        # its probability outputs.
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        # --- Training Loop ---
        # Track the best validation accuracy AND the weights that produced it,
        # so the predictions written later come from the same model whose
        # accuracy is reported (previously the last-epoch weights were used).
        best_val_accuracy = 0.0
        best_model_state = None
        print("\nStarting Training...")
        for epoch in range(EPOCHS):
            model.train()  # enable dropout
            train_loss = 0.0
            correct_train = 0
            total_train = 0

            for features, labels in train_loader:
                features, labels = features.to(DEVICE), labels.to(DEVICE)

                # Zero gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = model(features)
                loss = criterion(outputs, labels)

                # Backward pass and optimize
                loss.backward()
                optimizer.step()

                # Accumulate the summed loss (batch mean * batch size) and the
                # number of correct predictions at a 0.5 threshold.
                train_loss += loss.item() * features.size(0)
                predicted = (outputs > 0.5).float()
                total_train += labels.size(0)
                correct_train += (predicted == labels).sum().item()

            epoch_train_loss = train_loss / len(train_loader.dataset)
            epoch_train_acc = correct_train / total_train

            # --- Validation Step ---
            model.eval()  # disable dropout
            val_loss = 0.0
            correct_val = 0
            total_val = 0
            with torch.no_grad():  # no gradients needed for evaluation
                for features, labels in val_loader:
                    features, labels = features.to(DEVICE), labels.to(DEVICE)
                    outputs = model(features)
                    loss = criterion(outputs, labels)

                    val_loss += loss.item() * features.size(0)
                    predicted = (outputs > 0.5).float()
                    total_val += labels.size(0)
                    correct_val += (predicted == labels).sum().item()

            epoch_val_loss = val_loss / len(val_loader.dataset)
            epoch_val_acc = correct_val / total_val

            print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {epoch_train_loss:.4f} | Train Acc: {epoch_train_acc:.4f} | Val Loss: {epoch_val_loss:.4f} | Val Acc: {epoch_val_acc:.4f}")

            # Keep a snapshot whenever validation accuracy improves. The
            # tensors are cloned because state_dict() hands back references
            # that later optimizer steps would overwrite.
            if epoch_val_acc > best_val_accuracy:
                best_val_accuracy = epoch_val_acc
                best_model_state = {
                    name: tensor.detach().clone()
                    for name, tensor in model.state_dict().items()
                }

        print("\nTraining Finished.")
        print(f"Best Validation Accuracy during training: {best_val_accuracy:.4f}")

        # Restore the best-scoring weights before predicting; when no epoch
        # ever improved on 0.0 the last-epoch weights are left in place.
        if best_model_state is not None:
            model.load_state_dict(best_model_state)

        # --- Prediction on Final Test Set ---
        print("\nMaking predictions with the Neural Network model...")
        model.eval()  # make sure dropout is off
        all_predictions = []
        with torch.no_grad():
            for features in test_loader:
                features = features.to(DEVICE)
                outputs = model(features)
                predicted_probs = outputs.cpu().numpy()  # back to the CPU as a NumPy array
                predicted_classes = (predicted_probs > 0.5).astype(int)  # threshold at 0.5
                all_predictions.extend(predicted_classes.flatten())  # [batch, 1] -> flat ints
        
        # --- Output Generation ---
        # One predicted class per line, without a header row or index column.
        output_filename_nn = 'answers_nn.csv'
        output_df_nn = pd.DataFrame(all_predictions)
        try:
            output_df_nn.to_csv(output_filename_nn, header=False, index=False)
            print(f"\nNeural Network predictions saved to {output_filename_nn}")

            # --- Feedback based on the best validation accuracy ---
            # Compared against the 86.5% - 91.5% band.
            print(f"\nEstimated model accuracy (NN Best Validation): {best_val_accuracy:.4f}")
            if best_val_accuracy < 0.865:
                print("NN Validation Accuracy is below the minimum threshold (86.5%).")
            elif 0.865 <= best_val_accuracy <= 0.915:
                print("NN Validation Accuracy is within the target range (86.5% - 91.5%). Good!")
            else:
                print(f"NN Validation Accuracy ({best_val_accuracy:.4f}) meets or exceeds the 91.5% target. Potentially Excellent!")
                print("This model might achieve the full score. Submit answers_nn.csv to check.")
        except Exception as e:
            print(f"\nError saving Neural Network predictions to {output_filename_nn}: {e}")

    else:
        print("\nError: Training and test feature columns do not match.")
else:
    print("\nError: 'target' column not found in train.csv or columns mismatch.")


Using device: cuda
Data loaded successfully.
Train and test columns match.
Data scaled and split.
Train shape: (1030, 16), Validation shape: (258, 16), Test shape: (430, 16)

Model Architecture:
SimpleNN(
  (layer_1): Linear(in_features=16, out_features=128, bias=True)
  (relu_1): ReLU()
  (dropout_1): Dropout(p=0.3, inplace=False)
  (layer_2): Linear(in_features=128, out_features=256, bias=True)
  (relu_2): ReLU()
  (dropout_2): Dropout(p=0.3, inplace=False)
  (layer_4): Linear(in_features=256, out_features=64, bias=True)
  (relu_4): ReLU()
  (dropout_4): Dropout(p=0.2, inplace=False)
  (output_layer): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

Starting Training...
Epoch 1/100 | Train Loss: 0.5369 | Train Acc: 0.7971 | Val Loss: 0.3617 | Val Acc: 0.8411
Epoch 2/100 | Train Loss: 0.3512 | Train Acc: 0.8612 | Val Loss: 0.3379 | Val Acc: 0.8682
Epoch 3/100 | Train Loss: 0.3434 | Train Acc: 0.8573 | Val Loss: 0.3278 | Val Acc: 0.8682
Epoch 4/100 | Train Lo

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau # Import Learning Rate Scheduler
import warnings 
warnings.filterwarnings('ignore') # Suppress warnings for cleaner output
import copy # To save the best model state

# --- Configuration ---
# Hyper-parameters
BATCH_SIZE = 64
EPOCHS = 500  # Upper bound only; early stopping is expected to end training sooner
LEARNING_RATE = 1e-3
TEST_SPLIT_SIZE = 0.2  # Fraction of the training data held out for validation
RANDOM_SEED = 42
EARLY_STOPPING_PATIENCE = 40  # Epochs to wait for a validation-loss improvement
LR_SCHEDULER_PATIENCE = 30  # Epochs without improvement before the LR is reduced
LR_SCHEDULER_FACTOR = 0.5  # Multiplier applied to the LR on each reduction

# Runtime device: use the GPU when one is available, otherwise the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

# --- Set Seed for Reproducibility ---
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# --- Load Data ---
# Read the training and test tables from the current working directory.
try:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
except FileNotFoundError:
    print("Error: train.csv or test.csv not found.")
    # Abort with a non-zero status. The `exit()` helper is injected by the
    # `site` module for interactive use and reports success (status 0) when
    # called without arguments, which hides the failure from callers.
    raise SystemExit(1)
else:
    print("Data loaded successfully.")

# --- Prepare Data ---
# Proceed only if train.csv has a 'target' column and every test column also exists in train.csv.
if 'target' in train_df.columns and all(col in train_df.columns for col in test_df.columns):
    X = train_df.drop('target', axis=1)
    y = train_df['target']
    X_test_final = test_df

    # Feature columns must match in both name and order before they are scaled together.
    if list(X.columns) == list(X_test_final.columns):
        print("Train and test columns match.")

        # --- Train/Validation Split ---
        # Split the raw features BEFORE scaling so the scaler never sees the
        # validation rows; otherwise validation statistics leak into
        # preprocessing and the validation loss used for early stopping is
        # optimistically biased.
        X_train_raw, X_val_raw, y_train, y_val = train_test_split(
            X, y.values, test_size=TEST_SPLIT_SIZE, random_state=RANDOM_SEED, stratify=y
        )

        # --- Preprocessing (Scaling) ---
        # Fit on the training split only, then apply the same transform everywhere.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train_raw)
        X_val = scaler.transform(X_val_raw)
        X_test_scaled = scaler.transform(X_test_final)

        print(f"Data scaled and split.")
        print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test_scaled.shape}")

        # --- PyTorch Dataset ---
        class DroneDataset(Dataset):
            """Tensor-backed dataset over a feature array and optional labels.

            Features are stored as a float32 tensor. When labels are given they
            are stored as float32 with an extra trailing dimension (shape
            [n_samples, 1] for 1-D labels) so they line up with the model output
            expected by BCELoss. Without labels, items are bare feature tensors.
            """

            def __init__(self, features, labels=None):
                self.features = torch.tensor(features, dtype=torch.float32)
                if labels is None:
                    # Unlabelled data (e.g. the final test set)
                    self.labels = None
                else:
                    label_tensor = torch.tensor(labels, dtype=torch.float32)
                    self.labels = label_tensor.unsqueeze(1)

            def __len__(self):
                return len(self.features)

            def __getitem__(self, idx):
                sample = self.features[idx]
                if self.labels is None:
                    return sample
                return sample, self.labels[idx]

        # Wrap each split in a Dataset; the final test split carries no labels
        train_dataset, val_dataset = DroneDataset(X_train, y_train), DroneDataset(X_val, y_val)
        test_dataset = DroneDataset(X_test_scaled)

        # Build the DataLoaders: only the training loader shuffles, while the
        # validation and test loaders keep the dataset order
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        val_loader, test_loader = [
            DataLoader(split, batch_size=BATCH_SIZE, shuffle=False)
            for split in (val_dataset, test_dataset)
        ]

        # --- Neural Network Model ---
        class SimpleNN(nn.Module):
            """Fully connected binary classifier: input_dim -> 256 -> 128 -> 64 -> 1.

            Each hidden Linear layer is followed by ReLU and Dropout
            (p = 0.4, 0.4 and 0.3 respectively). The single output unit is passed
            through a sigmoid, so forward() returns values in [0, 1].
            """

            def __init__(self, input_dim):
                super().__init__()
                # Hidden block 1
                self.layer_1 = nn.Linear(input_dim, 256)
                self.relu_1 = nn.ReLU()
                self.dropout_1 = nn.Dropout(p=0.4)
                # Hidden block 2
                self.layer_2 = nn.Linear(256, 128)
                self.relu_2 = nn.ReLU()
                self.dropout_2 = nn.Dropout(p=0.4)
                # Hidden block 3
                self.layer_3 = nn.Linear(128, 64)
                self.relu_3 = nn.ReLU()
                self.dropout_3 = nn.Dropout(p=0.3)
                # Output head
                self.output_layer = nn.Linear(64, 1)
                self.sigmoid = nn.Sigmoid()

            def forward(self, x):
                hidden_blocks = (
                    (self.layer_1, self.relu_1, self.dropout_1),
                    (self.layer_2, self.relu_2, self.dropout_2),
                    (self.layer_3, self.relu_3, self.dropout_3),
                )
                for linear, activation, dropout in hidden_blocks:
                    x = dropout(activation(linear(x)))
                return self.sigmoid(self.output_layer(x))

        # Instantiate the model
        input_dimension = X_train.shape[1] # Number of features
        model = SimpleNN(input_dimension).to(DEVICE)
        print("Model Architecture:")
        print(model)

        # --- Loss Function and Optimizer ---
        criterion = nn.BCELoss() # Binary cross entropy on the model's sigmoid outputs
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
        # Learning Rate Scheduler: scales the LR by LR_SCHEDULER_FACTOR once the
        # validation loss has not improved for LR_SCHEDULER_PATIENCE epochs.
        # The `verbose` keyword is deprecated in recent PyTorch releases, so it is
        # not passed; LR reductions are reported explicitly inside the loop instead.
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=LR_SCHEDULER_FACTOR, patience=LR_SCHEDULER_PATIENCE)


        # --- Training Loop ---
        best_val_loss = float('inf') # Lowest validation loss seen so far
        epochs_no_improve = 0 # Consecutive epochs without a new best validation loss
        best_model_state = None # Copy of the weights from the best epoch

        print("Starting Training...")
        for epoch in range(EPOCHS):
            model.train() # Training mode (dropout active)
            train_loss = 0.0
            correct_train = 0
            total_train = 0

            for features, labels in train_loader:
                features, labels = features.to(DEVICE), labels.to(DEVICE)

                # Zero gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = model(features)
                loss = criterion(outputs, labels)

                # Backward pass and optimize
                loss.backward()
                optimizer.step()

                # Track loss and accuracy; the batch-mean loss is weighted by the
                # batch size so the epoch average is exact even when the last
                # batch is smaller.
                train_loss += loss.item() * features.size(0)
                predicted = (outputs > 0.5).float() # Convert probabilities to 0/1
                total_train += labels.size(0)
                correct_train += (predicted == labels).sum().item()

            epoch_train_loss = train_loss / len(train_loader.dataset)
            epoch_train_acc = correct_train / total_train

            # --- Validation Step ---
            model.eval() # Evaluation mode (dropout disabled)
            val_loss = 0.0
            correct_val = 0
            total_val = 0
            with torch.no_grad(): # No gradients are needed for validation
                for features, labels in val_loader:
                    features, labels = features.to(DEVICE), labels.to(DEVICE)
                    outputs = model(features)
                    loss = criterion(outputs, labels)

                    val_loss += loss.item() * features.size(0)
                    predicted = (outputs > 0.5).float()
                    total_val += labels.size(0)
                    correct_val += (predicted == labels).sum().item()

            epoch_val_loss = val_loss / len(val_loader.dataset)
            epoch_val_acc = correct_val / total_val

            print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {epoch_train_loss:.4f} | Train Acc: {epoch_train_acc:.4f} | Val Loss: {epoch_val_loss:.4f} | Val Acc: {epoch_val_acc:.4f}")

            # --- Learning Rate Scheduler Step ---
            # Compare the optimizer's LR before and after the step so reductions
            # are still logged without relying on the deprecated `verbose` flag.
            lr_before = optimizer.param_groups[0]['lr']
            scheduler.step(epoch_val_loss)
            lr_after = optimizer.param_groups[0]['lr']
            if lr_after < lr_before:
                print(f"Epoch {epoch+1}: reducing learning rate from {lr_before:.4e} to {lr_after:.4e}.")

            # --- Early Stopping Check ---
            if epoch_val_loss < best_val_loss:
                best_val_loss = epoch_val_loss
                epochs_no_improve = 0
                # Deep-copy so later optimizer steps cannot mutate the snapshot
                best_model_state = copy.deepcopy(model.state_dict())
            else:
                epochs_no_improve += 1
                # `>=` rather than `==` so the stop cannot be skipped if the
                # counter ever moves past the threshold.
                if epochs_no_improve >= EARLY_STOPPING_PATIENCE:
                    print(f"Early stopping triggered after {EARLY_STOPPING_PATIENCE} epochs with no improvement in validation loss.")
                    break # Stop training loop

        print("Training Finished.")

        # Restore the weights with the lowest validation loss, if any were saved
        if best_model_state is None:
            print("No improvement in validation loss, using the last model state.")
        else:
            model.load_state_dict(best_model_state)
            print("Loaded best model state based on validation loss.")


        # --- Prediction on Final Test Set ---
        model.eval() # Inference mode for the final predictions
        all_predictions = []
        with torch.no_grad():
            for batch in test_loader:
                # Run the batch on DEVICE, then bring the probabilities back as numpy
                probabilities = model(batch.to(DEVICE)).cpu().numpy()
                # Probabilities above 0.5 become class 1, everything else class 0
                hard_labels = (probabilities > 0.5).astype(int)
                # Flatten the single-column output to a 1-D run of labels
                all_predictions.extend(hard_labels.ravel())

        print("Making predictions with the Neural Network model...")
        
        # --- Output Generation ---
        # One prediction per row, written without header or index column
        output_df_nn = pd.DataFrame(all_predictions)
        output_filename_nn = 'answers.csv'
        try:
            output_df_nn.to_csv(output_filename_nn, header=False, index=False)
            print(f"Neural Network predictions saved to {output_filename_nn}")

            # --- Feedback based on best validation loss (proxy for generalization) ---
            # True test accuracy is unknown here; only the best validation loss is
            # tracked, so that is the figure reported.
            feedback_lines = (
                "Validation metrics from the epoch with the best validation loss:",
                f"Best Validation Loss achieved: {best_val_loss:.4f}",
                "Goal is to achieve accuracy between 86.5% and 91.5% on the unseen test set.",
                "Your previous score suggests you are very close to the target range.",
                f"Try submitting the '{output_filename_nn}' file generated by this code.",
            )
            for line in feedback_lines:
                print(line)

        except Exception as e:
            print(f"Error saving Neural Network predictions to {output_filename_nn}: {e}")

    else:
        # Pairs with the `list(X.columns) == list(X_test_final.columns)` check:
        # the feature columns differ in name or order between train and test.
        print("Error: Training and test feature columns do not match.")
else:
    # Pairs with the outer check: train.csv has no 'target' column, or test.csv
    # contains a column that is absent from train.csv.
    print("Error: 'target' column not found in train.csv or columns mismatch.")

Using device: cuda
Data loaded successfully.
Train and test columns match.
Data scaled and split.
Train shape: (1030, 16), Validation shape: (258, 16), Test shape: (430, 16)
Model Architecture:
SimpleNN(
  (layer_1): Linear(in_features=16, out_features=256, bias=True)
  (relu_1): ReLU()
  (dropout_1): Dropout(p=0.4, inplace=False)
  (layer_2): Linear(in_features=256, out_features=128, bias=True)
  (relu_2): ReLU()
  (dropout_2): Dropout(p=0.4, inplace=False)
  (layer_3): Linear(in_features=128, out_features=64, bias=True)
  (relu_3): ReLU()
  (dropout_3): Dropout(p=0.3, inplace=False)
  (output_layer): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)
Starting Training...
Epoch 1/500 | Train Loss: 0.5806 | Train Acc: 0.7845 | Val Loss: 0.3965 | Val Acc: 0.8411
Epoch 2/500 | Train Loss: 0.3690 | Train Acc: 0.8505 | Val Loss: 0.3510 | Val Acc: 0.8527
Epoch 3/500 | Train Loss: 0.3416 | Train Acc: 0.8621 | Val Loss: 0.3331 | Val Acc: 0.8566
Epoch 4/500 | Train Loss

In [None]:
# Placeholder cell: prints a single space and has no other effect.
print(' ')

 
