## 🌐 **Google Drive Connection**

In [70]:
# Install and configure the Kaggle API, mount Google Drive, then download and
# unpack the competition archive into a folder on the Drive.
!pip install -q kaggle

from google.colab import files
print("Carica il file kaggle.json (scaricabile dal tuo profilo Kaggle)")
files.upload()  # interactive prompt: pick kaggle.json from your machine (Kaggle -> user settings -> API -> create new token)

# Set up the credentials: move the uploaded token into ~/.kaggle and make it
# readable/writable by the owner only (mode 600).
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Mount Google Drive at /gdrive
from google.colab import drive
drive.mount("/gdrive")

# Destination folder on the Drive
dataset_path = "/gdrive/MyDrive/Artificial_Neural_Networks/Timeseries_Classification_Challenge/dataset"
!mkdir -p {dataset_path}

# Download the competition dataset from Kaggle straight into the chosen folder
!kaggle competitions download -c an2dl2526c1 -p {dataset_path}

# Unzip the downloaded archive into the same folder (-o overwrites existing
# files without prompting)
!unzip -o {dataset_path}/an2dl2526c1.zip -d {dataset_path}

print(f"Dataset scaricato e disponibile in: {dataset_path}")


Carica il file kaggle.json (scaricabile dal tuo profilo Kaggle)


Saving kaggle.json to kaggle.json
Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
an2dl2526c1.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  /gdrive/MyDrive/Artificial_Neural_Networks/Timeseries_Classification_Challenge/dataset/an2dl2526c1.zip
  inflating: /gdrive/MyDrive/Artificial_Neural_Networks/Timeseries_Classification_Challenge/dataset/pirate_pain_test.csv  
  inflating: /gdrive/MyDrive/Artificial_Neural_Networks/Timeseries_Classification_Challenge/dataset/pirate_pain_train.csv  
  inflating: /gdrive/MyDrive/Artificial_Neural_Networks/Timeseries_Classification_Challenge/dataset/pirate_pain_train_labels.csv  
  inflating: /gdrive/MyDrive/Artificial_Neural_Networks/Timeseries_Classification_Challenge/dataset/sample_submission.csv  
Dataset scaricato e disponibile in: /gdrive/MyDrive/Artificial_Neural_Networks/Timeseries_Classification_Challenge/dataset


## ⚙️ **Libraries Import**

In [71]:
# Notebook-wide setup: fixes the random seeds, imports every library used
# below, selects the compute device and configures TensorBoard and plotting.

# Set seed for reproducibility
SEED = 42

# Import necessary libraries
import os

# Set environment variables before importing modules
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
# Point matplotlib's config/cache directory at ./configs/ under the current working dir.
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

# Suppress warnings
# The second filter targets the base `Warning` class, so it silences every
# warning category (the FutureWarning filter above it is therefore redundant).
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

# Import necessary modules
import logging
import random
import numpy as np

# Set seeds for random number generators in NumPy and Python
np.random.seed(SEED)
random.seed(SEED)

# Import PyTorch
import torch
torch.manual_seed(SEED)
from torch import nn
# from torchsummary import summary
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader
# Directory name for TensorBoard logs; any TensorBoard process that is still
# running is killed before the notebook extension is loaded.
logs_dir = "tensorboard"
!pkill -f tensorboard
%load_ext tensorboard
# Create the local `models` directory if it does not exist yet.
!mkdir -p models

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
    # NOTE(review): cudnn.benchmark enables cuDNN auto-tuning, which may make
    # GPU runs non-reproducible despite the seeds set above — confirm this
    # speed/reproducibility trade-off is intended.
    torch.backends.cudnn.benchmark = True
else:
    device = torch.device("cpu")

print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Import other libraries
import copy
import shutil
from itertools import product
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from scipy.stats import skew

# Profiling library
# NOTE(review): installed unpinned (-U) on every run; consider pinning a
# version so the notebook stays reproducible.
!pip install -U ydata-profiling
from ydata_profiling import ProfileReport

# Configure plot display settings
sns.set(font_scale=1.4)
sns.set_style('white')
plt.rc('font', size=14)
%matplotlib inline

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
PyTorch version: 2.8.0+cu126
Device: cpu


## ⏳ **Data Loading**

In [72]:
# Make sure the competition CSV files are present in the Drive dataset folder,
# downloading only the ones that are missing.
dataset_path = "/gdrive/MyDrive/Artificial_Neural_Networks/Timeseries_Classification_Challenge/dataset"
os.makedirs(dataset_path, exist_ok=True)

competition = "an2dl2526c1"
# Files required by the rest of the notebook
dataset_files = ["pirate_pain_test.csv", "pirate_pain_train.csv", "pirate_pain_train_labels.csv"]

# Download each file unless a copy already exists at its destination path
for fname in dataset_files:
    dest_path = os.path.join(dataset_path, fname)
    if not os.path.exists(dest_path):
        print(f"üì• Downloading {fname} ...")
        # NOTE(review): confirm the Kaggle CLI saves the file under exactly
        # `fname` (not e.g. `fname.zip`); otherwise the existence check above
        # never succeeds and the file is downloaded again on every run.
        !kaggle competitions download -c {competition} -f {fname} -p {dataset_path}
    else:
        print(f"‚úÖ {fname} already exists. Using cached version.")

print("\nAll files ready in:", dataset_path)


‚úÖ pirate_pain_test.csv already exists. Using cached version.
‚úÖ pirate_pain_train.csv already exists. Using cached version.
‚úÖ pirate_pain_train_labels.csv already exists. Using cached version.

All files ready in: /gdrive/MyDrive/Artificial_Neural_Networks/Timeseries_Classification_Challenge/dataset


In [89]:
# Read the training features and their labels from the dataset folder.
train_csv_path = os.path.join(dataset_path, "pirate_pain_train.csv")
labels_csv_path = os.path.join(dataset_path, "pirate_pain_train_labels.csv")

df_training = pd.read_csv(train_csv_path)
df_labels = pd.read_csv(labels_csv_path)

## 🔎 **Data exploration and profiling**

In [None]:
# Build a ydata-profiling report for the training frame. Leaving the report
# object as the cell's last expression makes the notebook display it.
# NOTE(review): the recorded run of this cell was interrupted (KeyboardInterrupt),
# so generating the full report appears to be slow on this dataset.
data_profile = ProfileReport(df_training, title="Profiling Report")
data_profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/32 [00:00<?, ?it/s][A
  9%|‚ñâ         | 3/32 [00:00<00:06,  4.43it/s][A
 12%|‚ñà‚ñé        | 4/32 [00:00<00:05,  5.02it/s][A
 16%|‚ñà‚ñå        | 5/32 [00:01<00:08,  3.13it/s][A
 19%|‚ñà‚ñâ        | 6/32 [00:03<00:14,  1.75it/s]


KeyboardInterrupt: 

## 🧹 **Data cleaning**

In [90]:
# Data-cleaning pipeline for the training frame. It updates `df_training`:
#   1. cast the joint and pain-survey columns to float32,
#   2. derive the binary `is_injured` flag from `n_legs`,
#   3. drop the columns listed in `cols_to_drop`,
#   4. log-transform and clip the heavily skewed numeric columns,
#   5. robust-scale every numeric column.

# Type conversion: joint readings that cannot be parsed become NaN (errors='coerce').
joint_cols = [col for col in df_training.columns if col.startswith("joint_")]
for col in joint_cols:
    df_training[col] = pd.to_numeric(df_training[col], errors='coerce').astype('float32')

for col in ['pain_survey_1','pain_survey_2','pain_survey_3','pain_survey_4']:
    df_training[col] = df_training[col].astype('float32')

# Injury flag: 1.0 when `n_legs` is anything other than 'two', else 0.0.
# It is built on `df_training` itself so the information survives the drop of
# `n_legs` below and the cell runs on a fresh kernel.
# NOTE(review): being numeric, this flag also goes through the skew handling
# and scaling further down — confirm that is intended for a binary feature.
df_training['is_injured'] = np.where(df_training['n_legs'] != 'two', 1, 0).astype('float32')

# Drop useless columns
cols_to_drop = [
    'joint_30',     # constant
    'n_legs', 'n_hands', 'n_eyes',  # superseded by is_injured (derived from n_legs only)
    'joint_08', 'joint_09', 'joint_10', 'joint_11', 'joint_12'  # unique values
]
# Only drop the columns that are actually present in the frame.
df_training = df_training.drop(columns=[c for c in cols_to_drop if c in df_training.columns])

# Correction of skewed distributions and scaling: bucket every numeric column
# by the absolute value of its sample skewness.
num_cols = df_training.select_dtypes(include=[np.number]).columns
normal_cols = []         # |skew| < 2        -> left untransformed
transformable_cols = []  # 2 <= |skew| <= 10 -> log-transformed and clipped
extreme_cols = []        # |skew| > 10       -> log-transformed and clipped

for col in num_cols:
    s = skew(df_training[col].dropna())
    # Share of values outside the 1st-99th percentile band. Diagnostic only:
    # it is not used by the bucketing below.
    q1 = df_training[col].quantile(0.01)
    q99 = df_training[col].quantile(0.99)
    perc_outliers = ((df_training[col] < q1) | (df_training[col] > q99)).mean() * 100

    if abs(s) < 2:
        normal_cols.append(col)
    elif 2 <= abs(s) <= 10:
        transformable_cols.append(col)
    else:
        extreme_cols.append(col)

# Log-transform + clipping to handle extreme outliers.
# Each column is shifted so its minimum maps to 1e-6 before log1p, which keeps
# the argument positive; the result is then clipped to its own 1st/99th percentiles.
for col in transformable_cols + extreme_cols:
    df_training[col] = np.log1p(df_training[col] - df_training[col].min() + 1e-6)
    q_low = df_training[col].quantile(0.01)
    q_high = df_training[col].quantile(0.99)
    df_training[col] = df_training[col].clip(q_low, q_high)

# Robust scaling of every numeric column; the fitted scaler is kept in `scaler`.
scaler = RobustScaler()
df_training[num_cols] = scaler.fit_transform(df_training[num_cols])

In [85]:
# Prune redundant features: a column is removed when its absolute correlation
# with any column that precedes it in the frame exceeds the threshold.
threshold = 0.9

# Absolute pairwise correlations, restricted to the strict upper triangle so
# each pair is looked at once and the diagonal is ignored.
corr_matrix = df_training.corr().abs()
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# Drop highly correlated features (corr > 0.9)
to_drop_corr = [name for name in upper.columns if (upper[name] > threshold).any()]
df_training = df_training.drop(columns=to_drop_corr)

'"\n# List of correlated features\ncorrelated_pairs = [(col1, col2, upper.loc[col1, col2])\n                    for col1 in upper.columns\n                    for col2 in upper.columns\n                    if upper.loc[col1, col2] > threshold]\n\ncorrelated_pairs = sorted(correlated_pairs, key=lambda x: x[2], reverse=True)\n\nfor col1, col2, corr_val in correlated_pairs:\n    print(f"{col1} ‚Üî {col2}: corr={corr_val:.2f}")\n\n'