## Multiple Linear Regression
### Data Ingestion & Initial Inspection

In [54]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

In [73]:
df = pd.read_csv("../dataset/train.csv")
test_df = pd.read_csv("../dataset/test.csv")

In [21]:
df.shape

(74051, 10)

In [None]:
df = df.drop_duplicates(subset=['id'], keep='last')

In [3]:
df.isnull().sum()

id                0
Sex               0
Length            0
Diameter          0
Height            0
Weight            0
Shucked Weight    0
Viscera Weight    0
Shell Weight      0
Age               0
dtype: int64

### Preparation

In [13]:
# Separating target and predictors
X = df[['Sex', 'Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']]
y = df['Age']

In [24]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
categorical_features = ['Sex']
numerical_features = ['Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']

In [31]:
# Feature Transformation for Weighted_features show non-linear growth and skewnes
weight_features = ['Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']
# Log-transformer to reduce skewness and stabilize variance
# log1p handles zero values safely
log_transformer = FunctionTransformer(
    np.log1p, validate=False
)

In [None]:
# Numerical preprocessing pipeline
numerical_pipeline = Pipeline(steps=[
    # Standardize features to mean=0, std=1
    # Required because features have different scales
    ('scaler', StandardScaler())
])

In [34]:
# Categorical preprocessing pipeline
categorical_pipeline = Pipeline(steps=[
    # Converting 'Sex' into binary dummy variables
    # drop='first' avoiding dummy variable trap
    ('onehot', OneHotEncoder(drop='first'))
])

In [36]:
# Core of Leakage Prevention (ColumnTransformer)
preprocessor = ColumnTransformer(
    transformers=[
        # Log-transform only weighted-related features
        ('log_weight', log_transformer, weight_features),
        # Scaling all numerical features (including transformed ones)
        ('num', numerical_pipeline, numerical_features),
        # One-hot encode categorical feature
        ('cat', categorical_pipeline, categorical_features)
    ],
    remainder='drop'
)

### Model Training

In [38]:
linear_reg_model = LinearRegression()
ridge_reg_model = Ridge() # Handles Multicollinearity

In [39]:
linear_full_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', linear_reg_model)
])

In [41]:
# Fitting the data only on Training
linear_full_pipeline.fit(X_train, y_train)

In [42]:
ridge_full_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', ridge_reg_model)
])

In [43]:
# Fitting the data only on Training
ridge_full_pipeline.fit(X_train, y_train)

### Model Evaluations

In [47]:
# Linear Regression Full Pipeline
# Predict on validation data
y_pred = linear_full_pipeline.predict(X_valid)

# Compute MAE
mae = mean_absolute_error(y_valid, y_pred)

print(f"Linear Regression Validation MAE: {mae:.3f}")

Linear Regression Validation MAE: 1.478


In [48]:
# Ridge Regression Full Pipeline
# Predict on validation data
y_pred_ridged = ridge_full_pipeline.predict(X_valid)

# Compute MAE
mae_ridged = mean_absolute_error(y_valid, y_pred_ridged)

print(f"Ridge Regression Validation MAE: {mae_ridged:.3f}")

Ridge Regression Validation MAE: 1.478


### Tuning the base model

In [52]:
param_grid = {
    'model__alpha': np.logspace(-3, 3, 20)
}

grid = GridSearchCV(
    ridge_full_pipeline,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=5
)

In [53]:
grid.fit(X_train, y_train)

print(-grid.best_score_)
print(grid.best_params_)

1.469937464300337
{'model__alpha': 112.88378916846884}


### Ridge Regression With Interactions

In [65]:
ridge_interaction_pipeline = Pipeline(steps=[
    # preprocessing (encoding, scaling, log transforms)
    ('preprocessing', preprocessor),
    # add interaction terms only (no squared terms)
    ('interactions', PolynomialFeatures(
        degree=2,
        interaction_only=True,
        include_bias=False
    )),
    # Ridge regression (alpha will be tuned)
    ('model', Ridge())
])

In [66]:
param_grid = {
    'model__alpha': np.logspace(-3, 3, 20)
}

grid_search = GridSearchCV(
    estimator=ridge_interaction_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # sklearn maximizes, so negative
    cv=5,
    n_jobs=-1
)

In [67]:
grid_search.fit(X_train, y_train)

In [68]:
best_mae = -grid_search.best_score_
best_alpha = grid_search.best_params_['model__alpha']

print(f"Best CV MAE: {best_mae:.3f}")
print(f"Best alpha: {best_alpha}")

Best CV MAE: 1.422
Best alpha: 0.004281332398719396


### Retraining the Ridge Model On The Whole Training Set

In [72]:
final_model = grid_search.best_estimator_

# Fit on the full training dataset
final_model.fit(X, y)

### Testing On The Test Set

In [77]:
test_ids = test_df['id']
# Select the same predictor columns
X_test = test_df[['Sex', 'Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']]

In [None]:
test_predictions = final_model.predict(X_test) # Output is a float array

In [80]:
submission = pd.DataFrame({
    'id': test_ids,
    'yield': test_predictions
})

In [81]:
submission.to_csv('trial_01_ridge_regression.csv', index=False)

In [27]:
df.head(2)

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
