In [4]:
# =================================================================
# 1. Import Libraries
# =================================================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LassoCV, Ridge, LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# =================================================================
# 2. Load and Inspect Data
# =================================================================
# Read the competition CSVs from the local course folder (adjust the paths per machine)
train = pd.read_csv(
    'C:/Users/diana_o0n3rtm/OneDrive/Documents/NU/DDS-8555/Assignment 3/playground-series-s4e4/train.csv'
)
test = pd.read_csv(
    'C:/Users/diana_o0n3rtm/OneDrive/Documents/NU/DDS-8555/Assignment 3/playground-series-s4e4/test.csv'
)

# Sanity check: dimensions of both frames, then a peek at the first training rows
print(f"Training Data Shape: {train.shape}")
print(f"Test Data Shape: {test.shape}")
print("\nFirst 5 Training Samples:", train.head(), sep="\n")

# List the columns of each frame so schema differences are easy to spot
print(f"\nTraining Data Columns: {list(train.columns)}")
print(f"Test Data Columns: {list(test.columns)}")

# =================================================================
# 3. Preprocess Data
# =================================================================
# Split the training frame into the 'Rings' target and its predictors;
# the 'id' column is removed from both feature frames.
y_train = train['Rings']
X_train = train.drop(columns=['id', 'Rings'])
X_test = test.drop(columns='id')

# Column groups routed to each transformer
categorical_cols = ['Sex']
numeric_cols = [
    'Length', 'Diameter', 'Height',
    'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight',
]

# Preprocessing pipeline:
# - 'cat': one-hot encode the categorical column (unknown categories are ignored)
# - 'num': standardize the numeric measurements
# Any column not listed above is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numeric_cols),
    ],
    remainder='passthrough',
)

# Fit on the training features only, then reuse the fitted transformer on test
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Names of the output columns, in the order the transformer emits them
feature_names = preprocessor.get_feature_names_out()
print("\nProcessed Feature Names:", feature_names)

# =================================================================
# 4. Model 1: Regularized Regression (Lasso + Ridge)
# =================================================================
# Step 1: Use LassoCV for feature selection.
# 10-fold cross-validation picks the penalty from a log-spaced grid of
# 100 alphas between 1e-5 and 1e2.
lasso_selector = LassoCV(cv=10, alphas=np.logspace(-5, 2, 100),
                         max_iter=10000, random_state=42)
lasso_selector.fit(X_train_processed, y_train)

# Column indices whose Lasso coefficient is non-zero (the selected features)
selected_features = np.flatnonzero(lasso_selector.coef_)
if selected_features.size == 0:
    # A strong penalty can shrink every coefficient to zero. Ridge cannot be
    # fitted on a zero-column matrix, so fall back to the full feature set
    # instead of crashing in Step 2.
    print("\nLasso removed every feature; falling back to all features.")
    selected_features = np.arange(X_train_processed.shape[1])
print("\nSelected Features by Lasso:")
print(feature_names[selected_features])

# Step 2: Train Ridge regression on the selected columns only
ridge_model = Ridge(alpha=0.1)  # Small alpha for stability
ridge_model.fit(X_train_processed[:, selected_features], y_train)

# =================================================================
# 5. Model 2: Principal Component Regression (PCR)
# =================================================================
# Step 1: project the processed features onto principal components.
# A fractional n_components asks PCA to keep as many components as are
# needed to explain that share of the variance (95% here).
pca = PCA(n_components=0.95)
# The processed matrix is passed as-is, without a .toarray() conversion
X_train_pca = pca.fit_transform(X_train_processed)

n_features_before = X_train_processed.shape[1]
n_components_kept = pca.n_components_
print(f"\nPCA Results: Reduced from {n_features_before} to {n_components_kept} components")
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

# Step 2: ordinary least squares on the principal-component scores
pcr_model = LinearRegression().fit(X_train_pca, y_train)

# =================================================================
# 6. Generate Predictions & Submissions
# =================================================================
# Predict using both models.
# Linear models are unbounded, so extreme test rows can yield predictions
# below any ring count seen in training (even negative ones). Such values
# are not valid counts, so predictions are clipped at the training minimum.
# NOTE(review): if the competition uses a log-based metric (e.g. RMSLE),
# negative predictions would also be unscorable — confirm on the competition page.
min_rings = y_train.min()

# A. Lasso-Ridge Predictions (same column subset the Ridge model was trained on)
test_selected = X_test_processed[:, selected_features]
ridge_pred = np.clip(ridge_model.predict(test_selected), min_rings, None)

# B. PCR Predictions (test rows projected with the PCA fitted on training data)
X_test_pca = pca.transform(X_test_processed)
pcr_pred = np.clip(pcr_model.predict(X_test_pca), min_rings, None)

# Create submission files: one row per test id with the predicted 'Rings'.
# test['id'] is used instead of attribute access so the lookup cannot be
# confused with a DataFrame attribute or method.
pd.DataFrame({'id': test['id'], 'Rings': ridge_pred}).to_csv('C:/Users/diana_o0n3rtm/OneDrive/Documents/NU/DDS-8555/Assignment 3/ridge_submission.csv', index=False)
pd.DataFrame({'id': test['id'], 'Rings': pcr_pred}).to_csv('C:/Users/diana_o0n3rtm/OneDrive/Documents/NU/DDS-8555/Assignment 3/pcr_submission.csv', index=False)

print("\nSubmission files created: ridge_submission.csv, pcr_submission.csv")

Training Data Shape: (90615, 10)
Test Data Shape: (60411, 9)

First 5 Training Samples:
   id Sex  Length  Diameter  Height  Whole weight  Whole weight.1  \
0   0   F   0.550     0.430   0.150        0.7715          0.3285   
1   1   F   0.630     0.490   0.145        1.1300          0.4580   
2   2   I   0.160     0.110   0.025        0.0210          0.0055   
3   3   M   0.595     0.475   0.150        0.9145          0.3755   
4   4   I   0.555     0.425   0.130        0.7820          0.3695   

   Whole weight.2  Shell weight  Rings  
0          0.1465        0.2400     11  
1          0.2765        0.3200     11  
2          0.0030        0.0050      6  
3          0.2055        0.2500     10  
4          0.1600        0.1975      9  

Training Data Columns: ['id', 'Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight', 'Rings']
Test Data Columns: ['id', 'Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whol