# Linear Regression, Ridge, and Lasso Regression Pipeline
This notebook demonstrates a full machine learning workflow using **Linear Regression**, **Ridge Regression**, and **Lasso Regression**. It emphasizes data preprocessing and feature engineering, and is aimed at beginner-to-intermediate learners.

In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Step 2: Load the datasets
# Locations of the three CSV inputs: training data, test data, and the
# sample submission that is later used as the output template.
train_csv = "/mnt/data/dataset/train.csv"
test_csv = "/mnt/data/dataset/test.csv"
sample_csv = "/mnt/data/dataset/sample_submission.csv"

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
sample = pd.read_csv(sample_csv)

# Report (rows, columns) for both modelling frames.
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Last expression of the cell: preview the first rows of the training data.
train.head()

In [None]:
# Step 3: Exploratory Data Analysis (EDA)
# Show the five columns with the most missing values.
print("\nMissing values in train:")
print(train.isnull().sum().sort_values(ascending=False).head())

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("\nTrain data info:")
train.info()

# Quick statistics
print(train.describe().T)

# Correlation heatmap for numeric features
# Select numeric (and boolean) columns explicitly before calling corr():
# from pandas 2.0 on, DataFrame.corr() no longer skips non-numeric columns by
# default and raises when it meets string columns.
numeric_corr = train.select_dtypes(include=["number", "bool"]).corr()

plt.figure(figsize=(10,6))
sns.heatmap(numeric_corr, cmap="coolwarm", annot=False)
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Step 4: Data Preprocessing
# Separate features and target
y = train['target']  # Replace 'target' with actual target column name
X = train.drop(columns=['target'])

# Identify categorical and numerical columns.
# select_dtypes(include='number') matches every numeric width (int32, float32,
# ...), not only the exact 'int64'/'float64' dtypes.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# ColumnTransformer drops any column that is not listed in a transformer
# (remainder='drop' is the default), so report columns that fall outside both
# groups instead of losing them silently.
unassigned_cols = [col for col in X.columns
                   if col not in numeric_cols and col not in categorical_cols]
if unassigned_cols:
    print("Columns not used by the preprocessor:", unassigned_cols)

# Preprocessor: impute + scale numeric features, impute + one-hot encode
# categoricals. The linear models cannot be fit on NaN values, so missing
# entries are filled inside the pipeline (statistics are learned on the
# training fold only).
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Missing categories become their own 'missing' level; categories unseen at
# fit time are encoded as all-zeros (handle_unknown='ignore').
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [None]:
# Step 5: Split train/validation
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split identical on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 6: Define models
# Candidate estimators keyed by the name used when reporting results.
# Insertion order (plain, Ridge, Lasso) is the order they are evaluated in.
models = {}
models['LinearRegression'] = LinearRegression()
models['Ridge'] = Ridge(alpha=1.0)
models['Lasso'] = Lasso(alpha=0.01)

In [None]:
# Step 7: Train and evaluate models
# For every candidate: fit preprocessing + estimator on the training split,
# score on the validation split, and record RMSE and R2 under the model's name.
results = {}

for name, model in models.items():
    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])
    # Pipeline.fit returns the fitted pipeline, so predict can be chained.
    valid_preds = candidate.fit(X_train, y_train).predict(X_valid)

    mse = mean_squared_error(y_valid, valid_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_valid, valid_preds)

    results[name] = {"RMSE": rmse, "R2": r2}
    print(f"{name} -> RMSE: {rmse:.4f}, R2: {r2:.4f}")

In [None]:
# Step 8: Choose best model and predict on test set
# The winner is the entry of `results` with the lowest validation RMSE.
best_model_name = min(results, key=lambda model_name: results[model_name]['RMSE'])
print("\nBest model:", best_model_name)

# Rebuild the pipeline around the winning estimator and refit it on the full
# training data (train + validation rows) before predicting the test set.
final_steps = [
    ('preprocessor', preprocessor),
    ('model', models[best_model_name]),
]
best_model = Pipeline(steps=final_steps)
best_model.fit(X, y)

test_preds = best_model.predict(test)

# Step 9: Prepare submission
# Start from a copy of the sample submission so its layout is preserved, then
# fill in the predictions.
submission = sample.copy()
submission['target'] = test_preds  # Replace 'target' with correct column name
submission.to_csv("/mnt/data/submission.csv", index=False)

print("Submission file saved!")