<a href="https://colab.research.google.com/github/Devansh-Singh09/DS/blob/main/MLassignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load training data
train_file_path = r"train.csv"
train_data = pd.read_csv(train_file_path)

# Load testing data
test_file_path = r"test.csv"
test_data = pd.read_csv(test_file_path)

# Extract features (X) and target (y) from the training data.
# 'ID' and 'medv' (the target) are excluded from the feature matrix.
X = train_data.drop(columns=['ID', 'medv'])
y = train_data['medv']  # Target variable

# Build the test feature matrix from exactly the training feature columns, in
# training order. Selecting by X.columns (instead of only dropping 'ID') keeps
# the scaler and models from seeing reordered or extra columns, and raises a
# KeyError naming the column if test.csv is missing a training feature.
X_test = test_data[X.columns]

# Split the training data into train and validation sets (80% / 20%)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: statistics are learned from the training split only and then
# applied unchanged to the validation and test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Polynomial features (degree 2) on top of the scaled features; the expansion is
# fitted on the training split and reused for validation and test.
poly = PolynomialFeatures(degree=2)  # Try degree 2, adjust as needed
X_train_poly = poly.fit_transform(X_train_scaled)
X_valid_poly = poly.transform(X_valid_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Candidate models, keyed by the display name used when reporting results
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Function to evaluate models
def evaluate_model(model, X_train, y_train, X_valid, y_valid):
    """Fit ``model`` on the training data and score it on the validation data.

    The estimator is fitted in place, so ``model`` is left trained on
    ``X_train`` / ``y_train`` after the call.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : features and targets used for fitting
    X_valid, y_valid : features and targets used for scoring

    Returns
    -------
    tuple
        ``(mse, r2)``: mean squared error and R^2 score of the validation
        predictions against ``y_valid``.
    """
    model.fit(X_train, y_train)
    validation_predictions = model.predict(X_valid)
    return (
        mean_squared_error(y_valid, validation_predictions),
        r2_score(y_valid, validation_predictions),
    )

# Evaluate every candidate on the held-out validation split
for name, model in models.items():
    print(f"Evaluating {name}...")

    if name in ['Ridge Regression', 'Lasso Regression']:
        # For regularized models, use polynomial features
        mse, r2 = evaluate_model(model, X_train_poly, y_train, X_valid_poly, y_valid)
    else:
        mse, r2 = evaluate_model(model, X_train_scaled, y_train, X_valid_scaled, y_valid)

    print(f"{name} - MSE: {mse}, R2 Score: {r2}")

# Final model: train on the ENTIRE labelled training data (train + validation
# rows), not just the 80% split, so the submission model uses every available
# example. A separate scaler is fitted on the full feature matrix so the
# split-based `scaler` and the *_scaled arrays above stay untouched.
# NOTE: XGBoost is hard-coded as the final model; check it against the
# validation scores printed above before relying on it.
full_scaler = StandardScaler()
X_full_scaled = full_scaler.fit_transform(X)
X_test_full_scaled = full_scaler.transform(X_test)

best_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
best_model.fit(X_full_scaled, y)
y_pred_test = best_model.predict(X_test_full_scaled)

# Save predictions to CSV (one row per test ID)
output = pd.DataFrame({'ID': test_data['ID'], 'medv': y_pred_test})
output_file_path = r"predictions.csv"
output.to_csv(output_file_path, index=False)

print(f"Predictions saved to {output_file_path}")

# Perform 5-fold cross-validation for XGBoost on the training split.
# scoring='neg_mean_squared_error' yields negated MSE values, hence the sign
# flip when reporting. X_train_scaled was scaled once before the folds are made,
# so all folds share the same scaling statistics.
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
cv_scores = cross_val_score(xgb_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-Validated MSE for XGBoost: {-cv_scores.mean()}")

Evaluating Linear Regression...
Linear Regression - MSE: 23.486735195425815, R2 Score: 0.7390315860425439
Evaluating Ridge Regression...
Ridge Regression - MSE: 19.33547159964755, R2 Score: 0.7851575659837904
Evaluating Lasso Regression...
Lasso Regression - MSE: 16.666883864120358, R2 Score: 0.8148090736561928
Evaluating Random Forest...
Random Forest - MSE: 8.115971761194029, R2 Score: 0.9098209154819099
Evaluating XGBoost...
XGBoost - MSE: 11.642994005560283, R2 Score: 0.8706310751977571
Predictions saved to predictions.csv
Cross-Validated MSE for XGBoost: 15.851098320179545
