# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from scipy.optimize import minimize

# 
# Load the training data from a hard-coded absolute Windows path.
data = pd.read_csv(r"C:\Users\Admin\Downloads\train.csv")

# Two predictor columns and the regression target.
X = data[['GrLivArea', 'YearBuilt']]
y = data['SalePrice']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Fit the three base regressors on the scaled training features.
lr = LinearRegression().fit(X_train_scaled, y_train)
svr = SVR(kernel='rbf', C=100, gamma=0.1).fit(X_train_scaled, y_train)
dt = DecisionTreeRegressor(max_depth=5, random_state=42).fit(X_train_scaled, y_train)

# Predict the validation rows with each fitted model.
y_pred_lr = lr.predict(X_val_scaled)
y_pred_svr = svr.predict(X_val_scaled)
y_pred_dt = dt.predict(X_val_scaled)

# Validation mean squared error of each model on its own.
mse_lr, mse_svr, mse_dt = (
    mean_squared_error(y_val, model_pred)
    for model_pred in (y_pred_lr, y_pred_svr, y_pred_dt)
)

print("\nIndividual Model Performance:")
for label, score in (
    ("Linear Regression", mse_lr),
    ("SVR", mse_svr),
    ("Decision Tree", mse_dt),
):
    print(f"- {label} MSE: {score:.2f}")

# Blend 1: plain average of the three prediction vectors.
y_pred_avg = (y_pred_lr + y_pred_svr + y_pred_dt) / 3
mse_avg = mean_squared_error(y_val, y_pred_avg)

# Blend 2: hand-picked weights, ordered (linear, SVR, tree), with the
# largest share given to linear regression.
weights_manual = [0.4, 0.3, 0.3]
w_lr, w_svr, w_dt = weights_manual
y_pred_weighted = w_lr * y_pred_lr + w_svr * y_pred_svr + w_dt * y_pred_dt
mse_weighted = mean_squared_error(y_val, y_pred_weighted)

# 
def blending_mse(weights, preds, true):
    """Return the mean squared error of a weighted blend of predictions.

    Args:
        weights: One weight per model, in the same order as ``preds``.
        preds: Sequence of equally shaped prediction arrays, one per model.
        true: Ground-truth target values the blend is scored against.

    Returns:
        The value of ``mean_squared_error(true, blended)``, where ``blended``
        is the sum of ``w * p`` over the paired weights and predictions.

    Raises:
        ValueError: If ``weights`` and ``preds`` differ in length.
    """
    # zip() would silently drop the unmatched models, so reject a mismatch.
    if len(weights) != len(preds):
        raise ValueError(
            f"expected one weight per prediction array, got "
            f"{len(weights)} weights for {len(preds)} arrays"
        )

    # Accumulate in floating point: an accumulator that inherited an integer
    # dtype from preds[0] would make the in-place add of a float product fail.
    blended_pred = np.zeros_like(preds[0], dtype=float)
    for w, p in zip(weights, preds):
        blended_pred += w * p
    return mean_squared_error(true, blended_pred)

# Validation-set predictions of the base models, in the order
# (linear regression, SVR, decision tree).
preds_val = [y_pred_lr, y_pred_svr, y_pred_dt]
n_models = len(preds_val)

# Initial guess (equal weights)
initial_weights = [1 / n_models] * n_models

# weights sum to 1 and each weight between 0 and 1
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
bounds = [(0, 1) for _ in range(n_models)]

# Search for the weights that minimise the blended validation MSE.
# SLSQP is named explicitly because the problem combines bounds with an
# equality constraint.
# NOTE(review): the weights are tuned on the same validation predictions that
# are scored below, so the reported "optimized" MSE is optimistic; a separate
# hold-out set would be needed for an unbiased estimate.
result = minimize(blending_mse, initial_weights,
                  args=(preds_val, y_val),
                  method='SLSQP',
                  bounds=bounds,
                  constraints=constraints)

# Make a failed optimisation visible instead of silently reporting result.x.
if not result.success:
    print(f"Warning: weight optimization did not converge: {result.message}")

optimized_weights = result.x
y_pred_optimized = sum(w * p for w, p in zip(optimized_weights, preds_val))
mse_optimized = mean_squared_error(y_val, y_pred_optimized)

# Baseline for the improvement figures: the best individual model.
best_single_mse = min(mse_lr, mse_svr, mse_dt)

print(f"- Average Blending MSE: {mse_avg:.2f} (Improvement: {(1 - mse_avg/best_single_mse)*100:.1f}%)")
print(f"- Weighted Blending MSE: {mse_weighted:.2f} (Improvement: {(1 - mse_weighted/best_single_mse)*100:.1f}%)")
print(f"- Optimized Blending MSE: {mse_optimized:.2f} (Improvement: {(1 - mse_optimized/best_single_mse)*100:.1f}%)")
print(f"  Optimized Weights: Linear={optimized_weights[0]:.3f}, SVR={optimized_weights[1]:.3f}, Tree={optimized_weights[2]:.3f}")



import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# 

# Load the training data from a hard-coded absolute Windows path.
data = pd.read_csv(r"C:\Users\Admin\Downloads\train.csv")

#features and target
X = data[['GrLivArea', 'YearBuilt']]
y = data['SalePrice']

# Split into train and validation sets (80/20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (statistics are learned from the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Baseline: one decision tree trained on the full training split.
single_tree = DecisionTreeRegressor(max_depth=5, random_state=42)
single_tree.fit(X_train_scaled, y_train)
y_pred_single = single_tree.predict(X_val_scaled)
mse_single = mean_squared_error(y_val, y_pred_single)
print(f"Single Decision Tree MSE: {mse_single:.2f}")

# Implement Bagging: train each tree on its own bootstrap sample and average
# the validation predictions.
n_estimators = 10  # Number of trees in the ensemble
bootstrap_ratio = 0.8  # Size of bootstrap sample relative to original data
predictions = []

# A seeded generator makes the bootstrap draws, and therefore every number
# printed below, reproducible from run to run.
rng = np.random.default_rng(42)
n_train = len(X_train_scaled)
n_samples = int(bootstrap_ratio * n_train)  # same for every tree, so computed once

for i in range(n_estimators):
    # Create bootstrap sample (random selection with replacement)
    indices = rng.choice(n_train, size=n_samples, replace=True)

    X_boot = X_train_scaled[indices]
    y_boot = y_train.iloc[indices]  # positional lookup to match the array rows

    # Train tree on bootstrap sample
    tree = DecisionTreeRegressor(
        max_depth=5,
        random_state=42 + i  # Different seed for each tree
    )
    tree.fit(X_boot, y_boot)

    # Store predictions on validation set
    pred = tree.predict(X_val_scaled)
    predictions.append(pred)

# Average predictions from all trees
y_pred_bagging = np.mean(predictions, axis=0)
mse_bagging = mean_squared_error(y_val, y_pred_bagging)

# Compare the ensemble with the single-tree baseline.
print(f"\nBagging Results (with {n_estimators} trees):")
print(f"- Bagging MSE: {mse_bagging:.2f}")
print(f"- Single Tree MSE: {mse_single:.2f}")
print(f"Improvement: {(1 - mse_bagging/mse_single)*100:.1f}%")

# Spread of the individual bootstrap trees versus the averaged ensemble.
print("\nIndividual Tree Performance on Validation Set:")
tree_mses = [mean_squared_error(y_val, pred) for pred in predictions]
print(f"- Best individual tree MSE: {min(tree_mses):.2f}")
print(f"- Worst individual tree MSE: {max(tree_mses):.2f}")
print(f"- Average individual tree MSE: {np.mean(tree_mses):.2f}")
print(f"- Bagging MSE: {mse_bagging:.2f} (better than average by {(1 - mse_bagging/np.mean(tree_mses))*100:.1f}%)")


import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# 

# This section's own import list brings in only KFold from model_selection;
# train_test_split is imported here so the section also runs on its own.
from sklearn.model_selection import train_test_split

# Load the training data from a hard-coded absolute Windows path.
data = pd.read_csv(r"C:\Users\Admin\Downloads\train.csv")

#  features and target
X = data[['GrLivArea', 'YearBuilt']]
y = data['SalePrice']

# Split into train and test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (statistics are learned from the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Stage 0 base models.
stage0_models = [
    ('LinearRegression', LinearRegression()),
    ('DecisionTree', DecisionTreeRegressor(max_depth=5, random_state=42))
]

# 2. Configuration
K0 = 3  # Number of folds for Stage 0
M0 = len(stage0_models)  # Number of models for Stage 0 (derived from the list)
K1 = 2  # Number of folds for Stage 1
M1 = 1  # Number of models for Stage 1

# Meta-features: one column per Stage 0 model.
blend_data = np.zeros((len(X_train_scaled), M0))  # out-of-fold train predictions
blend_test = np.zeros((len(X_test_scaled), M0))   # fold-averaged test predictions

# K-fold cross-validation
kf = KFold(n_splits=K0, shuffle=True, random_state=42)

for m_idx, (m_name, model) in enumerate(stage0_models):
    print(f"\nTraining {m_name} in Stage 0...")
    test_preds = np.zeros((len(X_test_scaled), K0))

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled)):
        # Split data
        X_train_fold, X_val_fold = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Train model
        model.fit(X_train_fold, y_train_fold)

        # Predictions for the held-out fold become this model's meta-feature
        # for those rows.
        blend_data[val_idx, m_idx] = model.predict(X_val_fold)

        # Each fold's model also predicts the whole test set.
        test_preds[:, fold_idx] = model.predict(X_test_scaled)

    # Average the per-fold test predictions into one test meta-feature column.
    blend_test[:, m_idx] = np.mean(test_preds, axis=1)

# Stage 1 meta-model, trained on the Stage 0 meta-features.
meta_model = RandomForestRegressor(n_estimators=50, random_state=42)

# K-fold cross-validation
kf1 = KFold(n_splits=K1, shuffle=True, random_state=42)
test_preds_stage1 = np.zeros((len(X_test_scaled), K1))

for fold_idx, (train_idx, val_idx) in enumerate(kf1.split(blend_data)):
    # Split the meta-features and the matching targets.
    X_meta_train, X_meta_val = blend_data[train_idx], blend_data[val_idx]
    y_meta_train, y_meta_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Fit the meta-model on this fold's training part.
    meta_model.fit(X_meta_train, y_meta_train)

    # Report the error on the held-out part of the meta-features.
    val_pred = meta_model.predict(X_meta_val)
    val_mse = mean_squared_error(y_meta_val, val_pred)
    print(f"Stage 1 Fold {fold_idx+1} MSE: {val_mse:.2f}")

    # Each fold's meta-model predicts the test meta-features.
    test_preds_stage1[:, fold_idx] = meta_model.predict(blend_test)

# Final stacked prediction: mean over the Stage 1 fold models.
final_predictions = np.mean(test_preds_stage1, axis=1)

# Baseline: each Stage 0 model refit on the full training split.
print("\nSingle Model Performance:")
single_model_mses = []
for m_name, model in stage0_models:
    model.fit(X_train_scaled, y_train)
    pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, pred)
    single_model_mses.append(mse)
    print(f"- {m_name} MSE: {mse:.2f}")

# Stacking performance
stacking_mse = mean_squared_error(y_test, final_predictions)
print("\nStacking Performance:")
print(f"- Final Stacking MSE: {stacking_mse:.2f}")

# Reuse the scores computed above instead of fitting both models a second time.
best_single_mse = min(single_model_mses)
print(f"\nImprovement over best single model: {(1 - stacking_mse/best_single_mse)*100:.1f}%")