## **Multivariate Regression for House Pricing Dataset**

1- Try different learning rates  
2- Try with/without feature scaling  
3- Try polynomial regression  
4- Try the scikit-learn library  
5- The model overfits even in the linear case. Notice the extreme overfitting when the polynomial degree is 10.


In [None]:
import csv
import matplotlib.pyplot as plt

def load_csv(filename):
    """Read a CSV file and return its body as a list of string rows.

    The first row (header) and the first column of every remaining row are
    discarded. Values are returned as the raw strings produced by
    ``csv.reader``; no numeric conversion happens here.
    """
    with open(filename, newline='') as handle:
        reader = csv.reader(handle)
        # Consume the header row; the default keeps an empty file from raising.
        next(reader, None)
        # Drop the leading column of each record.
        return [record[1:] for record in reader]

def handle_missing_values(data):
    """Convert every cell to float, replacing 'NA' with its column mean.

    ``data`` is a sequence of equally long rows. Each column is processed
    independently: cells equal to the string 'NA' are treated as missing and
    filled with the mean of the remaining cells in that column. The result is
    a list of tuples, one per row.

    A column that consists only of 'NA' has no mean and raises
    ZeroDivisionError.
    """
    filled_columns = []
    # zip(*data) walks the table column by column.
    for raw_column in zip(*data):
        parsed = [None if cell == 'NA' else float(cell) for cell in raw_column]
        known = [value for value in parsed if value is not None]
        column_mean = sum(known) / len(known)
        filled_columns.append(
            [column_mean if value is None else value for value in parsed]
        )

    # Transposing again restores the row-major layout.
    return list(zip(*filled_columns))

def feature_scaling(features):
    """Min-max scale every feature column into the range [0, 1].

    Args:
        features: sequence of rows, each a sequence of numbers. The number of
            columns is taken from the first row.

    Returns:
        A list of tuples (one per input row) holding the scaled values.
        The minimum and maximum are computed from ``features`` itself.
        An empty input yields an empty list.

    A column whose values are all identical has zero range; dividing by it
    would raise ZeroDivisionError, so such a column is mapped to 0.0 instead.
    """
    if not features:
        return []

    scaled_columns = []
    for i in range(len(features[0])):
        col = [row[i] for row in features]
        min_val = min(col)
        value_range = max(col) - min_val
        if value_range == 0:
            # Constant column: carries no information, so scale it to zeros
            # rather than dividing by zero.
            scaled_columns.append([0.0] * len(col))
        else:
            scaled_columns.append([(x - min_val) / value_range for x in col])

    # Transpose the per-column lists back into per-row tuples.
    return list(zip(*scaled_columns))

def add_polynomial_features(features, degree=2):
    """Expand each feature into its powers 1..degree.

    For every row, each value ``v`` is replaced by ``v**1, v**2, ...,
    v**degree`` in that order, so a row of ``k`` values becomes a list of
    ``k * degree`` values. Only per-feature powers are produced: there are no
    cross terms between features and no constant term.
    """
    exponents = range(1, degree + 1)
    return [
        [value ** power for value in sample for power in exponents]
        for sample in features
    ]

def split_features_labels(data):
    """Separate a table into feature rows and a label list.

    The last entry of every row is the label; everything before it is the
    feature vector. All values are converted to float. Returns the pair
    ``(features, labels)`` where ``features`` is a list of lists.
    """
    inputs = [[float(cell) for cell in record[:-1]] for record in data]
    targets = list(map(float, (record[-1] for record in data)))
    return inputs, targets

def predict(features, weights):
    """Return the linear prediction (dot product with ``weights``) per row.

    Each row is paired element-wise with ``weights`` via ``zip``, so any
    surplus entries on either side are ignored.
    """
    outputs = []
    for sample in features:
        total = 0
        for weight, value in zip(weights, sample):
            total += weight * value
        outputs.append(total)
    return outputs

def compute_cost(features, labels, weights):
    """Return the halved mean squared error of the linear model.

    Computes ``sum((prediction - label) ** 2) / (2 * m)`` where ``m`` is the
    number of labels and predictions come from ``predict``.
    """
    sample_count = len(labels)
    squared_error_total = 0
    for estimate, target in zip(predict(features, weights), labels):
        squared_error_total += (estimate - target) ** 2
    return squared_error_total / (2 * sample_count)

def gradient_descent(train_features, train_labels, test_features, test_labels, weights, learning_rate, epochs):
    """Fit linear-model weights with full-batch gradient descent.

    Args:
        train_features: training rows, each indexable by feature position.
        train_labels: target value for each training row.
        test_features: held-out rows, used only to track the test loss.
        test_labels: target value for each held-out row.
        weights: initial weights, one per feature. The list is not mutated.
        learning_rate: step size applied to the gradient.
        epochs: number of full passes over the training set.

    Returns:
        ``(weights, train_loss_history, test_loss_history)``. The histories
        hold one ``compute_cost`` value per epoch, measured after that
        epoch's weight update.

    Progress is printed every 100 epochs.
    """
    m_train = len(train_labels)
    train_loss_history = []
    test_loss_history = []

    for epoch in range(epochs):
        train_predictions = predict(train_features, weights)
        # The residuals are the same for every weight, so compute them once
        # per epoch instead of once per weight.
        errors = [pred - true for pred, true in zip(train_predictions, train_labels)]

        # Gradient of the halved MSE with respect to each weight.
        gradients = [
            sum(err * row[i] for err, row in zip(errors, train_features)) / m_train
            for i in range(len(weights))
        ]
        weights = [w - learning_rate * g for w, g in zip(weights, gradients)]

        # Losses are evaluated with the freshly updated weights.
        train_loss = compute_cost(train_features, train_labels, weights)
        test_loss = compute_cost(test_features, test_labels, weights)

        train_loss_history.append(train_loss)
        test_loss_history.append(test_loss)

        # Optional: Print progress
        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Train Loss: {train_loss}, Test Loss: {test_loss}")

    return weights, train_loss_history, test_loss_history

# Load data (load_csv drops the header row and the first column of each file)
train_data = load_csv('house_price_train_subset.csv')
test_data = load_csv('house_price_test_subset.csv')

# Handle missing values: 'NA' cells become the mean of their column.
# NOTE(review): each split is imputed with its own column means, and the label
# column is imputed too because labels are only split off in the next step.
train_data = handle_missing_values(train_data)
test_data = handle_missing_values(test_data)

# Split features and labels (last column is the label)
train_features, train_labels = split_features_labels(train_data)
test_features, test_labels = split_features_labels(test_data)

# Feature scaling (min-max to [0, 1])
# NOTE(review): feature_scaling derives min/max from the data it is given, so
# the test set is scaled with its own statistics rather than the training
# set's. The scikit-learn cell later in this file reuses the scaler fitted on
# the training data instead; confirm which behaviour is intended.
train_features = feature_scaling(train_features)
test_features = feature_scaling(test_features)

# Add polynomial features (per-feature powers 1..polynomial_degree)
polynomial_degree = 2
train_features = add_polynomial_features(train_features, degree=polynomial_degree)
test_features = add_polynomial_features(test_features, degree=polynomial_degree)

# Initialize weights: one zero weight per expanded feature.
# NOTE(review): no constant column is added anywhere above, so the model has
# no intercept term.
weights = [0.0] * len(train_features[0])

# Train model
learning_rate = 0.1
epochs = 400
weights, train_loss_history, test_loss_history = gradient_descent(train_features, train_labels, test_features, test_labels, weights, learning_rate, epochs)

# Plot loss curves (one point per epoch for each split)
plt.figure(figsize=(10, 6))
plt.plot(range(epochs), train_loss_history, label='Train Loss')
plt.plot(range(epochs), test_loss_history, label='Test Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Loss Curves')
plt.show()


Using scikit-learn library functions

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Load data (the first CSV column is used as the DataFrame index)
train_data = pd.read_csv('house_price_train_subset.csv', index_col=0)
test_data = pd.read_csv('house_price_test_subset.csv', index_col=0)

# Handle missing values: column means are fitted on the training frame and
# reused for the test frame.
# NOTE(review): the imputer sees every column, including the last one that is
# later used as the label, so missing labels would be mean-filled as well.
imputer = SimpleImputer(strategy='mean')
train_data_imputed = imputer.fit_transform(train_data)
test_data_imputed = imputer.transform(test_data)

# Split features and labels (last column is the label)
train_features = train_data_imputed[:, :-1]
train_labels = train_data_imputed[:, -1]
test_features = test_data_imputed[:, :-1]
test_labels = test_data_imputed[:, -1]

# Feature scaling: the scaler is fitted on the training features only and
# then applied unchanged to the test features.
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_features)

# Add polynomial features (degree 2), fitted on train and applied to test
poly = PolynomialFeatures(degree=2)
train_features_poly = poly.fit_transform(train_features_scaled)
test_features_poly = poly.transform(test_features_scaled)

# Train model
model = LinearRegression()
model.fit(train_features_poly, train_labels)

# Predictions
train_predictions = model.predict(train_features_poly)
test_predictions = model.predict(test_features_poly)

# Calculate loss: MSE is halved to match the 1/(2m) convention of
# compute_cost in the hand-written cell above.
train_loss = mean_squared_error(train_labels, train_predictions) / 2
test_loss = mean_squared_error(test_labels, test_predictions) / 2

print(f'Train Loss: {train_loss}')
print(f'Test Loss: {test_loss}')

