# 2.1 
Give 3 examples of continuous and categorical features in the dataset; choose one feature of each
type and plot the histogram to illustrate the distribution.

# Continuous Features:

* MasVnrArea: Masonry veneer area in square feet
* 1stFlrSF: First Floor square feet
* GarageArea: Size of garage in square feet

# Categorical Features:

* MasVnrType: Masonry veneer type
* ExterQual: Quality of the material on the exterior
* BsmtCond: General condition of the basement

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the raw training split and report how many rows/columns it has
df = pd.read_csv('./data/train.csv')
train_shape = df.shape
print("Full train dataset shape is {}".format(train_shape))
# Trailing expression: the notebook renders the first 90 rows as the cell output
df.head(90)

In [None]:
# Plot the raw distributions of three continuous and three categorical
# features of `df` (loaded in the previous cell) on a shared figure.

# Continuous features
continuous_features = ['LotFrontage', 'LotArea', 'MasVnrArea']

# Categorical features
categorical_features = ['MSSubClass', 'MSZoning', 'Street']

# 4x4 grid of subplots, flattened so panels can be addressed in reading order
fig, axes = plt.subplots(4, 4, figsize=(16, 16))
flattened_axes = axes.flatten()

n_continuous = len(continuous_features)
n_used = n_continuous + len(categorical_features)

# Continuous features occupy the first panels; missing values are dropped
# before binning
for ax, feature in zip(flattened_axes[:n_continuous], continuous_features):
    ax.hist(df[feature].dropna(), bins=30, edgecolor='black')
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

# Categorical features follow immediately after, drawn as per-category counts
for ax, feature in zip(flattened_axes[n_continuous:n_used], categorical_features):
    df[feature].value_counts().plot(kind='bar', edgecolor='black', ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

# Every panel past the last plotted feature is removed from the figure
for ax in flattened_axes[n_used:]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

In [None]:
# Impute, min-max scale and one-hot encode a handful of features of `df`,
# then plot the resulting distributions.

# Continuous features
continuous_features = ['LotFrontage', 'LotArea', 'MasVnrArea']

# Categorical features
categorical_features = ['MSSubClass', 'MSZoning', 'Street']

# Handling Missing Values: median for continuous, most frequent value for
# categorical. The result is assigned back to the column rather than using
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on
# a temporary object, is deprecated, and has no effect under pandas
# Copy-on-Write.
for feature in continuous_features:
    df[feature] = df[feature].fillna(df[feature].median())
for feature in categorical_features:
    df[feature] = df[feature].fillna(df[feature].mode()[0])

# Normalizing Numerical Features to the [0, 1] range
scaler = MinMaxScaler()
df[continuous_features] = scaler.fit_transform(df[continuous_features])

# Encoding Categorical Features (first level of each feature is dropped)
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Collect the dummy columns produced above. get_dummies names them
# "<feature>_<level>", so match on that prefix instead of a bare substring,
# which could also pick up unrelated columns that merely contain the name.
dummy_prefixes = tuple(f'{feature}_' for feature in categorical_features)
updated_categorical_features = [col for col in df.columns if col.startswith(dummy_prefixes)]

# Calculate grid size: smallest square grid that fits every plotted column
total_features = len(continuous_features) + len(updated_categorical_features)
grid_size = int(np.ceil(np.sqrt(total_features)))

# Initialize grid for subplots; squeeze=False keeps `axes` a 2-D array even
# for a 1x1 grid so that .flatten() always works
fig, axes = plt.subplots(grid_size, grid_size, figsize=(16, 16), squeeze=False)
flattened_axes = axes.flatten()

# Plot histograms for continuous features
for i, feature in enumerate(continuous_features):
    ax = flattened_axes[i]
    ax.hist(df[feature], bins=30, edgecolor='black')
    ax.set_title(f'Distribution of {feature}')

# Plot value counts for the one-hot encoded categorical columns
for i, feature in enumerate(updated_categorical_features):
    ax = flattened_axes[i + len(continuous_features)]
    df[feature].value_counts().plot(kind='bar', edgecolor='black', ax=ax)
    ax.set_title(f'Distribution of {feature}')

# Remove unused subplots
for i in range(total_features, len(flattened_axes)):
    fig.delaxes(flattened_axes[i])

plt.tight_layout()
plt.show()

# 2.3 Pre-Process the data
I combined the training and test sets and pre-processed the combined dataset based on the given columns within the dataset.

In [None]:
# Before Pre-Processing

# Build the combined frame from the raw CSV files inside this cell. The plots
# are meant to show the data *before* any pre-processing, and no earlier cell
# defines `combined_df`, so relying on it would either fail or plot data that
# another cell has already scaled. A separate name is used so that any
# existing `combined_df` is left untouched.
raw_combined_df = pd.concat(
    [pd.read_csv('./data/train.csv'), pd.read_csv('./data/test.csv')],
    ignore_index=True,
)

# Histogram for a numerical column
sns.histplot(raw_combined_df['LotArea'], kde=False)
plt.title('Distribution of LotArea Before Preprocessing')
plt.show()

# Boxplot for a categorical column against a numerical column
# (test.csv is concatenated without a SalePrice column, so its rows are NaN there)
sns.boxplot(x='MSZoning', y='SalePrice', data=raw_combined_df)
plt.title('SalePrice by MSZoning Before Preprocessing')
plt.show()


In [None]:
# Debugging step: sanity-check the combined train+test data, then impute and
# standardize its numerical columns.
# NOTE: this cell reads `numerical_cols`, which is defined in the
# pre-processing cell further down; run that cell first.

# Check the initial datasets
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

print("Train DataFrame shape:", train_df.shape)
print("Test DataFrame shape:", test_df.shape)

# Combine train and test datasets
combined_df = pd.concat([train_df, test_df], ignore_index=True)

print("Combined DataFrame shape:", combined_df.shape)

# These checks run only after `combined_df` has been created above; previously
# they came first and depended on a value left over from another cell.
if combined_df.empty:
    print("The DataFrame is empty. Check your data.")
print("Rows with any NA values: ", combined_df.isna().any(axis=1).sum())

missing_cols = [col for col in numerical_cols if col not in combined_df.columns]
if missing_cols:
    print(f"Missing columns: {missing_cols}")
else:
    print("All numerical columns are present.")

na_counts = combined_df[numerical_cols].isna().sum()
print("NA counts in numerical columns:", na_counts)

temp_df = combined_df[numerical_cols]
print("Subset DataFrame shape:", temp_df.shape)

# Median imputation. The filled column is assigned back instead of calling
# fillna(inplace=True) on `combined_df[col]`, which acts on a temporary object
# and is deprecated / ineffective under pandas Copy-on-Write.
for col in numerical_cols:
    median_val = combined_df[col].median()
    combined_df[col] = combined_df[col].fillna(median_val)

# Standardize the numerical columns of the combined frame
scaler = StandardScaler()
combined_df[numerical_cols] = scaler.fit_transform(combined_df[numerical_cols])

In [None]:
# Pre-process the training and testing sets (imputation, standardization,
# label encoding) and write the results to new CSV files.

# Load the training and testing CSV file
training_df = pd.read_csv('./data/train.csv')
testing_df = pd.read_csv('./data/test.csv')

# Identify numerical columns (excluding 'SalePrice' for the testing set)
numerical_cols = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
                  'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                  'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
                  'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
                  'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
                  'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
                  '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']

# Step 1: Handle Missing Values for both training and testing.
# The training median is used for both sets. The filled column is assigned
# back rather than using fillna(inplace=True) on `frame[col]`, which acts on a
# temporary object and is deprecated / ineffective under pandas Copy-on-Write.
for col in numerical_cols:
    median_val = training_df[col].median()
    training_df[col] = training_df[col].fillna(median_val)
    testing_df[col] = testing_df[col].fillna(median_val)

# Step 2: Normalize Numerical Variables.
# The feature scaler is fitted on the training set only and then applied to
# the testing set, so both sets share one scale. Fitting a second scaler on
# the testing set would map the same raw value to different standardized
# values in the two sets.
scaler_train = StandardScaler()
training_df[numerical_cols] = scaler_train.fit_transform(training_df[numerical_cols])
testing_df[numerical_cols] = scaler_train.transform(testing_df[numerical_cols])
# Kept as an alias so any code that still refers to `scaler_test` keeps working
scaler_test = scaler_train

# 'SalePrice' exists only in the training set; it is standardized with its
# own scaler (StandardScaler works column by column, so the resulting values
# match scaling it together with the features).
scaler_saleprice = StandardScaler()
training_df[['SalePrice']] = scaler_saleprice.fit_transform(training_df[['SalePrice']])

# Step 3: Encode Categorical Variables for both training and testing.
# Every column that is not 'Id', numerical or the target is label-encoded.
categorical_cols = training_df.columns.difference(['Id'] + numerical_cols + ['SalePrice'])
label_encoder = LabelEncoder()

for col in categorical_cols:
    # Fit on the union of train and test values (as strings, so missing
    # values become the literal 'nan') so transform never meets an unseen label
    combined_data = pd.concat([training_df[col], testing_df[col]]).astype(str)
    label_encoder.fit(combined_data)

    training_df[col] = label_encoder.transform(training_df[col].astype(str))
    testing_df[col] = label_encoder.transform(testing_df[col].astype(str))

# Save the pre-processed data back to new CSV files
training_df.to_csv('./data/pre-processed_training.csv', index=False)
testing_df.to_csv('./data/pre-processed_testing.csv', index=False)

# 2.4

In [None]:
# Load the pre-processed training data and compare the 'Street' column with
# its one-hot encoded form.
df = pd.read_csv('./data/pre-processed_training.csv')

if 'Street' not in df.columns:
    print("'Street' column not found in DataFrame.")
else:
    # One-hot encode 'Street' (first level dropped)
    df_onehot = pd.get_dummies(df, columns=['Street'], drop_first=True)

    # Names of the dummy columns created for 'Street'
    updated_categorical_features = [name for name in df_onehot.columns if 'Street' in name]

    # One row, two panels: column as stored on the left, encoded on the right
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    left_ax, right_ax = axes[0], axes[1]

    # Value counts of the 'Street' column as stored in the CSV
    street_counts = df['Street'].value_counts()
    street_counts.plot(kind='bar', ax=left_ax)
    left_ax.set_title('Original Street Feature')

    # Column sums of the dummy columns
    dummy_totals = df_onehot[updated_categorical_features].sum()
    dummy_totals.plot(kind='bar', ax=right_ax)
    right_ax.set_title('One-Hot Encoded Street Feature')

    plt.show()

# 2.5

In [None]:
# Fit an ordinary least squares baseline on a fixed set of features and report
# its error on the training data.

# LinearRegression is not part of the notebook's import cell (only Ridge is
# imported from sklearn.linear_model), so it is imported here.
from sklearn.linear_model import LinearRegression

# Loading your data
df = pd.read_csv('./data/pre-processed_training.csv')

# Features and target variable
selected_features = ['OverallQual', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars']
X_train = df[selected_features]
y_train = df['SalePrice']

# OLS Model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predictions and Evaluation (computed on the same data the model was fitted on)
y_pred = regressor.predict(X_train)
mse = mean_squared_error(y_train, y_pred)
r2 = r2_score(y_train, y_pred)

print(f'MSE: {mse}')
print(f'R2 Score: {r2}')

In [None]:
# Sanity check: look for NaNs in the target and in the latest predictions.
# The tensor is built from `y_train` here because `y_train_tensor` is only
# created in the following cell, so referencing it would fail when the
# notebook is run top to bottom.
y_train_check = torch.as_tensor(np.asarray(y_train, dtype=np.float32))
print("Any NaN in y_train:", torch.isnan(y_train_check).any())
print("Any NaN in y_pred:", np.isnan(y_pred).any())

In [None]:
# Turn the selected features and the target into float32 tensors; the target
# becomes a single-column 2-D tensor
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32).reshape(-1, 1)

# Define model
class LinearRegressionModel(nn.Module):
    """Linear regression as a single ``nn.Linear`` layer with one output.

    Args:
        in_features: Number of input features. When omitted, the width is
            taken from the notebook-level ``selected_features`` list, which
            must then be defined before the model is constructed.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the layer from the feature
            # list defined elsewhere in the notebook
            in_features = len(selected_features)
        self.linear = nn.Linear(in_features, 1)

    def forward(self, x):
        """Return the linear layer applied to ``x``."""
        return self.linear(x)

# Train the PyTorch linear model with full-batch SGD and evaluate it on the
# training data.
model = LinearRegressionModel()

# Loss and optimizer with different learning rate
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.005)

# To store loss values for plotting or analysis
loss_values = []

# Training with logging. The mode only needs to be set once, so it is done
# outside the loop.
model.train()
for epoch in range(10000):
    optimizer.zero_grad()
    y_pred = model(X_train_tensor)
    loss = criterion(y_pred, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Store loss
    current_loss = loss.item()
    loss_values.append(current_loss)

    # Print loss every 1000 epochs (the previous `epoch % 1` condition was
    # always true and printed on every one of the 10000 epochs)
    if epoch % 1000 == 0:
        print(f'Epoch {epoch}, Loss: {current_loss}')

# Evaluate
model.eval()
with torch.no_grad():
    # Under no_grad the output carries no graph, so it converts to NumPy directly
    y_pred = model(X_train_tensor).numpy()

mse = mean_squared_error(y_train, y_pred)
r2 = r2_score(y_train, y_pred)

print(f'MSE: {mse}')
print(f'R2 Score: {r2}')

# 2.7 Train the Model

In [None]:
# Train a Ridge model on the 20 best-scoring features, predict the test set
# and compare the distributions of actual and predicted sale prices.

# Load pre-processed data
training_df = pd.read_csv('./data/pre-processed_training.csv')
testing_df = pd.read_csv('./data/pre-processed_testing.csv')

# Initialize StandardScaler for features and target
scaler_features = StandardScaler()
scaler_target = StandardScaler()

# Separate features and labels for training data.
# NOTE: only 'SalePrice' is dropped, so 'Id' remains among the candidate
# features passed to the selector below.
X_train = training_df.drop(columns=['SalePrice'])
y_train = training_df['SalePrice']
X_test = testing_df  # Testing set doesn't have 'SalePrice'

# Scale features (fitted on train, applied to test)
X_train = scaler_features.fit_transform(X_train)
X_test = scaler_features.transform(X_test)

# Scale target ('SalePrice')
y_train = scaler_target.fit_transform(y_train.to_frame()).ravel()

# Feature selection
selector = SelectKBest(score_func=f_regression, k=20)  # Select top 20 features
X_train_new = selector.fit_transform(X_train, y_train)
X_test_new = selector.transform(X_test)

# Train the Ridge regression model
model = Ridge(alpha=1.0)  # You can tune alpha for better performance
model.fit(X_train_new, y_train)

# Make predictions
y_pred = model.predict(X_test_new)

# Create a DataFrame to hold the predicted values
predicted_df = pd.DataFrame(y_pred, columns=['Predicted_SalePrice'])

# Re-scale the predicted and actual values back to the original scale.
# 'SalePrice' in the pre-processed CSV was already standardized by the
# pre-processing cell, so undoing `scaler_target` alone only returns to that
# standardized scale. A StandardScaler fitted on the raw training prices has
# the same mean and standard deviation the pre-processing step used, so
# applying its inverse as a second step recovers the original price values.
raw_price_scaler = StandardScaler().fit(
    pd.read_csv('./data/train.csv')[['SalePrice']].to_numpy()
)
predicted_df['Predicted_SalePrice'] = raw_price_scaler.inverse_transform(
    scaler_target.inverse_transform(y_pred.reshape(-1, 1))
).ravel()
y_train_original_scale = raw_price_scaler.inverse_transform(
    scaler_target.inverse_transform(y_train.reshape(-1, 1))
)

# Plot histograms
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.hist(y_train_original_scale, bins=50, alpha=0.5, color='g', label='Actual SalePrice (Train)')
plt.title('Actual Sale Prices in Training Data')
plt.xlabel('SalePrice')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
plt.hist(predicted_df['Predicted_SalePrice'], bins=50, alpha=0.5, color='b', label='Predicted SalePrice')
plt.title('Predicted Sale Prices in Test Data')
plt.xlabel('SalePrice')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Read the raw train and test CSV files, in that order
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

# Stack the two frames with a fresh 0..n-1 index
combined_df = pd.concat([df_train, df_test], ignore_index=True)

# Write the stacked frame to disk without the index column
combined_df.to_csv('./data/combined.csv', index=False)