# Lab #5



### Instructions: 

- In this lab, we are going to have a competition. 
- Try to create the best model for predicting house prices in America.
- You can choose which columns to use, encoders to use, models to use etc.
- When you are done, submit your results using the form at the end of this notebook. (One submission per group is enough.)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, BaggingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, HuberRegressor, TweedieRegressor, SGDRegressor, LassoLars
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.svm import LinearSVR

## Exploratory Analysis

### Import the Dataset

In [None]:
# Load the housing data.
# 'Zip Code' is an identifier, not a quantity, so it is read as text straight
# from the file. Parsing it as a number first and converting afterwards would
# drop any leading zeros before the conversion to string could preserve them.
df = pd.read_csv("./data/American_Housing_Data.csv", dtype={'Zip Code': str})

# Keep the explicit cast so that missing zip codes (if any) still end up as
# plain strings, exactly as before.
df['Zip Code'] = df['Zip Code'].astype(str)

print(df.columns)
df

### Plot the relationships

- You can set `x_axis` to any other column to see its relationship with the price.

In [None]:
# Column to compare against the price; change it to explore other features.
x_axis = 'Beds'

# Scatter plot of the chosen column (horizontal) against 'Price' (vertical).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x=x_axis, y='Price', color='blue', alpha=0.6, ax=ax)
ax.set_title(f'Relationship between {x_axis} and Price')
ax.set_xlabel(f'{x_axis}')
ax.set_ylabel('Price')
plt.show()

## Model Training

#### Decide which columns to use for predicting the price:

('Zip Code', 'Beds', 'Baths', 'Living Space', 'Address', 'City', 'State', 'Zip Code Population', 'Zip Code Density', 'County', 'Median Household Income', 'Latitude', 'Longitude')

In [None]:
# Columns kept for modelling. 'Price' is the prediction target and must stay
# in the list; remove any feature column you do not want to use.
columns = [
    'Zip Code', 'Price', 'Beds', 'Baths', 'Living Space', 'Address', 'City',
    'State', 'Zip Code Population', 'Zip Code Density', 'County',
    'Median Household Income', 'Latitude', 'Longitude',
]

# Take an explicit copy: later cells assign scaled/encoded values back into
# df, and writing into a frame derived from a selection is what triggers
# pandas' SettingWithCopyWarning.
df = df[columns].copy()
df.dtypes

#### Decide whether to use a scaler for the numeric columns

In [None]:
# List the columns that hold numeric data (candidates for scaling).
numeric_column_names = df.select_dtypes(include=[np.number]).columns.tolist()
print('Numerical columns:', numeric_column_names)

In [None]:
scaler = StandardScaler()
# scaler = MinMaxScaler()

# Scale every numeric feature but leave the target ('Price') in its original
# units, so the reported errors remain in price units.
feature_columns = [
    name for name in df.select_dtypes(include=[np.number]).columns
    if name != 'Price'
]

# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed further down, so test-set statistics leak into the scaling.
# Rebuild the frame with df's own index so the assignment below lines up
# row-for-row even if df's index is not the default 0..n-1 range.
numeric_data = pd.DataFrame(
    scaler.fit_transform(df[feature_columns]),
    columns=feature_columns,
    index=df.index,
)

# update original df with the scaled data (delete if you don't want to use scaler)
df[feature_columns] = numeric_data
df

#### Decide whether to use an encoder for the categorical columns

In [None]:
# List the columns stored as Python objects (text); these need encoding
# before they can be fed to a model.
categorical_column_names = df.select_dtypes(include=[np.object_]).columns.tolist()
print('Categorical columns:', categorical_column_names)

In [None]:
# Map every distinct value of each text column to a number.
encoder = OrdinalEncoder()

categorical_data = df.select_dtypes(include=[np.object_])
encoded_data = encoder.fit_transform(categorical_data)

# Carry over the original index: the assignment below aligns on row labels,
# so a freshly numbered 0..n-1 index would mismatch (and produce NaN) whenever
# df's index is not the default range.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=categorical_data.columns,
    index=categorical_data.index,
)

# update original df with the encoded data (comment out if you don't want to use encoder)
df[categorical_data.columns] = encoded_df
df

In [None]:
# Features: everything except the target, minus any column that is still
# text (i.e. was not encoded above).
unencoded_columns = df.select_dtypes(include=[np.object_]).columns
X = df.drop(columns=['Price']).drop(columns=unencoded_columns)

# Target: the house price.
y = df['Price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

- Uncomment the model you want to use (you can play with the parameters too).
- You can find details about the models at https://scikit-learn.org/stable/supervised_learning.html#supervised-learning

In [None]:
# ------------------------------------------------------------ #
# Model selection: leave exactly one `model = ...` line active.
# ------------------------------------------------------------ #

#### Linear Models

model = LinearRegression()
# model = Ridge(alpha=0.5)
# model = Lasso(alpha=0.1)
# model = LassoLars(alpha=0.1)
# model = ElasticNet(alpha=0.1, l1_ratio=0.5)
# model = BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06)
# model = HuberRegressor(epsilon=1.35, alpha=0.0001)
# model = TweedieRegressor(alpha=1.0, power=0.0)

#### Ensemble models

# model = RandomForestRegressor(n_estimators=5)
# LinearSVR lives in sklearn.svm and must be imported for the next option to work.
# model = BaggingRegressor(estimator=LinearSVR(), n_estimators=5)
# model = VotingRegressor(estimators=[('m1', BayesianRidge()), ('m2', Ridge()), ('m3', ElasticNet())])


#### Multi-layer Perceptron

# model = MLPRegressor(hidden_layer_sizes=(32, 16, 8), max_iter=200, verbose=True, learning_rate='adaptive', learning_rate_init=0.01)

# ------------------------------------------------------------ #
# ------------------------------------------------------------ #

#### Training and evaluation
print('Model:', model)
print('Training started. Please wait...')

# Fit on the training split, then predict the held-out test split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Mean absolute error: average size of the prediction error, in the same
# units as 'Price'. R2: coefficient of determination as computed by
# sklearn.metrics.r2_score.
mean_abs_error = np.mean(np.abs(y_test - y_pred))
print(f'Mean Absolute Error: {mean_abs_error}')
print(f'Score: {r2_score(y_test, y_pred)}')

In [None]:
# Summary metrics for the current model on the test split.
scores = {
    "Mean Absolute Error": np.mean(np.abs(y_test - y_pred)),
    "R2 Score": r2_score(y_test, y_pred),
}
print(scores)

# Both axes are logarithmic, and a log axis has no position for 0 or for
# negative numbers. Anchor the "perfect prediction" line at the smallest
# positive value instead of 0 so both of its end points are drawable.
plotted_values = np.concatenate([
    np.asarray(y_test, dtype=float).ravel(),
    np.asarray(y_pred, dtype=float).ravel(),
])
positive_values = plotted_values[plotted_values > 0]

plt.figure(figsize=(8, 8))
# NOTE: non-positive predictions cannot be shown at their true position on log axes.
plt.scatter(y_test, y_pred, color='blue', alpha=0.5)
if positive_values.size:
    line_start, line_end = positive_values.min(), positive_values.max()
    plt.plot([line_start, line_end], [line_start, line_end], color='red', linewidth=2, linestyle='--')
plt.text(0.05, 0.9, 'Ideally, all points should be on the red line', fontsize=12, transform=plt.gca().transAxes)
plt.text(0.05, 0.8, '\n'.join([f'{k}: {v:.3f}' for k, v in scores.items()]), fontsize=10, transform=plt.gca().transAxes)
plt.title('Predicted vs Actual Price')
plt.ylabel('Predicted Price')
plt.xlabel('Actual Price')
plt.xscale('log')
plt.yscale('log')
plt.show()

## Submit your model and score:

https://forms.gle/QA2vu8n6YyrZjYiS8