# This notebook builds an ML model to predict future housing prices.


In [4]:
# Pre-processing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load dataset
# NOTE(review): absolute, machine-specific path; this cell only runs where
# that exact location exists.
df = pd.read_csv("C:/Users/Admin/Desktop/Github/DAC-Internal-Projects/dataset/updated_dataset.csv")

# Handle missing values: drop every row that contains at least one NaN
df = df.dropna()

# Feature engineering: create an 'affordable' column (binary target).
# This must happen BEFORE 'resale_price' is standardized below, because the
# comparison is made against the raw price.
median_income = 5000  # Example median income
affordability_threshold = median_income * 0.3 * 12  # 30% of income over 12 months = 18000
# NOTE(review): this compares a flat's full resale price with one year of
# housing budget. If resale prices are mostly above 18000 the target will be
# almost entirely 0 -- confirm this is the intended definition.
df['affordable'] = (df['resale_price'] <= affordability_threshold).astype(int)

# Normalize numerical columns (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling of the training data.
numeric_columns = ['floor_area', 'remaining_lease', 'resale_price']
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Encode categorical variables.
# Each column gets its own LabelEncoder: re-fitting a single encoder on a
# second column overwrites its classes, after which the first column can no
# longer be decoded. `encoder` stays fitted on 'town' only, so
# encoder.inverse_transform(df['town']) maps codes back to town names.
encoder = LabelEncoder()
df['town'] = encoder.fit_transform(df['town'])
flat_type_encoder = LabelEncoder()
df['flat_type'] = flat_type_encoder.fit_transform(df['flat_type'])

# Split data into features and target.
# 'resale_price' is dropped from the features because 'affordable' is derived
# directly from it.
X = df.drop(['affordable', 'resale_price'], axis=1)  # Features
y = df['affordable']  # Target

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


ModuleNotFoundError: No module named 'sklearn'

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a random forest on the training split; the fixed seed keeps runs
# reproducible. fit() returns the estimator itself, so it can be chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the affordability label for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the true labels, then report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)


In [None]:
import matplotlib.pyplot as plt

# Importances from the fitted forest, ordered ascending so the largest bar
# ends up at the top of the horizontal chart.
feature_names = X.columns
importances = model.feature_importances_
sorted_indices = importances.argsort()
bar_positions = range(len(importances))

# Draw through the object-oriented matplotlib API on a single axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, importances[sorted_indices], align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(feature_names[sorted_indices])
ax.set_xlabel("Feature Importance")
ax.set_title("Random Forest Feature Importance")
plt.show()


In [None]:
import seaborn as sns

# Aggregate affordability by town.
# NOTE(review): `encoder` must be the LabelEncoder that was fitted on 'town'.
# If the same instance was re-fitted on another column afterwards, the
# decoding below returns wrong labels or raises ValueError.
df['town_name'] = encoder.inverse_transform(df['town'])  # Convert encoded towns back

# 'affordable' is 0/1, so its per-town mean is the affordable share as a
# fraction; scale by 100 so the value matches the "Percentage" column name
# and axis label.
affordability_summary = (
    df.groupby('town_name')['affordable']
    .mean()
    .mul(100)
    .rename('Affordability_Percentage')
    .rename_axis('Town')
    .reset_index()
)

# Plot bar chart: one bar per town
plt.figure(figsize=(12, 6))
sns.barplot(data=affordability_summary, x='Town', y='Affordability_Percentage', palette='viridis')
plt.xticks(rotation=45)
plt.title("Affordability Percentage by Town")
plt.ylabel("Percentage of Affordable Flats")
plt.show()


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Regression targets: the 'resale_price' values for the same rows as the
# existing train/test split, looked up by index label.
# NOTE(review): if 'resale_price' was standardized earlier in the notebook,
# the RMSE and scores below are in standard-deviation units, not currency.
price_train = df.loc[X_train.index, 'resale_price']
price_test = df.loc[X_test.index, 'resale_price']

# Train a Gradient Boosting Regressor
regressor = GradientBoostingRegressor(random_state=42)
regressor.fit(X_train, price_train)

# Predict resale prices
y_pred_price = regressor.predict(X_test)

# Evaluate model.
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form works on both old and new versions.
rmse = mean_squared_error(price_test, y_pred_price) ** 0.5
print("Root Mean Squared Error:", rmse)

# Calculate affordability score: actual minus predicted price (the residual).
# Negative means the flat sold for less than the model expected.
affordability_scores = price_test - y_pred_price
# Only test rows receive a score; all other rows of the new column are NaN.
df.loc[X_test.index, 'affordability_score'] = affordability_scores
