In [2]:
# Import libraries, load the listings, and prepare scaled train/test splits.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the listings and one-hot encode 'propertyType' in one step.
# drop_first=True leaves out the first category, which acts as the baseline.
df = pd.get_dummies(
    pd.read_csv('zoopla property listings.csv'),
    columns=['propertyType'],
    drop_first=True,
)

# Feature matrix: the four numeric columns followed by every dummy column.
X = df[
    ['bathrooms', 'bedrooms', 'livingRooms', 'floorAreaSqM']
    + [column for column in df.columns if column.startswith('propertyType_')]
]
# Target column.
y = df['price']

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply them
# to both splits so nothing from the test rows influences the scaler.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [3]:
# Evaluating different models and their performance.

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Candidate regressors, keyed by the label printed in the report below.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, random_state=42
    ),
}

# Scoring functions, keyed by the metric name stored for each model.
scorers = {
    'MAE': mean_absolute_error,
    'MSE': mean_squared_error,
    'R2': r2_score,
}

# Fit every candidate on the training split and score it on the held-out split.
evaluation_metrics = {}
for name, model in models.items():
    predictions = model.fit(X_train, y_train).predict(X_test)
    evaluation_metrics[name] = {
        metric: score(y_test, predictions) for metric, score in scorers.items()
    }

# Report: one block per model, followed by two blank lines.
for name, metrics in evaluation_metrics.items():
    print(
        f"Model: {name}",
        f"MAE: {metrics['MAE']:.2f}",
        f"MSE: {metrics['MSE']:.2f}",
        f"R²: {metrics['R2']:.2f}",
        "\n",
        sep="\n",
    )


Model: Linear Regression
MAE: 246160.69
MSE: 168791141101.68
R²: 0.45


Model: Decision Tree
MAE: 219970.75
MSE: 125710964653.74
R²: 0.59


Model: Random Forest
MAE: 200624.32
MSE: 92935383119.49
R²: 0.70


Model: Gradient Boosting
MAE: 198877.03
MSE: 93956098059.59
R²: 0.70




In [4]:
# Predicting property prices with Random Forest model

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import joblib  # for saving the model

# This cell rebuilds the data from the CSV itself instead of reusing
# variables left behind by earlier cells.

# Read the listings and one-hot encode 'propertyType' (first category dropped).
df = pd.read_csv('zoopla property listings.csv').pipe(
    pd.get_dummies, columns=['propertyType'], drop_first=True
)

# Features: four numeric columns plus all property-type dummies; target: price.
X = df[
    ['bathrooms', 'bedrooms', 'livingRooms', 'floorAreaSqM']
    + [column for column in df.columns if column.startswith('propertyType_')]
]
y = df['price']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale with statistics learned from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit the Random Forest on the scaled training data.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Persist the model and the scaler it was trained with; both are needed
# to score new rows later.
joblib.dump(model, 'random_forest_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler

# Restore the persisted Random Forest and its scaler from disk.
# joblib.load unpickles Python objects, so only load files you trust.
model, scaler = (
    joblib.load(path) for path in ('random_forest_model.pkl', 'scaler.pkl')
)

In [6]:
# One example listing to score (replace with actual new data).
# Column names and order follow the feature matrix used for training.
new_data = pd.DataFrame([{
    'bathrooms': 2,
    'bedrooms': 5,
    'livingRooms': 1,
    'floorAreaSqM': 150,
    'propertyType_Detached': 1,
    'propertyType_Flat': 0,
    'propertyType_Semi-Detached': 0,
    'propertyType_Terraced': 0,
}])

# Apply the already-fitted scaler (transform only, no refit), then predict.
new_data_scaled = scaler.transform(new_data)
predicted_price = model.predict(new_data_scaled)

print(f"Predicted property price: {predicted_price[0]:.2f}")

Predicted property price: 818272.67


In [7]:
# OLS model for the website.
# Random Forest model could not be integrated with the website due to technical issues.

from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the listings; `data` keeps the raw frame, `df` holds the encoded copy.
data = pd.read_csv('zoopla property listings.csv')
df = pd.get_dummies(data, columns=['propertyType'], drop_first=True)

# Features: four numeric columns plus all property-type dummies; target: price.
X = df[
    ['bathrooms', 'bedrooms', 'livingRooms', 'floorAreaSqM']
    + [column for column in df.columns if column.startswith('propertyType_')]
]
y = df['price']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# No feature scaling is applied in this cell: the regression is fitted on the
# raw feature values, so each coefficient is the change in predicted price
# per one-unit change in that feature.
model = LinearRegression().fit(X_train, y_train)

# Persist the fitted model for later use.
import joblib
joblib.dump(model, 'linear_regression_model.pkl')

# Coefficient table: the intercept first, then one row per feature column.
coefficients = pd.DataFrame({
    'Feature': ['Intercept', *X.columns],
    'Coefficient': [model.intercept_, *model.coef_.tolist()],
})

print(coefficients.round(2))

                      Feature  Coefficient
0                   Intercept   -102715.45
1                   bathrooms     32632.87
2                    bedrooms    -77382.64
3                 livingRooms   -159327.20
4                floorAreaSqM     11885.96
5       propertyType_Detached   -264020.45
6           propertyType_Flat     11580.90
7  propertyType_Semi-Detached   -237908.43
8       propertyType_Terraced   -116146.74
