<h3 style="color:green">Kigali REAL ESTATE Price Prediction (REAL TIME AI APPLICATION) [END-TO-END]</h3>

<h1 style="color:blue">DATA CLEANING</h1>

In [85]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='pandas')

# Load dataset
df1 = pd.read_csv("dataset/kigali_real_estate_prices.csv")
df1.shape  # Check the size of the dataset
df1.head()  # Display the first few rows




Unnamed: 0,Location,Property_Type,Size_sqm,Number_of_Bedrooms,Number_of_Bathrooms,Number_of_Floors,Year_Built,Parking_Space,Furnished,Proximity_to_City_Center_km,Proximity_to_Schools_km,Security_Features,Price_RWF
0,Kibagabaga,Bungalow,170,1,2,1,2018,1,1,6.0534,1.211984,1,3545686
1,Kacyiru,Bungalow,190,2,3,1,2003,0,0,8.04886,2.283878,1,5191761
2,Kanombe,Apartment,79,4,3,1,2011,0,0,2.917929,0.416259,1,3295061
3,Remera,Villa,174,3,3,1,1989,1,0,4.955305,3.478307,1,3488814
4,Kibagabaga,House,240,3,2,3,1990,0,0,8.438079,1.543986,1,6133847


In [86]:
len(df1)

20000

In [87]:
df1.shape

(20000, 13)

In [88]:
# remove record less than  205
# Size_sqm = "Size_sqm"
# df1 = df1[df1[Size_sqm] >= 205]

# df1.head()

In [89]:
# Check for missing values
df1.isnull().sum()



Location                       0
Property_Type                  0
Size_sqm                       0
Number_of_Bedrooms             0
Number_of_Bathrooms            0
Number_of_Floors               0
Year_Built                     0
Parking_Space                  0
Furnished                      0
Proximity_to_City_Center_km    0
Proximity_to_Schools_km        0
Security_Features              0
Price_RWF                      0
dtype: int64

In [90]:
# Remove leading/trailing whitespace in 'Location'
df1['Location'] = df1['Location'].apply(lambda x: x.strip())

# Check unique locations and property types
print(df1['Location'].unique())
print(df1['Property_Type'].unique())


['Kibagabaga' 'Kacyiru' 'Kanombe' 'Remera' 'Nyamirambo' 'Gacuriro'
 'Kimironko' 'Nyarutarama' 'Kicukiro']
['Bungalow' 'Apartment' 'Villa' 'House']


In [91]:

# Handle locations that occur fewer than 10 times
location_stats = df1.groupby('Location')['Location'].agg('count').sort_values(ascending=False)
location_stats_less_than_10 = location_stats[location_stats < 10]
df1['Location'] = df1['Location'].apply(lambda x: 'other' if x in location_stats_less_than_10 else x)

# Handle records with Year_Built < 1990
df1 = df1[~(df1.Year_Built < 1990)]

# Create a new column for price per square meter
df1['price_per_sqm'] = df1['Price_RWF'] / df1['Size_sqm']

<h1 style="color:blue">Outlier Removal</h1>

In [92]:
def remove_location_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('Location'):
        mean = np.mean(subdf.price_per_sqm)
        std_dev = np.std(subdf.price_per_sqm)
        reduced_df = subdf[(subdf.price_per_sqm > (mean - std_dev)) & (subdf.price_per_sqm <= (mean + std_dev))]
        df_out = pd.concat([df_out, reduced_df], ignore_index=True)
    return df_out

df_cleaned = remove_location_outliers(df1)
df_cleaned


Unnamed: 0,Location,Property_Type,Size_sqm,Number_of_Bedrooms,Number_of_Bathrooms,Number_of_Floors,Year_Built,Parking_Space,Furnished,Proximity_to_City_Center_km,Proximity_to_Schools_km,Security_Features,Price_RWF,price_per_sqm
0,Gacuriro,Villa,149,4,2,3,2015,1,1,2.929362,1.466844,1,4019396,26975.812081
1,Gacuriro,Bungalow,214,2,2,3,2009,1,1,0.969455,1.283705,1,5488265,25646.098131
2,Gacuriro,Bungalow,135,3,2,1,2014,0,0,5.106890,1.952470,1,6448699,47768.140741
3,Gacuriro,Bungalow,109,5,1,1,1997,1,1,4.830527,0.000000,0,2097434,19242.513761
4,Gacuriro,Bungalow,98,5,2,2,2002,1,0,3.123421,0.682185,1,3088646,31516.795918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11834,Remera,Villa,193,3,3,3,1992,1,1,2.753904,2.679755,1,4774224,24736.911917
11835,Remera,Bungalow,191,5,1,2,1998,1,0,10.476284,0.938703,1,5345439,27986.591623
11836,Remera,House,167,2,3,3,1996,0,0,1.702134,1.116884,1,5994209,35893.467066
11837,Remera,House,221,3,3,2,2010,1,0,5.758835,2.229548,1,4165359,18847.778281


<h1 style="color:blue"> Feature Encoding for Location and Property_Type </h1>

In [93]:
# One-hot encoding for Location
location_dummies = pd.get_dummies(df_cleaned['Location'], drop_first=False)
# One-hot encoding for Property_Type
property_dummies = pd.get_dummies(df_cleaned['Property_Type'], drop_first=False)

# Concatenate the encoded columns with the original dataframe
df_encoded = pd.concat([df_cleaned, location_dummies, property_dummies], axis=1)

# Drop the original 'Location' and 'Property_Type' columns
df_final = df_encoded.drop(['Location', 'Property_Type'], axis=1)


In [94]:
location_dummies

Unnamed: 0,Gacuriro,Kacyiru,Kanombe,Kibagabaga,Kicukiro,Kimironko,Nyamirambo,Nyarutarama,Remera
0,True,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
11834,False,False,False,False,False,False,False,False,True
11835,False,False,False,False,False,False,False,False,True
11836,False,False,False,False,False,False,False,False,True
11837,False,False,False,False,False,False,False,False,True


In [95]:
property_dummies

Unnamed: 0,Apartment,Bungalow,House,Villa
0,False,False,False,True
1,False,True,False,False
2,False,True,False,False
3,False,True,False,False
4,False,True,False,False
...,...,...,...,...
11834,False,False,False,True
11835,False,True,False,False
11836,False,False,True,False
11837,False,False,True,False


In [96]:
df_encoded

Unnamed: 0,Location,Property_Type,Size_sqm,Number_of_Bedrooms,Number_of_Bathrooms,Number_of_Floors,Year_Built,Parking_Space,Furnished,Proximity_to_City_Center_km,...,Kibagabaga,Kicukiro,Kimironko,Nyamirambo,Nyarutarama,Remera,Apartment,Bungalow,House,Villa
0,Gacuriro,Villa,149,4,2,3,2015,1,1,2.929362,...,False,False,False,False,False,False,False,False,False,True
1,Gacuriro,Bungalow,214,2,2,3,2009,1,1,0.969455,...,False,False,False,False,False,False,False,True,False,False
2,Gacuriro,Bungalow,135,3,2,1,2014,0,0,5.106890,...,False,False,False,False,False,False,False,True,False,False
3,Gacuriro,Bungalow,109,5,1,1,1997,1,1,4.830527,...,False,False,False,False,False,False,False,True,False,False
4,Gacuriro,Bungalow,98,5,2,2,2002,1,0,3.123421,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11834,Remera,Villa,193,3,3,3,1992,1,1,2.753904,...,False,False,False,False,False,True,False,False,False,True
11835,Remera,Bungalow,191,5,1,2,1998,1,0,10.476284,...,False,False,False,False,False,True,False,True,False,False
11836,Remera,House,167,2,3,3,1996,0,0,1.702134,...,False,False,False,False,False,True,False,False,True,False
11837,Remera,House,221,3,3,2,2010,1,0,5.758835,...,False,False,False,False,False,True,False,False,True,False


In [97]:
df_final

Unnamed: 0,Size_sqm,Number_of_Bedrooms,Number_of_Bathrooms,Number_of_Floors,Year_Built,Parking_Space,Furnished,Proximity_to_City_Center_km,Proximity_to_Schools_km,Security_Features,...,Kibagabaga,Kicukiro,Kimironko,Nyamirambo,Nyarutarama,Remera,Apartment,Bungalow,House,Villa
0,149,4,2,3,2015,1,1,2.929362,1.466844,1,...,False,False,False,False,False,False,False,False,False,True
1,214,2,2,3,2009,1,1,0.969455,1.283705,1,...,False,False,False,False,False,False,False,True,False,False
2,135,3,2,1,2014,0,0,5.106890,1.952470,1,...,False,False,False,False,False,False,False,True,False,False
3,109,5,1,1,1997,1,1,4.830527,0.000000,0,...,False,False,False,False,False,False,False,True,False,False
4,98,5,2,2,2002,1,0,3.123421,0.682185,1,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11834,193,3,3,3,1992,1,1,2.753904,2.679755,1,...,False,False,False,False,False,True,False,False,False,True
11835,191,5,1,2,1998,1,0,10.476284,0.938703,1,...,False,False,False,False,False,True,False,True,False,False
11836,167,2,3,3,1996,0,0,1.702134,1.116884,1,...,False,False,False,False,False,True,False,False,True,False
11837,221,3,3,2,2010,1,0,5.758835,2.229548,1,...,False,False,False,False,False,True,False,False,True,False


<h1 style="color: blue">Prepare Features and Target Variables</h1>

In [98]:
# Dependent Variables
X = df_final.drop('Price_RWF', axis=1)
# Independent Variables
y = df_final['Price_RWF']

# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [99]:
X_train.shape

(9471, 24)

<h1 style="color:blue">Train a Model</h1>

In [100]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Initialize and train the linear regression model
lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)

# Evaluate model performance
print("Test score:", lr_clf.score(X_test, y_test))

# Cross-validation for a more robust evaluation
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
cross_val_score(LinearRegression(), X, y, cv=cv)


Test score: 0.9354947800665687


array([0.93549478, 0.93476525, 0.93685399, 0.9385929 , 0.93545054])

<h1 style="color:green">Make Predictions Based on Location and Property_Type</h1>

In [103]:
def predict_price(Location, Property_Type, Size_sqm, Number_of_Bedrooms, Number_of_Bathrooms, Number_of_Floors, Parking_Space):
    # Create an empty array for feature inputs
    x = np.zeros(len(X.columns))
    
    # Set the corresponding feature values
    x[0] = Size_sqm
    x[1] = Number_of_Bedrooms
    x[2] = Number_of_Bathrooms
    x[3] = Number_of_Floors
    x[4] = Parking_Space
    
    # Set the location feature (assuming the location names are columns in X)
    if Location in X.columns:
        loc_index = np.where(X.columns == Location)[0][0]
        x[loc_index] = 1
    
    # Set the property type feature
    if Property_Type in X.columns:
        prop_index = np.where(X.columns == Property_Type)[0][0]
        x[prop_index] = 1
    
    # Return the predicted price
    return lr_clf.predict([x])[0]

# Example predictions
print(predict_price('NYARUGENGE', 'NYAMIRAMBO', 1000, 2, 1, 1, 0))
print(predict_price('Nyamirambo', 'House', 256, 3, 1, 3, 1))



20243698.798387576
1350014.7678016676




<h1 style="color:green">Saving the Model for Future Use</h1>

In [102]:
import pickle
# Save the trained model
with open('model_corrected/kigali_model.pickle', 'wb') as f:
    pickle.dump(lr_clf, f)

# Save the columns of the dataset for UI integration
import json
columns = {'data_columns': [col.lower() for col in X.columns]}
with open("json_corrected/columns.json", "w") as f:
    f.write(json.dumps(columns))
