### Importing necessary libraries

In [21]:
import pandas as pd
import numpy as np

import datetime

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest, chi2

### Reading the cleaned data file

In [2]:
# Load the cleaned rental listings CSV.
rent_df = pd.read_csv('../Data/canada_rent_clean.csv')
# Preview the first rows (head() defaults to five).
rent_df.head()

Unnamed: 0,city,province,latitude,longitude,lease_term,type,price,beds,baths,sq_feet,furnishing,availability_date,smoking,cats,dogs
0,Airdrie,Alberta,51.305962,-114.012515,Long Term,Townhouse,2495,2,2.5,1403,Unfurnished,2025-02-22,Non-Smoking,True,True
1,Airdrie,Alberta,51.305962,-114.012515,Long Term,Townhouse,2695,3,2.5,1496,Unfurnished,2025-02-22,Non-Smoking,True,True
2,Airdrie,Alberta,51.305962,-114.012515,Long Term,Townhouse,2295,2,2.5,1180,Unfurnished,2025-02-22,Non-Smoking,True,True
3,Airdrie,Alberta,51.305962,-114.012515,Long Term,Townhouse,2095,2,2.5,1403,Unfurnished,2025-11-18,Non-Smoking,True,True
4,Airdrie,Alberta,51.305962,-114.012515,Long Term,Townhouse,2495,2,2.5,1403,Unfurnished,2025-02-22,Non-Smoking,True,True


### Removing unnecessary columns
---
As we have seen previously in Cleaning_Data.ipynb, we will not be working with the city name, since it would create a very large number of encoded columns. For simplicity, we will work with latitude and longitude instead.

So, I will be removing the city column at this point.

---

I will also be removing the availability_date column, since I don't believe that it affects the price.

In [3]:
# Columns left out of the analysis: location is represented by
# latitude/longitude instead of `city`, and `availability_date` is assumed
# not to affect the price.
unused_columns = ['availability_date', 'city']
rent_df = rent_df.drop(columns=unused_columns)

rent_df.head(2)

Unnamed: 0,province,latitude,longitude,lease_term,type,price,beds,baths,sq_feet,furnishing,smoking,cats,dogs
0,Alberta,51.305962,-114.012515,Long Term,Townhouse,2495,2,2.5,1403,Unfurnished,Non-Smoking,True,True
1,Alberta,51.305962,-114.012515,Long Term,Townhouse,2695,3,2.5,1496,Unfurnished,Non-Smoking,True,True


### Encoding nominal categorical columns

---
As we have seen previously in Data_Analysis.ipynb, there are five categorical columns. I believe that four of the five are nominal:
  * lease_term
  * type
  * furnishing
  * smoking

In [4]:
# Nominal (unordered) categorical columns to one-hot encode.
# Defined once so the encoder, the feature names and the drop stay in sync.
nominal_cols = ['lease_term', 'type', 'furnishing', 'smoking']

# Create instance of OneHotEncoder.
# drop='first' drops the first category of each column; sparse_output=False
# makes fit_transform return a dense array instead of a sparse matrix.
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Encode the categorical columns
encoded_cols = encoder.fit_transform(rent_df[nominal_cols])

# Put the encoded columns into a dataframe that shares rent_df's index.
# DataFrame.join aligns on the index, so without this the rows would only
# line up when rent_df happens to have a default 0..n-1 index.
tmp_df = pd.DataFrame(encoded_cols,
                      columns=encoder.get_feature_names_out(nominal_cols),
                      index=rent_df.index)

# Replace the original categorical columns with their encoded counterparts
rent_df = rent_df.drop(columns=nominal_cols).join(tmp_df)

rent_df.head()

Unnamed: 0,province,latitude,longitude,price,beds,baths,sq_feet,cats,dogs,lease_term_6 months,...,type_House,type_Loft,type_Main Floor,type_Mobile,type_Room For Rent,type_Townhouse,type_Vacation Home,furnishing_Negotiable,furnishing_Unfurnished,smoking_Smoking Allowed
0,Alberta,51.305962,-114.012515,2495,2,2.5,1403,True,True,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,Alberta,51.305962,-114.012515,2695,3,2.5,1496,True,True,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,Alberta,51.305962,-114.012515,2295,2,2.5,1180,True,True,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,Alberta,51.305962,-114.012515,2095,2,2.5,1403,True,True,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,Alberta,51.305962,-114.012515,2495,2,2.5,1403,True,True,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


### Encoding ordinal categorical columns

---
As we have seen previously in Data_Analysis.ipynb, there are five categorical columns. I believe that province can be treated as an ordinal categorical column, ranked by the number of ads each province has in the dataset.

1- Alberta
2- Ontario
3- Quebec
4- British Columbia  
     ... and so on

     

In [15]:
# Number of ads per province, most frequent first.
# size() counts rows, so the result does not depend on missing values in
# any other column (count() on `cats` would skip rows where `cats` is null).
count = rent_df.groupby('province').size().sort_values(ascending=False)

# Rank 1 is the province with the most ads, 2 the next one, and so on.
rank = {province: position
        for position, province in enumerate(count.index, start=1)}

# Replace the province name with its rank
rent_df["province_numeric"] = rent_df["province"].map(rank)

rent_df = rent_df.drop(columns='province')

rent_df.head(5)

### Splitting the dataset into training vs testing dataset

In [34]:
# SelectKBest is used with chi2 further down, which needs non-negative
# inputs, so longitude is stored as its absolute value.
# The data only covers Canada: per the original note, the dataset contains
# no positive longitudes.
rent_df['longitude'] = rent_df['longitude'].abs()

In [35]:
# Target is `price`; the features are every remaining column
y = rent_df['price']
X = rent_df.drop(columns='price')

# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

### Find the best independent features to predict with

In [31]:
# Number of candidate feature columns
X.shape[1]

26

In [37]:
# Score each feature against the target with chi2 and keep the 5 best.
# The selector is fitted on the training split only.
# NOTE(review): chi2 is documented for class labels; `price` looks
# continuous, so f_regression / mutual_info_regression may fit better - TODO confirm.
selector = SelectKBest(chi2, k=5).fit(X_train, y_train)
X_train_selected = selector.transform(X_train)

# Names of the columns the selector kept
selected_features = selector.get_feature_names_out(X.columns)

print('The top 5 best independent features are :', selected_features)

The top 5 best independent features are : ['longitude' 'sq_feet' 'lease_term_6 months' 'type_Basement'
 'type_Room For Rent']


In [38]:
# Score each feature against the target with chi2 and keep the 10 best.
# The selector is fitted on the training split only.
# NOTE(review): chi2 is documented for class labels; `price` looks
# continuous, so f_regression / mutual_info_regression may fit better - TODO confirm.
selector = SelectKBest(chi2, k=10).fit(X_train, y_train)
X_train_selected = selector.transform(X_train)

# Names of the columns the selector kept
selected_features = selector.get_feature_names_out(X.columns)

print('The top 10 best independent features are :', selected_features)

The top 10 best independent features are : ['longitude' 'beds' 'baths' 'sq_feet' 'lease_term_6 months'
 'lease_term_Short Term' 'type_Basement' 'type_House' 'type_Room For Rent'
 'province_numeric']


In [40]:
# Score each feature against the target with chi2 and keep the 15 best.
# The selector is fitted on the training split only.
# NOTE(review): chi2 is documented for class labels; `price` looks
# continuous, so f_regression / mutual_info_regression may fit better - TODO confirm.
selector = SelectKBest(chi2, k=15).fit(X_train, y_train)
X_train_selected = selector.transform(X_train)

# Names of the columns the selector kept
selected_features = selector.get_feature_names_out(X.columns)

print('The top 15 best independent features are :', selected_features)

The top 15 best independent features are : ['latitude' 'longitude' 'beds' 'baths' 'sq_feet' 'lease_term_6 months'
 'lease_term_Negotiable' 'lease_term_Short Term' 'type_Basement'
 'type_Condo Unit' 'type_House' 'type_Room For Rent' 'type_Townhouse'
 'smoking_Smoking Allowed' 'province_numeric']


In [41]:
# Score each feature against the target with chi2 and keep the 20 best.
# The selector is fitted on the training split only.
# NOTE(review): chi2 is documented for class labels; `price` looks
# continuous, so f_regression / mutual_info_regression may fit better - TODO confirm.
selector = SelectKBest(chi2, k=20).fit(X_train, y_train)
X_train_selected = selector.transform(X_train)

# Names of the columns the selector kept
selected_features = selector.get_feature_names_out(X.columns)

print('The top 20 best independent features are :', selected_features)

The top 20 best independent features are : ['latitude' 'longitude' 'beds' 'baths' 'sq_feet' 'cats' 'dogs'
 'lease_term_6 months' 'lease_term_Negotiable' 'lease_term_Short Term'
 'type_Basement' 'type_Condo Unit' 'type_House' 'type_Loft'
 'type_Main Floor' 'type_Room For Rent' 'type_Townhouse'
 'furnishing_Negotiable' 'smoking_Smoking Allowed' 'province_numeric']
