In [102]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the CSV from disk so the cleaning steps below start from an
# unmodified copy rather than from a frame altered by earlier runs.
df = pd.read_csv("new_data.csv")



In [104]:
# Count rows that exactly repeat an earlier row.
n_duplicate_rows = df.duplicated().sum()
print("Duplicates in df:", n_duplicate_rows)


Duplicates in df: 24


In [106]:
# Drop exact duplicate rows from df, keeping the first occurrence of each.
df = df.drop_duplicates()

In [108]:
# Re-count after de-duplication as a sanity check.
remaining_duplicates = df.duplicated().sum()
print("Duplicates in df:", remaining_duplicates)

Duplicates in df: 0


In [110]:
# Show column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 196 entries, 0 to 219
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   price      196 non-null    object
 1   address    196 non-null    object
 2   city       196 non-null    object
 3   bedrooms   196 non-null    object
 4   bathrooms  196 non-null    object
 5   sqft       196 non-null    object
dtypes: object(6)
memory usage: 10.7+ KB


In [112]:
# Discard every row that has at least one missing value, then preview the result.
df = df.dropna(axis=0, how="any")
df.head(5)

Unnamed: 0,price,address,city,bedrooms,bathrooms,sqft
0,"$229,900",339 Rupert St,"Thunder Bay, ON",3,2,759
1,"$319,000",703 Ruskin Cres,"Thunder Bay, ON",3,2,900
2,"$139,900",1319 East Donald St E,"Thunder Bay, ON",2,1,650
3,"$299,800",104 Ray Boulevard,"Thunder Bay, ON",4,2,1244
4,"$749,900",199 Valley St,"Thunder Bay, ON",5,4,2502


In [114]:
# Per-column tally of missing values.
null_counts = df.isna().sum()
print(null_counts)

price        0
address      0
city         0
bedrooms     0
bathrooms    0
sqft         0
dtype: int64


In [116]:
# Turn price strings such as "$229,900" into numbers.
# - astype(str) keeps the cell re-runnable: once the column is numeric,
#   the `.str` accessor would otherwise raise.
# - The pattern keeps the decimal point, so "$229,900.00" parses as
#   229900.0 instead of 22990000.
# - strip(".") removes stray leading/trailing periods left behind by
#   abbreviations in the original text.
price_text = (
    df["price"]
    .astype(str)
    .str.replace(r"[^\d.]", "", regex=True)
    .str.strip(".")
)
df["price"] = pd.to_numeric(price_text, errors="coerce")  # unparseable values become NaN
print(df["price"].head(10))

0    229900
1    319000
2    139900
3    299800
4    749900
5    499900
6    389000
7    539000
8    169900
9    519900
Name: price, dtype: int64


In [118]:
# Turn square-footage strings into numbers.
# - astype(str) keeps the cell re-runnable: after the first run the column
#   is float64 and the `.str` accessor would otherwise raise.
# - The pattern keeps the decimal point, so "759.0" stays 759.0 instead of
#   collapsing to 7590.
# - strip(".") removes stray leading/trailing periods left behind by
#   abbreviations in the original text.
sqft_text = (
    df["sqft"]
    .astype(str)
    .str.replace(r"[^\d.]", "", regex=True)
    .str.strip(".")
)
df["sqft"] = pd.to_numeric(sqft_text, errors="coerce")  # values with no usable number become NaN
print(df["sqft"].head(10))

0     759.0
1     900.0
2     650.0
3    1244.0
4    2502.0
5    1110.0
6    1100.0
7    1422.0
8    1600.0
9    1663.0
Name: sqft, dtype: float64


In [120]:
# Re-check dtypes and non-null counts after converting price and sqft.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 196 entries, 0 to 219
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   price      196 non-null    int64  
 1   address    196 non-null    object 
 2   city       196 non-null    object 
 3   bedrooms   196 non-null    object 
 4   bathrooms  196 non-null    object 
 5   sqft       161 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 10.7+ KB


In [122]:
# Rows whose sqft could not be parsed hold NaN; remove them and re-inspect.
required_columns = ["sqft"]
df = df.dropna(subset=required_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 161 entries, 0 to 219
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   price      161 non-null    int64  
 1   address    161 non-null    object 
 2   city       161 non-null    object 
 3   bedrooms   161 non-null    object 
 4   bathrooms  161 non-null    object 
 5   sqft       161 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 8.8+ KB


In [124]:
# Coerce the room-count columns to numeric; anything unparseable becomes NaN.
for column_name in ("bedrooms", "bathrooms"):
    df[column_name] = pd.to_numeric(df[column_name], errors="coerce")

# Confirm the resulting data types
column_dtypes = df.dtypes
print(column_dtypes)

price          int64
address       object
city          object
bedrooms     float64
bathrooms    float64
sqft         float64
dtype: object


In [126]:
# List every distinct city label present in the data.
city_labels = df['city'].unique()
print(city_labels)

['Thunder Bay, ON' 'Geraldton, ON' 'Kaministiquia, ON' 'Marathon, ON'
 'Beardmore, ON' 'Red Rock, ON' 'Atikokan, ON' 'Manitouwadge, ON'
 'Shuniah, ON' 'Murillo, ON' 'Rosslyn, ON' 'Hurkett, ON'
 'Oliver/Paipoonge, ON' 'Kakabeka Falls, ON' 'Longlac, ON' 'Nipigon, ON'
 'Rossport, ON' 'Dorion, ON' 'Nakina, ON' 'Thunder Bay' 'Manitouwadge'
 'Kaministiquia' 'Marathon' 'Red Rock' 'Obonga Lake' 'Geraldton' 'Ignace'
 'Terrace Bay' 'Nipigon' 'Longlac' 'Rossport']


In [128]:
# Keep only rows whose city is one of the two Thunder Bay spellings.
thunder_bay_labels = ['Thunder Bay, ON', 'Thunder Bay']
is_thunder_bay = df['city'].isin(thunder_bay_labels)
df = df[is_thunder_bay]

In [130]:
# Confirm which city labels survive the filter.
kept_city_labels = df['city'].unique()
print(kept_city_labels)

['Thunder Bay, ON' 'Thunder Bay']


In [132]:
# Preview up to the first 20 rows of the current frame.
df.head(20)

Unnamed: 0,price,address,city,bedrooms,bathrooms,sqft
0,229900,339 Rupert St,"Thunder Bay, ON",3.0,2.0,759.0
1,319000,703 Ruskin Cres,"Thunder Bay, ON",3.0,2.0,900.0
2,139900,1319 East Donald St E,"Thunder Bay, ON",2.0,1.0,650.0
3,299800,104 Ray Boulevard,"Thunder Bay, ON",4.0,2.0,1244.0
4,749900,199 Valley St,"Thunder Bay, ON",5.0,4.0,2502.0
5,499900,119 Skyline Ave,"Thunder Bay, ON",4.0,2.0,1110.0
6,389000,3 270 Caribou Cres (Open House),"Thunder Bay, ON",2.0,2.0,1100.0
7,539000,192 Humber Cres,"Thunder Bay, ON",5.0,3.0,1422.0
10,319900,387 Morse St,"Thunder Bay, ON",3.0,2.0,886.0
13,349900,1508 Walsh St E,"Thunder Bay, ON",3.0,1.0,1130.0


In [134]:
# Remove the 'city' column from the frame.
df = df.drop('city', axis='columns')

In [136]:
# Preview up to the first 20 rows now that 'city' has been dropped.
df.head(20)

Unnamed: 0,price,address,bedrooms,bathrooms,sqft
0,229900,339 Rupert St,3.0,2.0,759.0
1,319000,703 Ruskin Cres,3.0,2.0,900.0
2,139900,1319 East Donald St E,2.0,1.0,650.0
3,299800,104 Ray Boulevard,4.0,2.0,1244.0
4,749900,199 Valley St,5.0,4.0,2502.0
5,499900,119 Skyline Ave,4.0,2.0,1110.0
6,389000,3 270 Caribou Cres (Open House),2.0,2.0,1100.0
7,539000,192 Humber Cres,5.0,3.0,1422.0
10,319900,387 Morse St,3.0,2.0,886.0
13,349900,1508 Walsh St E,3.0,1.0,1130.0


In [138]:
# Remove rows lacking a bedroom or bathroom count, then preview up to 40 rows.
room_count_columns = ['bedrooms', 'bathrooms']
df = df.dropna(subset=room_count_columns)
df.head(40)

Unnamed: 0,price,address,bedrooms,bathrooms,sqft
0,229900,339 Rupert St,3.0,2.0,759.0
1,319000,703 Ruskin Cres,3.0,2.0,900.0
2,139900,1319 East Donald St E,2.0,1.0,650.0
3,299800,104 Ray Boulevard,4.0,2.0,1244.0
4,749900,199 Valley St,5.0,4.0,2502.0
5,499900,119 Skyline Ave,4.0,2.0,1110.0
6,389000,3 270 Caribou Cres (Open House),2.0,2.0,1100.0
7,539000,192 Humber Cres,5.0,3.0,1422.0
10,319900,387 Morse St,3.0,2.0,886.0
13,349900,1508 Walsh St E,3.0,1.0,1130.0


In [140]:
# List the distinct price values that remain.
distinct_prices = df['price'].unique()
print(distinct_prices)

[ 229900  319000  139900  299800  749900  499900  389000  539000  319900
  349900  429900  299900  209900  199900  239900  599900  289000  369900
  669900  275000  249900  549900  259900  269900  355000  449900  279900
  399900  259000  255000  849100  384000  339900  819900  699900  675000
  264900 1199900  360000  479900 1395000  439900  469900  325000 1250000
  669000  375000  314900]


In [142]:
!pip install geopy



In [144]:
# Literal lookup from an address string (matched exactly) to an intersection
# label; five distinct labels are used across the entries.
# NOTE(review): from "115 200 Oasis Ln" onward the labels repeat in a fixed
# five-step cycle, which looks like placeholder assignment rather than a
# geographic lookup — confirm before relying on 'intersection' as a feature.
intersection_mapping = {
    "339 Rupert St": "Red River Road & Junot Avenue",
    "703 Ruskin Cres": "Arthur Street & Edward Street",
    "1319 East Donald St E": "Memorial Avenue & John Street",
    "104 Ray Boulevard": "Red River Road & Junot Avenue",
    "199 Valley St": "Balmoral Street & Harbour Expressway",
    "119 Skyline Ave": "Fort William Road & Central Avenue",
    "3 270 Caribou Cres (Open House)": "Arthur Street & Edward Street",
    "192 Humber Cres": "Red River Road & Junot Avenue",
    "387 Morse St": "Balmoral Street & Harbour Expressway",
    "1508 Walsh St E": "Memorial Avenue & John Street",
    "115 200 Oasis Ln": "Red River Road & Junot Avenue",
    "1730 Rankin St": "Fort William Road & Central Avenue",
    "420 725 James Street S": "Arthur Street & Edward Street",
    "315 Ambrose St": "Balmoral Street & Harbour Expressway",
    "136 Christina St E": "Memorial Avenue & John Street",
    "1144 Crawford Ave": "Red River Road & Junot Avenue",
    "2505 Rosslyn Rd": "Fort William Road & Central Avenue",
    "332 Ogden": "Arthur Street & Edward Street",
    "1414 Ridgeway St E": "Balmoral Street & Harbour Expressway",
    "55 Secord St": "Memorial Avenue & John Street",
    "424 Gemstone Dr": "Red River Road & Junot Avenue",
    "308 1703 Victoria Ave E": "Fort William Road & Central Avenue",
    "48 Surprise Lake Rd W": "Arthur Street & Edward Street",
    "548 Red River Rd": "Balmoral Street & Harbour Expressway",
    "34 Simcoe St": "Memorial Avenue & John Street",
    "159 Inglewood Cres": "Red River Road & Junot Avenue",
    "276 Valley St": "Fort William Road & Central Avenue",
    "910 Alberta St": "Arthur Street & Edward Street",
    "544 Cumberland St N.": "Balmoral Street & Harbour Expressway",
    "425 Wiley St": "Memorial Avenue & John Street",
    "149 Bentwood Dr": "Red River Road & Junot Avenue",
    "259 Carl Ave": "Fort William Road & Central Avenue",
    "109 147 Fanshaw St": "Arthur Street & Edward Street",
    "232 Egan St": "Balmoral Street & Harbour Expressway",
    "89 Hill St": "Memorial Avenue & John Street",
    "20 Hanley Street": "Red River Road & Junot Avenue",
    "67 Regent St": "Fort William Road & Central Avenue",
    "934 Alexandra St": "Arthur Street & Edward Street",
    "254 McIntyre St": "Balmoral Street & Harbour Expressway",
    "152 Cougar Cres": "Memorial Avenue & John Street",
    "295 Frederica St E": "Red River Road & Junot Avenue",
    "2101 2260 Sleeping Giant Pkwy": "Fort William Road & Central Avenue",
    "401 725 James St S, 725 James St S #401": "Arthur Street & Edward Street",
    "308 300 Vista Ln": "Balmoral Street & Harbour Expressway",
    "334 Francis St E": "Memorial Avenue & John Street",
    "503 Longbow Cres": "Red River Road & Junot Avenue",
    "520 Tuscany Ct.": "Fort William Road & Central Avenue",
    "3968 Hwy 11/17": "Arthur Street & Edward Street",
    "513 145 Fanshaw St": "Balmoral Street & Harbour Expressway",
    "144 High St": "Memorial Avenue & John Street",
    "360 Arundel St": "Red River Road & Junot Avenue",
    "486 St. Patrick's Sq": "Fort William Road & Central Avenue",
    "114 Richmond St": "Arthur Street & Edward Street",
    "417 Parkway Dr": "Balmoral Street & Harbour Expressway",
    "2129 Arthur St E": "Memorial Avenue & John Street",
    "305 Wolseley St": "Red River Road & Junot Avenue",
    "165 Market St": "Fort William Road & Central Avenue",
    "338 Catherine St": "Arthur Street & Edward Street",
    "435 Muskrat Dr": "Balmoral Street & Harbour Expressway",
    "497 Muskrat Dr": "Memorial Avenue & John Street",
    "450 Vickers St N": "Red River Road & Junot Avenue",
    "315 145 Fanshaw St": "Fort William Road & Central Avenue"
}

# Map the addresses to the new 'intersection' column.
# Addresses are matched exactly against the keys of intersection_mapping.
df['intersection'] = df['address'].map(intersection_mapping)

# Series.map leaves NaN for any address without a dictionary entry; report
# those rows instead of letting them pass silently into later steps.
unmapped_addresses = df.loc[df['intersection'].isna(), 'address'].unique()
if len(unmapped_addresses) > 0:
    print("Addresses without an intersection mapping:", list(unmapped_addresses))

In [146]:
# Inspect the schema and non-null counts, including the 'intersection' column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62 entries, 0 to 215
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         62 non-null     int64  
 1   address       62 non-null     object 
 2   bedrooms      62 non-null     float64
 3   bathrooms     62 non-null     float64
 4   sqft          62 non-null     float64
 5   intersection  62 non-null     object 
dtypes: float64(3), int64(1), object(2)
memory usage: 3.4+ KB


In [148]:
# Write the cleaned frame to disk; index=False leaves the row index
# out of the file.
df.to_csv('cleaned_new_data.csv', index=False)