In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read both CSV files fresh from disk so that edits made in earlier runs do not carry over
df, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ("kijiji_cleaned_listings.csv", "null_values_dataset.csv")
)



In [9]:
# Report how many rows in each frame are flagged as duplicates by DataFrame.duplicated()
for label, frame in (("Duplicates in df:", df), ("Duplicates in df2:", df2)):
    print(label, frame.duplicated().sum())

Duplicates in df: 72
Duplicates in df2: 256


In [11]:
# Remove duplicate rows from both frames (default drop_duplicates settings);
# the surviving rows keep their original index labels.
df, df2 = df.drop_duplicates(), df2.drop_duplicates()

In [13]:
# Re-count duplicated rows to confirm the de-duplication step took effect
dup_count_df = df.duplicated().sum()
dup_count_df2 = df2.duplicated().sum()
print("Duplicates in df:", dup_count_df)
print("Duplicates in df2:", dup_count_df2)

Duplicates in df: 0
Duplicates in df2: 0


In [15]:
# Print the column names, non-null counts and dtypes of each frame
for frame in (df, df2):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, 0 to 8
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Title                 9 non-null      object 
 1   Price                 9 non-null      float64
 2   Description           9 non-null      object 
 3   Distance              9 non-null      int64  
 4   Nearest_Intersection  9 non-null      object 
 5   Bedrooms              9 non-null      int64  
 6   Bathrooms             9 non-null      int64  
 7   Unit_type             9 non-null      object 
 8   Parking               9 non-null      int64  
 9   Size(sqft)            9 non-null      float64
dtypes: float64(2), int64(4), object(4)
memory usage: 792.0+ bytes
<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, 0 to 31
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Title               

In [17]:
# Strip every non-digit character from the raw Price strings, then convert to numbers.
# The dtype guard makes the cell safe to re-run: once the column is numeric the `.str`
# accessor would raise AttributeError, so the string clean-up is skipped in that case.
# NOTE(review): the pattern also removes the decimal point. In the df2 preview a listing
# described as "asking 795 a month" shows Price 79500.0, which suggests values end up
# 100x too large. df appears to use the same scale, so fix both sources together - confirm.
if not pd.api.types.is_numeric_dtype(df2["Price"]):
    df2["Price"] = df2["Price"].str.replace(r"[^\d]", "", regex=True)  # Remove non-numeric characters
df2["Price"] = pd.to_numeric(df2["Price"], errors="coerce")  # Convert to numbers; unparseable -> NaN
print(df2["Price"].head(10))

0    120000000.0
1     27790000.0
2        85000.0
3        75000.0
4        36700.0
5     74000000.0
6        47500.0
7        79500.0
8        62500.0
9     34500000.0
Name: Price, dtype: float64


In [19]:
# Strip every non-digit character from the raw Distance strings, then convert to numbers.
# The dtype guard makes the cell safe to re-run: once the column is numeric the `.str`
# accessor would raise AttributeError, so the string clean-up is skipped in that case.
# NOTE(review): the pattern would also drop a decimal point if the raw values contain one
# (e.g. "4.5" would become 45) - confirm the raw distances are whole numbers.
if not pd.api.types.is_numeric_dtype(df2["Distance"]):
    df2["Distance"] = df2["Distance"].str.replace(r"[^\d]", "", regex=True)  # Remove non-numeric characters
df2["Distance"] = pd.to_numeric(df2["Distance"], errors="coerce")  # Convert to numbers; unparseable -> NaN
print(df2["Distance"].head(10))

0     5
1    13
2     5
3     4
4     4
5    33
6     3
7     3
8     6
9     4
Name: Distance, dtype: int64


In [21]:
# Strip every non-digit character from the raw Size(sqft) strings, then convert to numbers.
# The dtype guard makes the cell safe to re-run: once the column is numeric the `.str`
# accessor would raise AttributeError, so the string clean-up is skipped in that case.
# NOTE(review): the pattern would also drop a decimal point if the raw values contain one
# - confirm the raw sizes are whole numbers.
if not pd.api.types.is_numeric_dtype(df2["Size(sqft)"]):
    df2["Size(sqft)"] = df2["Size(sqft)"].str.replace(r"[^\d]", "", regex=True)  # Remove non-numeric characters
df2["Size(sqft)"] = pd.to_numeric(df2["Size(sqft)"], errors="coerce")  # Convert to numbers; unparseable -> NaN
print(df2["Size(sqft)"].head(10))

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
7   NaN
8   NaN
9   NaN
Name: Size(sqft), dtype: float64


In [23]:
# Show the schema of both frames again now that the df2 columns have been converted
for frame in (df, df2):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, 0 to 8
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Title                 9 non-null      object 
 1   Price                 9 non-null      float64
 2   Description           9 non-null      object 
 3   Distance              9 non-null      int64  
 4   Nearest_Intersection  9 non-null      object 
 5   Bedrooms              9 non-null      int64  
 6   Bathrooms             9 non-null      int64  
 7   Unit_type             9 non-null      object 
 8   Parking               9 non-null      int64  
 9   Size(sqft)            9 non-null      float64
dtypes: float64(2), int64(4), object(4)
memory usage: 792.0+ bytes
<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, 0 to 31
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Title               

In [29]:
# Encode categorical features in both df and df2
# One LabelEncoder is fitted per column on df only; df2 is encoded with the same codes.
# NOTE: this cell overwrites df[col] with integer codes, so re-running it without
# reloading the data would refit each encoder on the codes instead of the labels.
categorical_cols = ["Nearest_Intersection", "Unit_type"]
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # Encode training data
    # Build the label -> code lookup once instead of calling le.transform for every row.
    # Series.map leaves values that are not keys of the dict (labels the encoder has not
    # seen, and missing values) as NaN.
    label_to_code = dict(zip(le.classes_, le.transform(le.classes_)))
    df2[col] = df2[col].map(label_to_code)  # Encode test data
    encoders[col] = le  # Store encoder for inverse transform

df.head(10)

Unnamed: 0,Title,Price,Description,Distance,Nearest_Intersection,Bedrooms,Bathrooms,Unit_type,Parking,Size(sqft)
0,Single bedroom for rent,60000.0,Single bedroom available for rent at Robertson...,4,7,4,2,2,0,1500.0
1,Two bed one bath basement apartment,170000.0,Two bed 1 bath basement apartment available fo...,4,5,2,1,1,1,1500.0
2,Two bed 1 bath house,210000.0,Two bed 1 bath house available for rent in Nor...,4,1,2,1,2,1,1500.0
3,Two bedroom apartment with utilities included.,195000.0,This two bedroom apartment has just finished b...,4,1,2,1,0,1,1000.0
4,Updated Two Bedroom Current River Townhouse,160000.0,BEAUTIFUL 2 BEDROOM TOWNHOUSE CURRENT RIVER! A...,4,0,2,1,3,1,800.0
5,2 bed 1 bath Apartment at 525 Ford St N - Apt 1,159500.0,"Available Feb 1st this stunning, newly renovat...",3,6,2,1,0,1,750.0
6,Two bedroom apartment for rent,130000.0,Two bedroom apartment available now/ Apartment...,8,4,2,1,0,1,1000.0
7,"New Construction: 2 Bedroom, 1 Bathroom. Occup...",200000.0,Welcome home! New Construction. 459 Hodder Ave...,8,3,2,1,0,1,1200.0
8,"New Construction: UPSTAIRS 2 Bedroom, 1 Bathro...",220000.0,"Welcome home! 1316 Mary Street West, Unit 3. I...",7,2,2,1,0,1,1100.0


In [31]:
# Display up to the first 20 rows of df2 to inspect the cleaned and encoded columns
df2.head(20)

Unnamed: 0,Title,Price,Description,Distance,Nearest_Intersection,Bedrooms,Bathrooms,Unit_type,Parking,Size(sqft)
0,7 Unit Apartment Building for Sale!!,120000000.0,"Built in 1985, it contains 7 units, 1x1 bedroo...",5,,,,,,
1,"0 HWY 130 Thunder Bay, Ontario",27790000.0,Oliver Paipoonge Vacant Land! 7.6 Acres locate...,13,,,,,,
2,Room for Rent furnished!!,85000.0,"I have a room for rent furnished with bed, des...",5,,,,,,
3,Room for rent,75000.0,Shared 2 bedroom duplex,4,,,,,,
4,Roommate needed to share a room for female,36700.0,Shared room available for female in 3 Bedroom ...,4,,,,,,
5,"350 Mokomon rd, Kakabeka Falls, ON",74000000.0,Amazing luxurious country living. Newly built ...,33,,,,,,
6,Room for Rent - All Inclusive - Fully Furnished,47500.0,Fully furnished private room in fully furnishe...,3,,,,,,
7,furnished private room 3 minutes to university,79500.0,furnished room for rent asking 795 a month hea...,3,,,,,,
8,room for rent,62500.0,Bedroom for rent in new home westfort parking ...,6,,,,,,
9,"48 JEWELL ST THUNDER BAY, Ontario",34500000.0,Charming Corefloor Home in Sought-After North ...,4,,,,,,


In [47]:


# Function to train model and predict missing values for df2 based on df
def train_and_predict(df, df2, target_col, model_type="regression"):
    """Fill missing ``target_col`` values in ``df2`` using a random forest fitted on ``df``.

    The features are every numeric column of ``df`` other than ``target_col``; the same
    column names are read from ``df2`` for the rows being imputed. The model is fitted on
    all rows of ``df`` that have a known target. No rows are held out, because this
    function does not evaluate the model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame used for training. Not modified.
    df2 : pandas.DataFrame
        Frame whose missing ``target_col`` entries are overwritten IN PLACE.
    target_col : str
        Name of the column to impute; must exist in both frames.
    model_type : str, default "regression"
        ``"regression"`` selects ``RandomForestRegressor``; any other value selects
        ``RandomForestClassifier``.

    Returns
    -------
    None
        ``df2`` is modified in place. Nothing happens when ``target_col`` has no
        missing values in ``df2``.

    Raises
    ------
    ValueError
        If ``df`` has no numeric feature columns, or no row with a known target.
    KeyError
        If ``df2`` lacks one of the feature columns.
    """
    # Compute the mask once and reuse it for both selecting and assigning rows
    missing_mask = df2[target_col].isnull()
    if not missing_mask.any():
        return  # No missing values to predict

    # Define features (excluding the target column) in df and df2
    features = [
        col for col in df.columns
        if col != target_col and pd.api.types.is_numeric_dtype(df[col])
    ]
    if not features:
        raise ValueError(f"No numeric feature columns available to predict '{target_col}'")

    # Only rows with a known target can be used to fit the model
    train_rows = df[df[target_col].notnull()]
    if train_rows.empty:
        raise ValueError(f"No rows with a known '{target_col}' value to train on")

    # Choose model based on target type
    if model_type == "regression":
        model = RandomForestRegressor(n_estimators=100, random_state=42)
    else:  # Classification for categorical features
        model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the model on every usable row
    model.fit(train_rows[features], train_rows[target_col])

    # Predict missing values in df2
    # NOTE(review): these rows can still hold NaN in the feature columns; whether
    # predict() accepts that depends on the installed scikit-learn version - confirm.
    predictions = model.predict(df2.loc[missing_mask, features])

    # Fill missing values in df2
    df2.loc[missing_mask, target_col] = predictions

# Impute each df2 column in turn. The order is significant: every call fills df2 in
# place, and later calls read the previously filled columns as features.
imputation_plan = {
    "Price": "regression",
    "Distance": "regression",
    "Nearest_Intersection": "classification",
    "Bedrooms": "regression",
    "Bathrooms": "regression",
    "Unit_type": "classification",
    "Parking": "regression",
    "Size(sqft)": "regression",
}
for column_name, kind in imputation_plan.items():
    train_and_predict(df, df2, column_name, model_type=kind)

# Continuous columns: keep two decimal places
for column_name in ("Price", "Distance"):
    df2[column_name] = df2[column_name].round(2)

# Count-like columns: round to the nearest whole number and store as nullable integers
for column_name in ("Bedrooms", "Bathrooms", "Parking"):
    df2[column_name] = df2[column_name].round().astype("Int64")

# Size: round to two decimals, snap to the nearest multiple of 50, store as nullable integer
size_two_decimals = df2["Size(sqft)"].round(2)
df2["Size(sqft)"] = ((size_two_decimals / 50).round() * 50).astype("Int64")

# Stack the reference rows (df) on top of the imputed rows (df2) with a fresh 0..n-1 index
# NOTE(review): Nearest_Intersection and Unit_type are written as the integer codes
# assigned earlier; `encoders` is not used here to restore the labels - confirm intended.
combined_df = pd.concat([df, df2], ignore_index=True)

# Write the combined frame to disk without the index column
combined_df.to_csv("combined_cleaned_imputed_dataset.csv", index=False)

print("Missing values imputed. Cleaned dataset saved as 'combined_cleaned_imputed_dataset.csv'.")


Missing values imputed. Cleaned dataset saved as 'combined_cleaned_imputed_dataset.csv'.
