In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# The google.colab module is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Step 1: Load the dataset from a CSV file
# The path lives under the /content/drive mount point, so Google Drive must be
# mounted before this cell runs.
file_path = "/content/drive/My Drive/Machine Learning Stanford College/Python/Quiz 7 Python/House_Rent_Dataset.csv"
df = pd.read_csv(file_path)

In [None]:
# Step 2: Explore the dataset (optional)
# Emit the heading and the first rows of the frame with a single print call;
# sep="\n" puts the frame on the line after the heading.
print("Dataset Preview:", df.head(), sep="\n")

Dataset Preview:
   Posted On  BHK  Size            Floor    Area Type  \
0  5/18/2022    2  1100  Ground out of 2   Super Area   
1  5/13/2022    2   800       1 out of 3   Super Area   
2  5/16/2022    2  1000       1 out of 3   Super Area   
3   7/4/2022    2   800       1 out of 2   Super Area   
4   5/9/2022    2   850       1 out of 2  Carpet Area   

              Area Locality     City Furnishing Status  Tenant Preferred  \
0                    Bandel  Kolkata       Unfurnished  Bachelors/Family   
1  Phool Bagan, Kankurgachi  Kolkata    Semi-Furnished  Bachelors/Family   
2   Salt Lake City Sector 2  Kolkata    Semi-Furnished  Bachelors/Family   
3               Dumdum Park  Kolkata       Unfurnished  Bachelors/Family   
4             South Dum Dum  Kolkata       Unfurnished         Bachelors   

   Bathroom Point of Contact   Rent  
0         2    Contact Owner  10000  
1         1    Contact Owner  20000  
2         1    Contact Owner  17000  
3         1    Contact Owner  1

In [None]:
# Drop the columns that are not needed by the model.
# `df` is rebound to the result (no inplace=True) and errors="ignore" is passed
# so the cell is idempotent: re-running it after the columns are already gone
# no longer raises a KeyError.
unused_columns = ['Floor', 'Posted On', 'Point of Contact', 'Area Locality']
df = df.drop(columns=unused_columns, errors="ignore")


In [None]:
# Step 3: Clean and preprocess the data
# 3a. Handle missing values
# Replace missing values in numerical columns with their column mean.
# (numpy is imported together with the other libraries in the first cell.)
# NOTE(review): the numeric selection still includes the target column 'Rent',
# and the means are computed on the full dataset before the train/test split —
# confirm this is acceptable for the analysis.
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

In [None]:
# Replace missing values in categorical columns with their mode.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on an intermediate
# object, triggers a pandas FutureWarning, and stops updating `df` in pandas 3.0.
categorical_columns = ['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred']
for column in categorical_columns:
    column_mode = df[column].mode()
    # mode() is empty when the column holds no non-missing values; there is
    # nothing to fill with in that case, so the column is left untouched.
    if not column_mode.empty:
        df[column] = df[column].fillna(column_mode.iloc[0])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Area Type'].fillna(df['Area Type'].mode()[0], inplace=True) # Replace missing categorical values with the mode
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['City'].fillna(df['City'].mode()[0], inplace=True) # Replace missing categorical values with the mode
The behavior

In [None]:
# Preview the first rows again to confirm which columns remain in the frame
print(df.head())



   BHK  Size    Area Type     City Furnishing Status  Tenant Preferred  \
0    2  1100   Super Area  Kolkata       Unfurnished  Bachelors/Family   
1    2   800   Super Area  Kolkata    Semi-Furnished  Bachelors/Family   
2    2  1000   Super Area  Kolkata    Semi-Furnished  Bachelors/Family   
3    2   800   Super Area  Kolkata       Unfurnished  Bachelors/Family   
4    2   850  Carpet Area  Kolkata       Unfurnished         Bachelors   

   Bathroom   Rent  
0         2  10000  
1         1  20000  
2         1  17000  
3         1  10000  
4         1   7500  


In [None]:
# One-hot encode the nominal categorical columns (categories with no
# meaningful order). drop_first=True omits the first level of each column so
# the remaining dummy columns are not redundant with each other.
nominal_columns = ['Area Type', 'Furnishing Status', 'City', 'Tenant Preferred']
df = pd.get_dummies(data=df, columns=nominal_columns, drop_first=True)

In [None]:
# Print the frame to confirm that the one-hot encoding has taken place
print(df)

      BHK  Size  Bathroom   Rent  Area Type_Carpet Area  Area Type_Super Area  \
0       2  1100         2  10000                  False                  True   
1       2   800         1  20000                  False                  True   
2       2  1000         1  17000                  False                  True   
3       2   800         1  10000                  False                  True   
4       2   850         1   7500                   True                 False   
...   ...   ...       ...    ...                    ...                   ...   
4741    2  1000         2  15000                   True                 False   
4742    3  2000         3  29000                  False                  True   
4743    3  1750         3  35000                   True                 False   
4744    3  1500         2  45000                   True                 False   
4745    2  1000         2  15000                   True                 False   

      Furnishing Status_Sem

In [None]:
# Step 4: Define features (X) and target (y)
target_column = 'Rent'

# Dependent variable: the value the model is trying to predict
y = df[target_column]

# Independent variables: every remaining column except the target
X = df.drop(columns=[target_column])

In [None]:
# Step 5: Split the data into training (70%) and testing sets (30%)
# random_state is fixed so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Step 6: Train the Linear Regression model
# The model is fitted on the training split only; the test split is used
# solely for prediction and evaluation.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Step 7: Make predictions on the test data
# y_pred holds one predicted rent per row of X_test, in the same row order.
y_pred = model.predict(X_test)

In [None]:
# Step 8: Evaluate the model
# Compute MSE, MAE and R-squared, in that order, on the held-out test data.
# A higher R2 indicates a better fit; the acceptable threshold (for example
# above 0.80 or 0.90) depends on what the business requires.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [None]:
# Step 9: Display the predictions and evaluation metrics
# Collect actual rent, predicted rent and their difference side by side so
# individual test rows can be inspected.
actual_prices = y_test.values
test_results = pd.DataFrame(
    {
        "Actual Price": actual_prices,
        "Predicted Price": y_pred,
        "Error (Actual - Predicted)": actual_prices - y_pred,
    }
)

In [None]:
# Per-row predictions on the test split
print("=== Predictions on Test Data ===", test_results, sep="\n")

# Aggregate error metrics computed on the test split
print(
    "\n=== Model Evaluation Metrics ===",
    f"Mean Squared Error (MSE): {mse}",
    f"Mean Absolute Error (MAE): {mae}",
    f"R-squared (R2): {r2}",
    sep="\n",
)

# Fitted parameters: the intercept and one coefficient per feature column
print(
    "\n=== Model Coefficients ===",
    f"Intercept: {model.intercept_}",
    f"Coefficients: {dict(zip(X.columns, model.coef_))}",
    sep="\n",
)

=== Predictions on Test Data ===
      Actual Price  Predicted Price  Error (Actual - Predicted)
0            16000     30895.954381               -14895.954381
1            12000     18477.005330                -6477.005330
2            28000     67717.105587               -39717.105587
3             8000     75665.994279               -67665.994279
4            46000     89673.080743               -43673.080743
...            ...              ...                         ...
1419         39000     58258.420205               -19258.420205
1420        140000    114674.747320                25325.252680
1421         18500     34968.454003               -16468.454003
1422        120000    167503.890754               -47503.890754
1423         17500     31854.473707               -14354.473707

[1424 rows x 3 columns]

=== Model Evaluation Metrics ===
Mean Squared Error (MSE): 1626825685.3444104
Mean Absolute Error (MAE): 22353.54308385923
R-squared (R2): 0.537748065591956

=== Model Coeff