In [3]:
# Step 1: Attach Google Drive to the Colab runtime so the dataset file is reachable
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# Step 2: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Read the housing spreadsheet from the mounted Drive (replace with your actual path)
file_path = '/content/drive/My Drive/CSE422 LAB/Assignment 4 (ML)/Dataset/Housing Price.xlsx'
df = pd.read_excel(file_path)

# Step 3: Preview the first rows of the raw data
print("Initial Dataset:", df.head(), sep="\n")

# Step 4: Discard every row that contains at least one missing value
df_cleaned = df.dropna(axis=0, how='any')
print("\nDataset after removing null values:", df_cleaned.head(), sep="\n")

# Step 5: Keep only the first occurrence of each fully identical row
df_cleaned = df_cleaned.drop_duplicates(keep='first')
print("\nDataset after removing duplicate rows:", df_cleaned.head(), sep="\n")

# Step 6: Handle categorical variables (Binary encoding and One-Hot Encoding)
binary_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
yes_no_to_int = {'yes': 1, 'no': 0}

# Encode each yes/no column with an explicit mapping and an explicit integer
# dtype. DataFrame.replace() on object columns depends on silent downcasting,
# which newer pandas versions deprecate; without it the columns would stay
# object-typed instead of numeric.
for column in binary_columns:
    encoded = df_cleaned[column].map(yes_no_to_int)
    unmapped = encoded.isna()
    if unmapped.any():
        # Nulls were removed in an earlier step, so a NaN here means a label
        # other than 'yes'/'no'. Fail now with a clear message instead of
        # letting a stray string break the numeric steps further down.
        unexpected = df_cleaned.loc[unmapped, column].unique().tolist()
        raise ValueError(
            f"Column '{column}' contains values other than 'yes'/'no': {unexpected}"
        )
    df_cleaned[column] = encoded.astype(int)

# One-Hot Encoding for 'furnishingstatus'
# drop_first=True removes one category to avoid a redundant (perfectly
# collinear) dummy column; dtype=int yields 0/1 columns consistent with the
# binary encoding above instead of the boolean dummies pandas 2.x produces.
df_cleaned = pd.get_dummies(df_cleaned, columns=['furnishingstatus'], drop_first=True, dtype=int)

print("\nDataset after handling categorical variables:")
print(df_cleaned.head())

# Step 7: Standardise the continuous columns (zero mean, unit variance)
columns_to_scale = ['price', 'area', 'parking']

scaler = StandardScaler()
scaler.fit(df_cleaned[columns_to_scale])
df_cleaned[columns_to_scale] = scaler.transform(df_cleaned[columns_to_scale])

print("\nDataset after feature scaling:", df_cleaned[columns_to_scale].head(), sep="\n")

# Step 8: Remove variables with high correlation (threshold > 0.8)
# Only feature-to-feature correlations are checked. 'price' is assumed to be
# the prediction target (confirm against the modelling step), so it is left
# out: otherwise a feature could be discarded merely for correlating strongly
# with the value we want to predict.
target_column = 'price'
correlation_threshold = 0.8

feature_frame = df_cleaned.drop(columns=[target_column], errors='ignore')
corr_matrix = feature_frame.corr().abs()

# Keep the strict upper triangle so every pair is examined exactly once and
# the diagonal (each column's correlation with itself) is ignored.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Identify features with high correlation: a column is flagged when it exceeds
# the threshold against any column that precedes it, so the earlier column of
# each correlated pair is the one that is kept.
to_drop = [column for column in upper.columns if (upper[column] > correlation_threshold).any()]

# Drop highly correlated features
df_cleaned = df_cleaned.drop(columns=to_drop)

print("\nDataset after removing highly correlated features:")
print(df_cleaned.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Initial Dataset:
      price    area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420.0         4          2        3      yes        no       no   
1  12250000  8960.0         4          4        4      yes        no       no   
2  12250000  9960.0         3          2        2      yes        no      yes   
3  12215000     NaN         4          2        2      yes        no      yes   
4  11410000     NaN         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4           