<a href="https://colab.research.google.com/github/Akshayus29/Assignments/blob/main/preprocessing_case_study_house_pricing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder

In [None]:
# Location of the raw CSV that the rest of the notebook works from.
csv_path = '/content/House_Pricing.csv'

# Read it into a DataFrame and show it as the cell output.
house_pricing = pd.read_csv(csv_path)
house_pricing

In [None]:
# Print the frame's structural summary (columns, non-null counts, dtypes).
house_pricing.info()

In [None]:
# Descriptive statistics using describe()'s default column selection.
house_pricing.describe()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
house_pricing.duplicated().sum()

### **Handling missing values**

In [None]:
# Number of missing (NaN) entries in each column.
house_pricing.isna().sum()


In [None]:
# Columns whose missing entries are replaced by that column's median.
# NOTE(review): 'Sale Price' is later used as the prediction target — confirm that
# imputing it (rather than dropping those rows) is intended.
cols_t_fix = [
    'Sale Price',
    'No of Bathrooms',
    'Flat Area (in Sqft)',
    'Lot Area (in Sqft)',
    'Area of the House from Basement (in Sqft)',
]

# One median per listed column; fillna matches them to columns by label.
column_medians = house_pricing[cols_t_fix].median()
house_pricing[cols_t_fix] = house_pricing[cols_t_fix].fillna(column_medians)



In [None]:
# Re-count missing values per column after the median fill.
house_pricing.isna().sum()


In [None]:
# Columns removed from the working frame before the outlier analysis.
cols_to_drop = [
    'No of Times Visited',
    'Zipcode',
    'Latitude',
    'Longitude',
    'Living Area after Renovation (in Sqft)'
]

# Drop them in one call. errors='ignore' skips labels that are already gone, so
# the cell can be re-run safely, and rebinding avoids mutating the frame in place.
house_pricing = house_pricing.drop(columns=cols_to_drop, errors='ignore')



In [None]:
# Missing-value count per remaining column after the column drop.
house_pricing.isna().sum()


In [None]:
# Histogram grid drawn by DataFrame.hist on a 20x15 inch figure.
house_pricing.hist(figsize = (20, 15))
plt.show()

### **Finding Outliers**

In [None]:
# Audit a copy so the working frame itself is never touched by this cell.
house_pricing1 = house_pricing.copy()

# Only int64/float64 columns take part in the IQR check.
num_cols = house_pricing1.select_dtypes(include=['int64', 'float64']).columns

# column name -> number of values lying outside the 1.5 * IQR fences
outlier_info = {}

for name in num_cols:
    values = house_pricing1[name]
    q_low, q_high = values.quantile([0.25, 0.75])
    whisker = 1.5 * (q_high - q_low)

    # True where a value falls below the lower fence or above the upper fence.
    flagged = values.lt(q_low - whisker) | values.gt(q_high + whisker)
    outlier_info[name] = flagged.sum()

outlier_info


In [None]:
# Hand-picked columns for the box plots; the outlier-removal cell reuses this list.
numerical_cols = [
    'Sale Price',
    'No of Bedrooms',
    'No of Bathrooms',
    'Flat Area (in Sqft)',
    'Lot Area (in Sqft)',
    'No of Floors',
    'Area of the House from Basement (in Sqft)',
    'Basement Area (in Sqft)',
    'Age of House (in Years)',
    'Renovated Year',
    'Lot Area after Renovation (in Sqft)',
]

# One box plot per column, laid out on a 4 x 3 grid (subplot slots are 1-based).
plt.figure(figsize=(20, 15))
for position, col in enumerate(numerical_cols, start=1):
    ax = plt.subplot(4, 3, position)
    sns.boxplot(y=house_pricing[col], ax=ax)
    ax.set_title(col)
plt.tight_layout()
plt.show()

### **Outlier Removal using IQR**

In [None]:
# IQR fences for every column in `numerical_cols`, all computed from the same
# pre-filter snapshot so the result does not depend on the order of the list.
# (Filtering column by column would derive later fences from rows that had
# already been thinned out by earlier columns.)
candidates = house_pricing[numerical_cols]
Q1 = candidates.quantile(0.25)
Q3 = candidates.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep a row only when every listed column lies inside its fence (bounds inclusive).
# Comparisons against NaN evaluate to False, so a row with a missing value in any
# of these columns is dropped as well.
# NOTE(review): when a column's IQR is 0 the fence collapses to a single value and
# every row that differs from it is removed — confirm this is acceptable for
# columns dominated by one value (possibly 'Renovated Year').
inside_fences = (candidates.ge(lower_bound) & candidates.le(upper_bound)).all(axis=1)
house_pricing = house_pricing.loc[inside_fences]

print(f"Shape of DataFrame after outlier removal: {house_pricing.shape}")

In [None]:
# Snapshot of the filtered frame, bound to `house_pricing1`.
house_pricing1 = house_pricing.copy()

# int64/float64 columns are the ones audited.
num_cols = house_pricing1.select_dtypes(include=['int64', 'float64']).columns

# column name -> number of values outside the 1.5 * IQR fences
outlier_info = {}

for feature in num_cols:
    series = house_pricing1[feature]
    q25 = series.quantile(0.25)
    q75 = series.quantile(0.75)
    fence = 1.5 * (q75 - q25)

    too_low = series < q25 - fence
    too_high = series > q75 + fence
    outlier_info[feature] = (too_low | too_high).sum()

# Columns that still have outliers print as "name : count"; the rest print the bare name.
for col, count in outlier_info.items():
    print(f"{col} : {count}" if count > 0 else f"{col}")


In [None]:
# Box plots of the `numerical_cols` columns taken from `house_pricing1`,
# arranged on a 4 x 3 grid (subplot slots are 1-based).
grid_rows, grid_cols = 4, 3
plt.figure(figsize=(20, 15))
for slot, col in enumerate(numerical_cols, start=1):
    plt.subplot(grid_rows, grid_cols, slot)
    sns.boxplot(y=house_pricing1[col])
    plt.title(col)
plt.tight_layout()
plt.show()


### **Scaling**

In [None]:
# Every int64/float64 column is a scaling candidate. This uses its own name so the
# hand-picked `numerical_cols` list (used by the box-plot and outlier-removal cells)
# is not overwritten; re-running those cells would otherwise use a different column set.
numeric_feature_cols = house_pricing1.select_dtypes(include=['int64', 'float64']).columns

# Leave the target ('Sale Price') and the 'ID' column unscaled.
columns_to_scale = [col for col in numeric_feature_cols if col not in ['Sale Price', 'ID']]

print("Columns to be scaled:")
print(columns_to_scale)

## Apply Standard Scaling



In [None]:
# Standardise the selected columns of `house_pricing1` in place.
# NOTE(review): the scaler is fitted on the full frame before the train/test split,
# so test rows influence the fitted mean/variance — confirm this is acceptable or
# fit on the training split only.
scaler = StandardScaler()
house_pricing1[columns_to_scale] = scaler.fit_transform(house_pricing1[columns_to_scale])

# Show the frame that was actually scaled (`house_pricing` is the unscaled one).
house_pricing1.head()

### **Encoding**

In [None]:
# Column dtypes of the scaled frame, shown before encoding.
house_pricing1.dtypes

In [None]:
# Columns replaced by indicator (dummy) columns; drop_first omits the first level of each.
# NOTE(review): 'Date House was Sold' yields an indicator per distinct value (minus
# the first) — confirm this is wanted rather than derived date features.
categorical_cols = ['Date House was Sold', 'Waterfront View', 'Condition of the House']

house_pricing1 = pd.get_dummies(
    house_pricing1,
    columns=categorical_cols,
    drop_first=True,
)
house_pricing1.head()

In [None]:
# Column dtypes again, now including the columns produced by get_dummies.
house_pricing1.dtypes

### **Train Test split**

In [None]:
# Target is the sale price; every other column of `house_pricing1` becomes a feature.
# NOTE(review): an identifier column such as 'ID', if present, stays in X — confirm.
target_col = 'Sale Price'
y = house_pricing1[target_col]
X = house_pricing1.drop(columns=[target_col])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each piece.
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")