In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [7]:
# Read the Zameen property listings from their CSV export into a DataFrame
csv_path = 'zameen-updated.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows to sanity-check the parse
df.head()

Unnamed: 0,property_id,location_id,page_url,property_type,price,location,city,province_name,latitude,longitude,baths,area,purpose,bedrooms,date_added,agency,agent,Area Type,Area Size,Area Category
0,237062,3325,https://www.zameen.com/Property/g_10_g_10_2_gr...,Flat,10000000,G-10,Islamabad,Islamabad Capital,33.67989,73.01264,2,4 Marla,For Sale,2,02-04-2019,,,Marla,4.0,0-5 Marla
1,346905,3236,https://www.zameen.com/Property/e_11_2_service...,Flat,6900000,E-11,Islamabad,Islamabad Capital,33.700993,72.971492,3,5.6 Marla,For Sale,3,05-04-2019,,,Marla,5.6,5-10 Marla
2,386513,764,https://www.zameen.com/Property/islamabad_g_15...,House,16500000,G-15,Islamabad,Islamabad Capital,33.631486,72.926559,6,8 Marla,For Sale,5,07-17-2019,,,Marla,8.0,5-10 Marla
3,656161,340,https://www.zameen.com/Property/islamabad_bani...,House,43500000,Bani Gala,Islamabad,Islamabad Capital,33.707573,73.151199,4,2 Kanal,For Sale,4,04-05-2019,,,Kanal,2.0,1-5 Kanal
4,841645,3226,https://www.zameen.com/Property/dha_valley_dha...,House,7000000,DHA Defence,Islamabad,Islamabad Capital,33.492591,73.301339,3,8 Marla,For Sale,3,07-10-2019,Easy Property,Muhammad Junaid Ceo Muhammad Shahid Director,Marla,8.0,5-10 Marla


In [8]:
# Column dtypes and non-null counts (info() prints straight to stdout)
df.info()

# Summary statistics of the numeric columns, shown as the cell's output
numeric_summary = df.describe()
numeric_summary


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168446 entries, 0 to 168445
Data columns (total 20 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   property_id    168446 non-null  int64  
 1   location_id    168446 non-null  int64  
 2   page_url       168446 non-null  object 
 3   property_type  168446 non-null  object 
 4   price          168446 non-null  int64  
 5   location       168446 non-null  object 
 6   city           168446 non-null  object 
 7   province_name  168446 non-null  object 
 8   latitude       168446 non-null  float64
 9   longitude      168446 non-null  float64
 10  baths          168446 non-null  int64  
 11  area           168446 non-null  object 
 12  purpose        168446 non-null  object 
 13  bedrooms       168446 non-null  int64  
 14  date_added     168446 non-null  object 
 15  agency         124375 non-null  object 
 16  agent          124374 non-null  object 
 17  Area Type      168446 non-nul

Unnamed: 0,property_id,location_id,price,latitude,longitude,baths,bedrooms,Area Size
count,168446.0,168446.0,168446.0,168446.0,168446.0,168446.0,168446.0,168446.0
mean,15596260.0,4375.936395,17765760.0,29.859519,71.239804,2.874227,3.179422,5.892188
std,2251207.0,3776.561581,35310030.0,3.80787,3.133042,2.4634,1.971401,5.778327
min,86575.0,1.0,0.0,11.052446,25.906027,0.0,0.0,0.0
25%,14883200.0,1058.0,175000.0,24.948536,67.130363,0.0,2.0,3.0
50%,16658510.0,3286.0,8500000.0,31.459784,73.056182,3.0,3.0,5.0
75%,17086620.0,7220.0,19500000.0,33.560887,73.25987,4.0,4.0,8.0
max,17357720.0,14220.0,2000000000.0,73.184088,80.16143,403.0,68.0,800.0


In [9]:
# 1. Data Cleaning

## Handling Missing Values
# Count the NaN entries of every column and print the tally under a heading
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")


Missing values in each column:
property_id          0
location_id          0
page_url             0
property_type        0
price                0
location             0
city                 0
province_name        0
latitude             0
longitude            0
baths                0
area                 0
purpose              0
bedrooms             0
date_added           0
agency           44071
agent            44072
Area Type            0
Area Size            0
Area Category        0
dtype: int64


In [None]:
# Handle missing values
# Median for numeric columns, mode for categorical columns. The names must
# match the CSV header exactly ('Area Type', not 'area_type'), otherwise the
# lookup raises KeyError.
numeric_cols = ['price', 'baths', 'bedrooms', 'Area Size']
categorical_cols = ['location', 'city', 'province_name', 'Area Type', 'Area Category']

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is deprecated and does not
# modify df when pandas copy-on-write is enabled.
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

for col in categorical_cols:
    modes = df[col].mode()
    if not modes.empty:  # mode() is empty when a column holds only NaN
        df[col] = df[col].fillna(modes.iloc[0])

# 'agency' and 'agent' are the only columns that actually contain NaNs.
# Use an explicit placeholder: filling with the mode would attribute every
# unattributed listing to the single most frequent agency/agent.
df[['agency', 'agent']] = df[['agency', 'agent']].fillna('Unknown')

## Numeric area
# The raw 'area' column is text such as '4 Marla' or '2 Kanal', so it cannot
# be compared with 0, averaged or plotted as a number. The same information
# is available as 'Area Size' (number) plus 'Area Type' (unit), so rebuild
# 'area' as one number expressed in Marla.
# NOTE(review): assumes 1 Kanal = 20 Marla -- confirm for this dataset.
MARLA_PER_UNIT = {'Marla': 1.0, 'Kanal': 20.0}
df['area'] = df['Area Size'] * df['Area Type'].map(MARLA_PER_UNIT)

## Handling Inconsistencies
# Keep only listings with a positive price and a positive area. A unit that
# is missing from MARLA_PER_UNIT yields NaN, and NaN > 0 is False, so those
# rows are removed here as well. .copy() makes the result an independent
# frame so that later column assignments do not act on a slice.
rows_before = len(df)
df = df[(df['price'] > 0) & (df['area'] > 0)].copy()
print(f"Rows removed by consistency checks: {rows_before - len(df)}")

In [None]:
## Handling Outliers
# Z-score method: drop every row whose 'price' or 'area' lies more than
# Z_THRESHOLD standard deviations away from the column mean.
# 'area' has to be numeric at this point; stats.zscore cannot process text.
Z_THRESHOLD = 3

# Keep the scores in standalone arrays instead of temporary columns: df is
# the result of a boolean filter here, so adding columns to it can trigger
# SettingWithCopyWarning and would need an in-place drop afterwards.
price_z = np.abs(np.asarray(stats.zscore(df['price'])))
area_z = np.abs(np.asarray(stats.zscore(df['area'])))

# Both arrays follow df's row order, so the combined mask can index df
# directly. .copy() hands an independent frame to the cells that follow.
df = df[(price_z <= Z_THRESHOLD) & (area_z <= Z_THRESHOLD)].copy()


In [None]:
# 2. Exploration and Visualization

## Explore Numeric Variables
plt.figure(figsize=(10, 6))
sns.histplot(df['price'], bins=50, kde=True)
plt.title('Distribution of House Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

plt.figure(figsize=(10, 6))
sns.histplot(df['area'], bins=50, kde=True)
plt.title('Distribution of House Areas')
plt.xlabel('Area')
plt.ylabel('Frequency')
plt.show()

In [None]:
## Explore Categorical Variables
# One bar chart of listing counts per categorical column, bars sorted from
# the most to the least frequent category.
# Each entry is (column, figure title, x-axis label).
categorical_plots = [
    ('property_type', 'Count of Properties by Type', 'Property Type'),
    ('province_name', 'Count of Properties by Province', 'Province Name'),
]

for column, title, xlabel in categorical_plots:
    fig, ax = plt.subplots(figsize=(12, 8))
    by_frequency = df[column].value_counts().index
    sns.countplot(data=df, x=column, order=by_frequency, ax=ax)
    ax.set(title=title, xlabel=xlabel, ylabel='Count')
    plt.xticks(rotation=45)  # tilt the category labels
    plt.show()

In [None]:
## Explore Relationships
# Scatter of price against area, one colour per property type
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='area', y='price', hue='property_type', ax=ax)
ax.set(title='Price vs. Area', xlabel='Area', ylabel='Price')
ax.legend(title='Property Type')
plt.show()

# Persist the cleaned frame as CSV, leaving out the row index.
# NOTE(review): the following cell writes the same frame again under a
# different file name -- confirm that both files are needed.
df.to_csv('cleaned_house_prices.csv', index=False)

In [None]:
# Write the cleaned frame to a CSV file, leaving out the row index
output_csv = 'zameen_updated1.csv'
df.to_csv(output_csv, index=False)