# 1. Importing Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import ast
from scipy.stats import iqr,yeojohnson, skew, kurtosis
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objs as go

import missingno as msno

import regex as re
import eda_helper_functions

In [None]:
# Lift pandas' display truncation: show every row and the full text of each
# cell instead of eliding long outputs.
for display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# Load the cleaned listings file; the path is relative to the notebook's
# working directory.
df = pd.read_csv('mg_cleaned_dataset.csv')


In [None]:
# Preview the first five rows to sanity-check the load.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

### observation 
- dataset consists of 11902 rows and 48 columns

In [None]:
# Names of all columns in the dataset.
df.columns

In [None]:
# Summary statistics of the numeric columns. `.T` transposes the result so
# that each feature becomes a row (long format), which is easier to scan.
df.describe().T

In [None]:
# Skewness of every numeric column (non-numeric columns are skipped).
df.skew(numeric_only=True)

In [None]:
# Columns with strong skew in either direction. |skew| > 1 is used as the
# cut-off for "highly skewed". The skewness is computed once and reused; the
# original cell only listed the left-skewed columns even though the notes
# below report both tails.
numeric_skew = df.skew(numeric_only=True)

{
    'highly_right_skewed': numeric_skew[numeric_skew > 1].index.tolist(),
    'highly_left_skewed': numeric_skew[numeric_skew < -1].index.tolist(),
}


- Highly right-skewed (positively skewed): 'available_units', 'towers', 'parking', 'price', 'area', 'costpersqft', 'lattitude', 'emi', 'project_in_acres', 'assigned_flooring_score'
- Highly left-skewed (negatively skewed): 'locality_rating', 'longitude'

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# Number of columns stored with the `object` dtype (second element of the
# shape of the object-only sub-frame).
df.select_dtypes(include='object').shape[1]

### observation
- Our dataset features consists of three datatypes
  - float
  - integer
  - object
- Of which total numerical features are 21
- And categorical features are 27.
- The data type of some features is incorrect: `amenities_cluster` and `flooring_cluster` should be of type `object`.

In [None]:
# Store the cluster labels as strings (object dtype) instead of numbers.
# NOTE(review): astype(str) also turns missing values into the literal string
# 'nan' — confirm that these two columns contain no NaN.
for cluster_col in ('flooring_cluster', 'amenities_cluster'):
    df[cluster_col] = df[cluster_col].astype(str)

# observation 
- all properties are 'flats'
- no missing values 

# project_name      

In [None]:
# Listings per project, most frequent first (missing names are not counted).
df['project_name'].value_counts()

In [None]:
# Length of the value_counts result = number of distinct project names.
df['project_name'].value_counts().shape

In [None]:
# Cumulative share of listings, with projects ordered from most to least
# frequent, to see how few projects account for most of the data.
df['project_name'].value_counts(normalize = True).cumsum()

In [None]:
# First 317 rows of the cumulative share.
# NOTE(review): 317 looks like the number of projects needed to reach ~50% of
# the listings (see the observation below); it is hard-coded, so re-check it
# whenever the data changes.
df['project_name'].value_counts(normalize = True).cumsum().head(317)

In [None]:
# Listings per project; each band below counts how many projects fall in it.
society_counts = df['project_name'].value_counts()

# (label, inclusive lower bound, inclusive upper bound) of every band.
# NOTE(review): a project with more than 100 listings would fall in no band.
band_limits = [
    ("High (50-100)", 50, 100),
    ("Average (10-49)", 10, 49),
    ("Low (2-9)", 2, 9),
    ("Very Low (1)", 1, 1),
]
frequency_bins = {
    label: int(society_counts.between(low, high).sum())
    for label, low, high in band_limits
}
frequency_bins

In [None]:
# Bar chart of the 11 most frequent projects (head(11), not 10).
# NOTE(review): 11 presumably matches the number of "High (50-100)" projects
# in the observation below — confirm that this is intended.
df['project_name'].value_counts().head(11).plot(kind='bar')

In [None]:
# Number of listings without a project name, as a plain Python int.
int(df['project_name'].isnull().sum())

#### Observation
- High cardinality feature 
- total 2943 unique project_name   
- The top 317 project_names account for 50 percent of the properties; the remaining 50 percent are spread across the other 2626 project_names.
- When doing the train-test split, make sure both sets keep the same proportion of high-volume and low-volume projects to prevent bias during training and testing, i.e. use stratified sampling.
  - High (50-100): 11 societies have between 50 to 100 listings.
  - Average (10-49): 142 societies fall in this range with 10 to 49 listings each.
  - Low (2-9): 1011 societies have between 2 to 9 listings.
  - Very Low (1): A significant number, 1779 societies, have only 1 listing.
- Total 3503 missing values 

# price

In [None]:
# Number of missing prices.
df['price'].isnull().sum()

In [None]:
# Count, mean, standard deviation, min/max and quartiles of price.
df['price'].describe()

In [None]:
# Histogram of price (50 bins) with a KDE curve overlaid.
sns.histplot(df['price'],kde=True,bins=50)

In [None]:
# Box plot of price; grid lines make the quartiles and whiskers easier to read.
price_box_ax = sns.boxplot(df['price'], color='lightgreen')
price_box_ax.grid()

### Observation on price

- Descriptive Statistics:

  - No missing values.
  - Mean Price: The average price is approximately 2.90 crores.
  - Median Price: The median (or 50th percentile) price is 1.78 crores.
  - The standard deviation is 4.24 Cr, indicating significant variation in prices.
  - Range: Prices range from a minimum of 0.01 crores to a maximum of 80 crores.
  - The maximum price is 80 Cr while the 75th percentile is only 3.20 Cr, which clearly indicates that there are outliers in the data.
  - IQR: The middle 50% of prices lies between 0.95 Cr (25th percentile) and 3.20 Cr (75th percentile), i.e. an interquartile range of 2.25 Cr.

- Visualizations:
   - Distribution: The histogram indicates that most properties are priced in the lower range (below 5 crores), with a few properties going beyond 10 crores.
   - The histogram is right-skewed, indicating most prices are clustered near the lower end (closer to 0).
   - Box Plot: The box plot showcases the spread of the data and potential outliers. Properties priced above approximately 6.6 crores (the upper fence, Q3 + 1.5 × IQR) might be considered outliers as they lie beyond the upper whisker of the box plot.
   - majority of prices fall within the interquartile range (approximately 0.95–3.2 Cr), with a dense cluster near the median.

#### conclusion and solution:
  - The data has a highly skewed distribution, with a few very high-priced properties influencing the mean and standard deviation.
  - The median (1.78 Cr) is much lower than the mean (2.90 Cr), which further highlights the impact of these high-price outliers.
  - Outlier detection or transformation (e.g., log transformation) may be necessary for further analysis to normalize the data or reduce skewness.

In [None]:
# Skewness and kurtosis of the raw price.
# pandas' .kurt() returns *excess* kurtosis (a normal distribution scores 0).
# The results get their own names so that the `kurtosis` function imported
# from scipy.stats in the import cell is not overwritten by a float.
price_skew = df['price'].skew()
price_kurt = df['price'].kurt()

print(price_skew, price_kurt)

### Observations on Skewness and Kurtosis:

#### 1. **Skewness:**
   - The skewness value is **6.20**, which is highly positive.
   - A positive skewness indicates that the distribution is **right-skewed**:
     - Most values are concentrated on the lower end of the scale (closer to 0).
     - A few very high values (outliers) stretch the tail on the right side, as seen in the histogram and boxplot.

#### 2. **Kurtosis:**
   - The kurtosis value is **60.93**, which is extremely high.
   - High kurtosis indicates a **leptokurtic distribution**(leptokurtic = sample kurtosis -3 > 0 and in leptokurtic we have Heavy tails, sharp peak, more outliers than normal):
     - The distribution has heavy tails and a sharp peak around the mean.
     - This means there are many extreme values (outliers) compared to a normal distribution.

#### conclusion and solution  
The high skewness and kurtosis suggest that the `price` data is far from normally distributed.  
Analytical methods sensitive to non-normality may produce biased results.  
- **Potential Actions:**
- Consider **log transformation** or other scaling techniques to reduce skewness.
- Investigate the outliers for potential anomalies or domain-specific insights.
- Use non-parametric statistical methods (e.g., Mann-Whitney U, Spearman correlation) for hypothesis testing or correlations if normality cannot be assumed.

In [None]:
# Prices at the extreme lower (1%, 5%) and upper (95%, 99%) quantiles.
tail_probabilities = [0.01, 0.05, 0.95, 0.99]
quantiles = df['price'].quantile(q=tail_probabilities)

quantiles

#### Quantile Analysis:

 - 1% Quantile: Only 1% of properties are priced below 0.20 crores.
 - 5% Quantile: 5% of properties are priced below 0.36 crores.


In [None]:
# Quartiles of price, computed in one call, and the interquartile range.
# Q1, Q3 and IQR are notebook-level names reused by the following cells.
Q1, Q3 = np.percentile(df['price'], [25, 75])
IQR = Q3 - Q1

print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

In [None]:
# Tukey fences: values further than 1.5 × IQR outside the quartiles are
# treated as outliers.
fence_reach = 1.5 * IQR
lower_bound, upper_bound = Q1 - fence_reach, Q3 + fence_reach

print(lower_bound, upper_bound)

In [None]:
# Rows whose price lies outside the fences computed above.
below_fence = df['price'] < lower_bound
above_fence = df['price'] > upper_bound
outliers = df[below_fence | above_fence]
outliers.shape

In [None]:
# Summary statistics of the prices flagged as outliers.
outliers['price'].describe()

- Outliers Analysis (using IQR method):
  - Based on the IQR method, there are 989 properties considered as outliers.  
  - These outliers have an average price of approximately 13.04 crores.  
  - The range for these outliers is from 6.58 crores to 80 crores.  

In [None]:
# Price bands in crores. With right=False every interval is left-closed,
# [low, high), so a closing edge of 80 would exclude a price of exactly 80 Cr
# (the maximum reported above) and silently drop it as NaN. The last band is
# therefore left open-ended.
bins = [0, 1, 2, 3, 5, 10, 20, 50, np.inf]
bin_labels = ["0-1", "1-2", "2-3", "3-5", "5-10", "10-20", "20-50", "50+"]
price_bands = pd.cut(df['price'], bins=bins, labels=bin_labels, right=False)
price_bands.value_counts().sort_index().plot(kind='bar')

In [None]:
# Empirical CDF: cumulative number of listings at or below each distinct
# price, divided by the total number of rows.
price_frequency = df['price'].value_counts().sort_index()
ecdf = price_frequency.cumsum() / len(df['price'])
plt.plot(ecdf.index, ecdf, linestyle='none', marker='.')
plt.grid()

In [None]:
# Prices below which 85% and 90% of the listings fall.
upper_probabilities = [0.85, 0.90]
quantiles = df['price'].quantile(q=upper_probabilities)

quantiles

- 85% properties are below 4.577cr
- 90% properties are below 6cr 

In [None]:
#apply log transformation as it is right skewed data 

In [None]:
# Side-by-side histograms: raw price (left) versus log1p(price) (right).
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: original scale.
sns.histplot(df['price'], kde=True, bins=50, color='skyblue', ax=ax_raw)
ax_raw.set_title('Distribution of Prices (Original)')
ax_raw.set_xlabel('Price (in Crores)')
ax_raw.set_ylabel('Frequency')

# Right panel: after the log1p transformation.
sns.histplot(np.log1p(df['price']), kde=True, bins=50, color='lightgreen', ax=ax_log)
ax_log.set_title('Distribution of Prices (Log Transformed)')
ax_log.set_xlabel('Log(Price)')
ax_log.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

- we use log1p and it means we are doing this log(1+x) with each values 

### observation
- After applying the logarithmic transformation (log1p(price), i.e. log(1 + price)), the distribution becomes closer to normal
- The log transformation reduces the impact of extreme values (outliers) and compresses the wide range of prices into a more manageable scale.

In [None]:
# Skewness and excess kurtosis of log1p(price). The transform is computed
# once, and the results get their own names so that the `kurtosis` function
# imported from scipy.stats is not overwritten by a float.
log_price = np.log1p(df['price'])
log_price_skew = log_price.skew()
log_price_kurt = log_price.kurt()

print(log_price_skew, log_price_kurt)

### Observation
- after log transformation skewness and kurtosis also get reduced 
- distribution is positively skewed (Acceptable range for near-normal data: −0.5 to 0.5. Beyond this, the data may not be considered symmetric.)
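- pandas' `.kurt()` returns *excess* kurtosis (a normal distribution scores 0), so the value must be compared with 0 rather than 3: a positive value means heavier tails than normal (leptokurtic), a negative value means lighter tails (platykurtic).
- An excess kurtosis between −1 and 1 (i.e. a raw kurtosis between 2 and 4) is often considered acceptable for near-normal distributions.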

In [None]:
# Side-by-side box plots: raw price (left) versus log1p(price) (right).
# The values are passed explicitly as `y`, so the boxes are vertical on every
# seaborn version and the value axis carries the label. A box plot has no
# frequency axis, so the former 'Frequency' label is dropped.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 6))

sns.boxplot(y=df['price'], color='skyblue', ax=ax_raw)
ax_raw.set_title('Distribution of Prices (Original)')
ax_raw.set_ylabel('Price (in Crores)')

sns.boxplot(y=np.log1p(df['price']), color='lightgreen', ax=ax_log)
ax_log.set_title('Distribution of Prices (Log Transformed)')
ax_log.set_ylabel('log1p(Price)')

plt.tight_layout()
plt.show()

### Observation

- The original data has a long tail, indicating high skewness. After the log transformation, the distribution becomes more compact and less skewed.
- After the log transformation, the data gets normalized, making the distribution more visible and interpretable.
- The log transformation brings outliers closer to the main data, reducing their impact.

# address region

In [None]:
# Number of listings per address region.
counts = df['addressregion'].value_counts()

# One bar per region; hue only assigns a distinct colour to each bar, so the
# legend is switched off.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=counts.index, y=counts.values, hue=counts.index, palette="tab10", legend=False, ax=ax)

ax.set_xlabel("Address Region")
ax.set_ylabel("Count")
ax.set_title("Property Count per Region")
plt.xticks(rotation=30)
plt.show()

In [None]:
# Listings located in the Mumbai region.
df_mumbai = df.loc[df['addressregion'].eq('mumbai')]

In [None]:
# Summary statistics of Mumbai prices.
df_mumbai['price'].describe()

In [None]:
# Histogram of Mumbai prices (50 bins) with a KDE curve.
sns.histplot(df_mumbai['price'], kde=True, bins=50)

In [None]:
# Listings located in the Navi Mumbai region.
df_navimumbai = df.loc[df['addressregion'].eq('navi mumbai')]

In [None]:
# Summary statistics of Navi Mumbai prices.
df_navimumbai['price'].describe()

In [None]:
# Histogram of Navi Mumbai prices (50 bins) with a KDE curve.
sns.histplot(df_navimumbai['price'], kde=True, bins=50)

In [None]:
# Listings located in the Thane region.
df_thane = df.loc[df['addressregion'].eq('thane')]

In [None]:
# Summary statistics of Thane prices.
df_thane['price'].describe()

In [None]:
# Histogram of Thane prices (50 bins) with a KDE curve.
sns.histplot(df_thane['price'], kde=True, bins=50)

In [None]:
# Listings located in the Palghar region.
df_palghar = df.loc[df['addressregion'].eq('palghar')]

In [None]:
# Summary statistics of Palghar prices.
df_palghar['price'].describe()

In [None]:
# Histogram of Palghar prices (50 bins) with a KDE curve.
sns.histplot(df_palghar['price'], kde=True, bins=50)

In [None]:
# Pie chart of each region's share of the listings.
df['addressregion'].value_counts(normalize=True).plot(kind='pie',autopct='%0.2f%%')

In [None]:
# Box plot of Mumbai prices with grid lines for easier reading.
mumbai_box_ax = sns.boxplot(df_mumbai['price'], color='lightgreen')
mumbai_box_ax.grid()

In [None]:
# Same summary as above, repeated so it sits next to the box plot.
df_mumbai['price'].describe()

In [None]:
# Quartiles and interquartile range of Mumbai prices.
# This rebinds the notebook-level Q1, Q3 and IQR.
Q1, Q3 = np.percentile(df_mumbai['price'], [25, 75])
IQR = Q3 - Q1

print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

In [None]:
# Box plot of Navi Mumbai prices with grid lines for easier reading.
navimumbai_box_ax = sns.boxplot(df_navimumbai['price'], color='lightgreen')
navimumbai_box_ax.grid()

In [None]:
# Same summary as above, repeated so it sits next to the box plot.
df_navimumbai['price'].describe()

In [None]:
# Quartiles and interquartile range of Navi Mumbai prices.
# This rebinds the notebook-level Q1, Q3 and IQR.
Q1, Q3 = np.percentile(df_navimumbai['price'], [25, 75])
IQR = Q3 - Q1

print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

In [None]:
# Box plot of Thane prices with grid lines for easier reading.
thane_box_ax = sns.boxplot(df_thane['price'], color='lightgreen')
thane_box_ax.grid()

In [None]:
# Same summary as above, repeated so it sits next to the box plot.
df_thane['price'].describe()

In [None]:
# Quartiles and interquartile range of Thane prices.
# This rebinds the notebook-level Q1, Q3 and IQR.
Q1, Q3 = np.percentile(df_thane['price'], [25, 75])
IQR = Q3 - Q1

print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

In [None]:
# Box plot of Palghar prices with grid lines for easier reading.
palghar_box_ax = sns.boxplot(df_palghar['price'], color='lightgreen')
palghar_box_ax.grid()

In [None]:
# Same summary as above, repeated so it sits next to the box plot.
df_palghar['price'].describe()

In [None]:
# Quartiles and interquartile range of Palghar prices.
# This rebinds the notebook-level Q1, Q3 and IQR.
Q1, Q3 = np.percentile(df_palghar['price'], [25, 75])
IQR = Q3 - Q1

print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

### observation

- The mean and median differ from region to region, which means prices vary by region
    - Mean/median Price:
    - mumbai - 3.55/2.20
    - navi mumbai - 1.26/0.92
    - thane - 1.40/1.18
    - palghar - 0.27/0.23

- mean and median of the palghar properties is almost same indicates that data distribution is relatively symmetric and there are fewer outliers
- most properties fall in this particular region
    - mumbai - 0 to 5 cr
    - navi mumbai and thane - 0 to 2 cr
    - palghar - 0.1 to 0.4 cr

- mumbai : standard deviation greater than the mean indicates high variation, meaning property prices in Mumbai are significantly diverse.
- palghar,thane and navi mumbai : standard deviation less than the mean indicates low variation, meaning property prices are more uniform and predictable.
- Mumbai appears to have more significant outliers on the higher end of the price scale compared to other regions, leading to a higher mean.
- All regions exhibit right-skewed distributions, indicating that most properties fall within lower price ranges, with a few high-priced properties pulling the mean upwards.

##### from box plot
 - In all regions there are outliers beyond the whiskers of the boxplots.
 - Mumbai: IQR = 1.25, indicating a higher spread in prices compared to other regions.
 - Navi Mumbai: IQR = 0.64, suggesting moderate variability.
 - Thane: IQR = 1.05, showing a higher variability than Navi Mumbai but less than Mumbai.
 - Palghar: IQR = 0.14, the lowest among all, indicating a smaller variation in property prices.
 - The boxplots indicate right-skewed distributions (positive skewness) for all regions, with long tails on the higher side.

#### conclusion

- most of the data is from mumbai region and it can create bias
- mumbai property prices higher than other regions
- prices are according to the region
- expensiveness is like mumbai > thane > navi mumbai > palghar

### Solution:
1. **Address Bias**:
   - Use stratified sampling to ensure balanced representation across regions for analysis.
   - Normalize or standardize property prices to reduce the impact of Mumbai's dominance.

2. **Separate Regional Analysis**:
   - Analyze each region independently to derive region-specific insights.
   - Create separate predictive models or pricing strategies for each region.

3. **Outlier Handling**:
   - Remove or cap extreme outliers to minimize their impact on mean calculations.
   - Focus on the median for central tendency, as it is more robust to outliers.

# costpersqft

In [None]:
# Preview the first five cost-per-sqft values.
df['costpersqft'].head()

In [None]:
# Storage type of the column.
df['costpersqft'].dtype

In [None]:
# Number of missing values in the costpersqft column, as a plain Python int.
int(df['costpersqft'].isna().sum())

In [None]:
# Count, mean, standard deviation, min/max and quartiles of costpersqft.
df['costpersqft'].describe()

- The cost per sqft ranges from 147.00 to 5333333
- mean is 28263 and median is 25577

In [None]:
# Histogram of the raw cost per square foot (10 bins).
# matplotlib is already imported in the import cell at the top of the
# notebook, so it is not re-imported here; the unused pandas re-import and
# the leftover template comment were removed as well.
plt.figure(figsize=(8, 6))
df['costpersqft'].plot(kind='hist', bins=10, edgecolor='black')

# Show plain numbers instead of scientific notation on the x-axis.
plt.ticklabel_format(style='plain', axis='x')

plt.xlabel('price_per_sqft')
plt.ylabel('Count')
plt.title('Distribution of Cost per Square Foot')

plt.show()


In [None]:
# Skewness and excess kurtosis (normal distribution = 0) of costpersqft.
# The results get their own names so that the `kurtosis` function imported
# from scipy.stats is not overwritten by a float.
cost_skew = df['costpersqft'].skew()
cost_kurt = df['costpersqft'].kurt()

print(cost_skew, cost_kurt)

#### Observation

- there are few outliers which affect the whole distribution
- A positive skewness value (like 92.57) suggests a heavily right-skewed distribution, meaning there are extreme high values in the dataset pulling the tail to the right.
- A high kurtosis value (like 9547.57) indicates that the distribution has heavy tails and possibly significant outliers.

In [None]:
# Drop the rows whose cost per sqft equals one of these extreme values.
# NOTE: this rebinds `df`, so every later cell works on the reduced frame.
extreme_costs = [5333333, 208333, 344262, 270543]
extreme_rows = df.index[df['costpersqft'].isin(extreme_costs)]
df = df.drop(index=extreme_rows)


In [None]:
# Histogram (50 bins, KDE overlaid) of log1p(costpersqft).
log_cost_ax = sns.histplot(np.log1p(df['costpersqft']), kde=True, bins=50, color='lightgreen')
log_cost_ax.set_title('Distribution of costpersqft (Log Transformed)')
log_cost_ax.set_xlabel('costpersqft')
log_cost_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Skewness and excess kurtosis of log1p(costpersqft). The transform is
# computed once, and the results get their own names so that the `kurtosis`
# function imported from scipy.stats is not overwritten by a float.
log_cost = np.log1p(df['costpersqft'])
log_cost_skew = log_cost.skew()
log_cost_kurt = log_cost.kurt()

print(log_cost_skew, log_cost_kurt)

### observation

- A skewness of -0.744 suggests that the data is moderately negatively skewed after the log transformation.
- This means the tail on the left side of the distribution (smaller values) is slightly longer than the right side.
- Log transformation has reduced the skewness compared to the original data, which is a common transformation for positively skewed distributions.
- pandas' `.kurt()` returns *excess* kurtosis (a normal distribution scores 0), so a value of 1.366 means the transformed distribution is still somewhat heavier-tailed than normal (mildly leptokurtic), although far less so than the raw data.
- Some outliers therefore remain after the transformation, but their influence is much smaller than before.

In [None]:
# Side-by-side box plots: raw costpersqft (left) versus log1p (right).
# The values are passed explicitly as `y`, so the boxes are vertical on every
# seaborn version and the value axis carries the label. A box plot has no
# frequency axis, so the former 'Frequency' label is dropped.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 6))

sns.boxplot(y=df['costpersqft'], color='skyblue', ax=ax_raw)
ax_raw.set_title('Distribution of costpersqft (Original)')
ax_raw.set_ylabel('costpersqft')

sns.boxplot(y=np.log1p(df['costpersqft']), color='lightgreen', ax=ax_log)
ax_log.set_title('Distribution of costpersqft (Log Transformed)')
ax_log.set_ylabel('log1p(costpersqft)')

plt.tight_layout()
plt.show()

In [None]:
# np.percentile returns NaN when the input contains NaN, so missing values
# are removed first.
cleaned_data = df['costpersqft'].dropna()

# Both quartiles in one call, then the interquartile range.
# This rebinds the notebook-level Q1, Q3 and IQR.
Q1, Q3 = np.percentile(cleaned_data, [25, 75])
IQR = Q3 - Q1

print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")


In [None]:
# Cost per sqft below which 85% and 95% of the listings fall.
upper_probabilities = [0.85, 0.95]
quantiles = df['costpersqft'].quantile(q=upper_probabilities)

quantiles

### Observation
- The boxplot(left) shows significant outliers (data points far above the upper whisker)
- The log transformation(right) reduced the skewness significantly, making the distribution more symmetric.
- However, some outliers are still present, though their effect on the overall distribution has been minimized.

# bed

In [None]:
# Preview the first five values of `bed`.
df['bed'].head()

In [None]:
# Listings per bedroom count, most common first.
df['bed'].value_counts()

In [None]:
# Number of missing values in `bed`.
df['bed'].isna().sum()

In [None]:
# Bar chart of listings per bedroom count, ordered by bedroom count.
df['bed'].value_counts().sort_index().plot(kind='bar')

In [None]:
# Share of each bedroom count among the rows where `bed` is present.
bed_share = df['bed'].value_counts(normalize=True)
top_n = 5  # same cut-off as the default .head()

# Current matplotlib rescales pie wedges so that they total 100%, so drawing
# only the top categories would inflate every printed percentage. Folding the
# remaining categories into an "other" wedge keeps each label equal to the
# category's true share.
pie_share = bed_share.head(top_n)
if len(bed_share) > top_n:
    other_share = pd.Series({'other': bed_share.iloc[top_n:].sum()})
    pie_share = pd.concat([pie_share, other_share]).rename(bed_share.name)

pie_share.plot(kind='pie', autopct='%0.2f%%')

In [None]:
# Cumulative share of listings, from the most to the least common value.
df['bed'].value_counts(normalize=True).cumsum()

### observation

- no missing values
- 92% of the properties have 1, 2, or 3 bedrooms.

# bath

In [None]:
# Preview the first five values of `bath`.
df['bath'].head()

In [None]:
# Listings per bathroom count, most common first.
df['bath'].value_counts()

In [None]:
# Number of missing values in `bath`.
df['bath'].isna().sum()

In [None]:
# Bar chart of listings per bathroom count, ordered by bathroom count.
df['bath'].value_counts().sort_index().plot(kind='bar')

In [None]:
# Share of each bathroom count among the rows where `bath` is present.
bath_share = df['bath'].value_counts(normalize=True)
top_n = 5  # same cut-off as the default .head()

# Current matplotlib rescales pie wedges so that they total 100%, so drawing
# only the top categories would inflate every printed percentage. Folding the
# remaining categories into an "other" wedge keeps each label equal to the
# category's true share.
pie_share = bath_share.head(top_n)
if len(bath_share) > top_n:
    other_share = pd.Series({'other': bath_share.iloc[top_n:].sum()})
    pie_share = pd.concat([pie_share, other_share]).rename(bath_share.name)

pie_share.plot(kind='pie', autopct='%0.2f%%')

In [None]:
# Cumulative share of listings, from the most to the least common value.
df['bath'].value_counts(normalize=True).cumsum()

### observation

- 30 missing values
- 99% of the properties have 1,2,3,4 and 5 bathrooms.
- 55% properties have 2 bathrooms

# balcony

In [None]:
# Preview the first five values of `balcony`.
df['balcony'].head()

In [None]:
# Listings per balcony count, most common first (missing values excluded).
df['balcony'].value_counts()

In [None]:
# Number of missing values in `balcony`.
df['balcony'].isnull().sum()

In [None]:
# Bar chart of listings per balcony count, ordered by balcony count.
df['balcony'].value_counts().sort_index().plot(kind='bar')

In [None]:
# Share of each balcony count among the rows where `balcony` is present.
balcony_share = df['balcony'].value_counts(normalize=True)
top_n = 5  # same cut-off as the default .head()

# Current matplotlib rescales pie wedges so that they total 100%, so drawing
# only the top categories would inflate every printed percentage. Folding the
# remaining categories into an "other" wedge keeps each label equal to the
# category's true share.
pie_share = balcony_share.head(top_n)
if len(balcony_share) > top_n:
    other_share = pd.Series({'other': balcony_share.iloc[top_n:].sum()})
    pie_share = pd.concat([pie_share, other_share]).rename(balcony_share.name)

pie_share.plot(kind='pie', autopct='%0.2f%%')

In [None]:
# Cumulative share of listings, from the most to the least common value.
df['balcony'].value_counts(normalize=True).cumsum()

### observation
- 5826 missing values
- 99% of the properties have 1,2,3,4 and 5 balconies.
- 50% properties have 1 balcony 

# parking

In [None]:
# Preview the first five values of `parking`.
df['parking'].head()

In [None]:
# Listings per parking count, most common first (missing values excluded).
df['parking'].value_counts()

In [None]:
# Number of missing values in `parking`.
df['parking'].isna().sum()

In [None]:
# Bar chart of listings per parking count, ordered by parking count.
df['parking'].value_counts().sort_index().plot(kind='bar')

In [None]:
# Share of each parking count among the rows where `parking` is present.
parking_share = df['parking'].value_counts(normalize=True)
top_n = 5  # same cut-off as the default .head()

# Current matplotlib rescales pie wedges so that they total 100%, so drawing
# only the top categories would inflate every printed percentage. Folding the
# remaining categories into an "other" wedge keeps each label equal to the
# category's true share.
pie_share = parking_share.head(top_n)
if len(parking_share) > top_n:
    other_share = pd.Series({'other': parking_share.iloc[top_n:].sum()})
    pie_share = pd.concat([pie_share, other_share]).rename(parking_share.name)

pie_share.plot(kind='pie', autopct='%0.2f%%')

### observation
- 6237 missing values
- 70% of properties have only 1 parking followed by 23.89% properties have 2 parking
- feature engineering of the categories which has value_counts less than 7

# available_units

In [None]:
# Preview the first five values of `available_units`.
df['available_units'].head()

In [None]:
# Count, mean, standard deviation, min/max and quartiles of available_units.
df['available_units'].describe()

In [None]:
# Number of missing values in `available_units`.
df['available_units'].isna().sum()

In [None]:
# Histogram of available_units (50 bins) with a KDE curve overlaid.
sns.histplot(df['available_units'],kde=True,bins=50)

In [None]:
# Box plot of available_units with grid lines for easier reading.
units_box_ax = sns.boxplot(df['available_units'], color='lightgreen')
units_box_ax.grid()

In [None]:
# Skewness and excess kurtosis (normal distribution = 0) of available_units.
# The results get their own names so that the `kurtosis` function imported
# from scipy.stats is not overwritten by a float.
units_skew = df['available_units'].skew()
units_kurt = df['available_units'].kurt()

print(units_skew, units_kurt)

#### Observation


- there are few outliers which affect the whole distribution
- A positive skewness value (like 25.46) suggests a heavily right-skewed distribution, meaning there are extreme high values in the dataset pulling the tail to the right.
- A high kurtosis value (like 850.87) indicates that the distribution has heavy tails and possibly significant outliers.

In [None]:
# Histogram (50 bins, KDE overlaid) of log1p(available_units).
log_units_ax = sns.histplot(np.log1p(df['available_units']), kde=True, bins=50, color='lightgreen')
log_units_ax.set_title('Distribution of available_units (Log Transformed)')
log_units_ax.set_xlabel('available_units')
log_units_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Skewness and excess kurtosis of log1p(available_units). The transform is
# computed once, and the results get their own names so that the `kurtosis`
# function imported from scipy.stats is not overwritten by a float.
log_units = np.log1p(df['available_units'])
log_units_skew = log_units.skew()
log_units_kurt = log_units.kurt()

print(log_units_skew, log_units_kurt)

### observation

- 5226 missing values
- A skewness of -0.166 indicates that the data is nearly symmetric after the log transformation.
- A kurtosis of 0.030 is very close to 0, indicating that the distribution is neither heavily tailed nor strongly peaked.

# tower

In [None]:
# Preview the first five values of `towers`.
df['towers'].head()

In [None]:
# Listings per tower count, most common first (missing values excluded).
df['towers'].value_counts()

In [None]:
# Number of missing values in `towers`.
df['towers'].isna().sum()

In [None]:
# Histogram of towers (50 bins) with a KDE curve overlaid.
sns.histplot(df['towers'], kde=True, bins =50)

In [None]:
# Bar chart of listings per tower count, ordered by tower count.
df['towers'].value_counts().sort_index().plot(kind='bar')

### observation

- 5034 missing values
- most of the builders make less than 20 towers
- The highest frequency is observed for lower values of towers (e.g., 1, 2, or 3 towers).
- As the number of towers increases, the frequency decreases significantly.
- The distribution is right-skewed.

In [None]:
# Box plot of towers with grid lines for easier reading.
towers_box_ax = sns.boxplot(df['towers'], color='lightgreen')
towers_box_ax.grid()

In [None]:
# Skewness and excess kurtosis (normal distribution = 0) of towers.
# The results get their own names so that the `kurtosis` function imported
# from scipy.stats is not overwritten by a float.
towers_skew = df['towers'].skew()
towers_kurt = df['towers'].kurt()

print(f"skewness : {towers_skew}")
print(f"kurtosis : {towers_kurt}")

### Observation

- there are few outliers which affect the whole distribution
- A positive skewness value (like 8.47) suggests a heavily right-skewed distribution, meaning there are extreme high values in the dataset pulling the tail to the right.
- A high kurtosis value (like 141.86) indicates that the distribution has heavy tails and possibly significant outliers.

# locality_rank

In [None]:
# Preview the first five values of `locality_rank`.
df['locality_rank'].head()

In [None]:
# Listings per locality rank, most common first (missing values excluded).
df['locality_rank'].value_counts()

In [None]:
# Number of missing values in `locality_rank`.
df['locality_rank'].isna().sum()

In [None]:
# Count, mean, standard deviation, min/max and quartiles of locality_rank.
df['locality_rank'].describe()

### observation
- 3839 missing values
- Min rank is 1, max is 839.  
- Mean rank is 122.38, median is 92 (right-skewed).  
- 25% of localities have ranks ≤ 16; 50% ≤ 92.  
- Standard deviation is 126.50; high variability.  
- Data covers 1–839 ranks out of 3015 total ranks.  

In [None]:
# Histogram of locality_rank (50 bins) with a KDE curve overlaid.
sns.histplot(df['locality_rank'],kde=True,bins=50)

### observation

- The distribution is right-skewed, with most localities having lower ranks.
- A significant number of localities have ranks close to 1–50.
- Ranks above 200 are relatively sparse, with a gradual decline towards the maximum rank (839).
- The highest frequency is observed for ranks near 1.
- The presence of multiple peaks indicates possible clustering in locality ranks.

In [None]:
# Box plot of locality_rank with grid lines for easier reading.
rank_box_ax = sns.boxplot(df['locality_rank'], color='lightgreen')
rank_box_ax.grid()

### observations
- The IQR (green box) lies between ranks 16 and 186.
- The median rank is approximately 92.
- Ranks above 400 are outliers, with a few reaching the maximum rank (839).
- The distribution is right-skewed, as evident from the longer whisker on the upper side.
- Most data points are concentrated in the lower rank range (1–200).

In [None]:
# Skewness and excess kurtosis (normal distribution = 0) of locality_rank.
# The results get their own names so that the `kurtosis` function imported
# from scipy.stats is not overwritten by a float.
rank_skew = df['locality_rank'].skew()
rank_kurt = df['locality_rank'].kurt()

print(f"skewness : {rank_skew}")
print(f"kurtosis : {rank_kurt}")

- Skewness (1.398): The data is positively skewed, with a longer tail on the right.  
- Most locality ranks are concentrated at lower values.
- Kurtosis (1.997): pandas' `.kurt()` reports *excess* kurtosis (a normal distribution scores 0), so a value of about 2 indicates tails heavier than those of a normal distribution (leptokurtic).  
- This is consistent with the high-rank outliers seen in the box plot.  

In [None]:
# Histogram (50 bins, KDE overlaid) of log1p(locality_rank).
log_rank_ax = sns.histplot(np.log1p(df['locality_rank']), kde=True, bins=50, color='lightgreen')
log_rank_ax.set_title('Distribution of locality_rank (Log Transformed)')
log_rank_ax.set_xlabel('locality_rank')
log_rank_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

### observations 
- The log transformation reduces skewness, making the distribution more symmetric.
- Peaks are observed around log values 2–5, indicating clustering in these ranges.
- Outliers in the original data are compressed, improving interpretability.
- The spread of ranks is now more balanced, highlighting patterns in the middle range.
- The transformation emphasizes the lower ranks, previously overshadowed by the skewness.

In [None]:
# Side-by-side box plots: raw locality_rank (left) versus log1p (right).
# The values are passed explicitly as `y`, so the boxes are vertical on every
# seaborn version and the value axis carries the label. A box plot has no
# frequency axis, so the former 'Frequency' label is dropped.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 6))

sns.boxplot(y=df['locality_rank'], color='skyblue', ax=ax_raw)
ax_raw.set_title('Distribution of locality_rank (Original)')
ax_raw.set_ylabel('locality_rank')

sns.boxplot(y=np.log1p(df['locality_rank']), color='lightgreen', ax=ax_log)
ax_log.set_title('Distribution of locality_rank (Log Transformed)')
ax_log.set_ylabel('log1p(locality_rank)')

plt.tight_layout()
plt.show()

### observation

- The data shows a right-skewed distribution with many outliers above the upper whisker.
- Most values are concentrated in the lower range.
- The logarithmic transformation reduces skewness and compresses extreme values, resulting in a more symmetric distribution with fewer visible outliers.

In [None]:
# Skewness and excess kurtosis of log1p(locality_rank). The transform is
# computed once, and the results get their own names so that the `kurtosis`
# function imported from scipy.stats is not overwritten by a float.
log_rank = np.log1p(df['locality_rank'])
log_rank_skew = log_rank.skew()
log_rank_kurt = log_rank.kurt()

print(f"skewness : {log_rank_skew}")
print(f"kurtosis : {log_rank_kurt}")

### observation

- A skewness value close to 0 indicates that the data is nearly symmetric.
- The slight negative value suggests a minor left-skewed distribution.
- A negative kurtosis indicates that the distribution is flatter than a normal distribution (platykurtic), with lighter tails and fewer extreme outliers.

# locality_url_rating

In [None]:
# Preview the first five values of `locality_rating`.
df['locality_rating'].head()

In [None]:
# Number of missing values in `locality_rating`.
df['locality_rating'].isna().sum()

In [None]:
# Listings per rating value, most common first (missing values excluded).
df['locality_rating'].value_counts()

In [None]:
# Count, mean, standard deviation, min/max and quartiles of locality_rating.
df['locality_rating'].describe()

### observation
- 467 missing values
- Min locality_rating is 0, max is 5.  
- Mean locality_rating is 3.75, median is 4
- 25% of locality_rating have rating ≤ 3.90; 50% ≤ 4.0.  
- Standard deviation is 1.06; indicating moderate to high variability in ratings.
- Data covers 0–5 continuous rating out of 5 rating.  

In [None]:
# Bar chart of listings per rating value, ordered by rating.
df['locality_rating'].value_counts().sort_index().plot(kind='bar')

### observation

- The ratings are concentrated between 3.8 and 4.3, with a peak at 4.0-4.1.
- The distribution is slightly left-skewed, indicating more high ratings than low.
- Outliers at extremes (e.g., 0.0, 1.0, >4.8) are minimal and infrequent.

In [None]:
# Box plot of locality_rating with grid lines for easier reading.
rating_box_ax = sns.boxplot(df['locality_rating'], color='lightgreen')
rating_box_ax.grid()

In [None]:
# np.percentile returns NaN when the input contains NaN, so missing ratings
# are removed first.
cleaned_data_locality_rating = df['locality_rating'].dropna()

# Both quartiles in one call, then the interquartile range.
# This rebinds the notebook-level Q1, Q3 and IQR.
Q1, Q3 = np.percentile(cleaned_data_locality_rating, [25, 75])
IQR = Q3 - Q1

# Tukey fences: 1.5 × IQR beyond each quartile.
fence_reach = 1.5 * IQR
lower_whisker = Q1 - fence_reach
upper_whisker = Q3 + fence_reach

print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


In [None]:
# Ratings at the 30th and 95th percentiles.
probe_probabilities = [0.30, 0.95]
quantiles = df['locality_rating'].quantile(q=probe_probabilities)

quantiles

### observation 

- the IQR (green box) lies between 3.90 and 4.20 (the middle 50% of the data), consistent with the IQR of 0.3 and the outlier limits below
- the median rating is 4.00
- rating below 3.5 and above 4.65 are outliers
- iqr 0.3 indicate middle 50% of the ratings are tightly clustered, suggesting low variability in this range.
- The distribution is left-skewed, as slightly longer whisker from the lower side

In [None]:
# Skewness and kurtosis of locality_rating.
# pandas' .kurt() returns *excess* kurtosis (a normal distribution scores 0, not 3).
# The result is stored as `kurtosis_value` so it does not shadow the
# scipy.stats `kurtosis` function imported at the top of the notebook.
skewness = df['locality_rating'].skew()
kurtosis_value = df['locality_rating'].kurt()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis_value}")

### observation
- -3.03 negative skewness indicates strong left-skewed distribution caused by small number of low ratings pulling the tail to the left
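- kurtosis > 3: leptokurtic distribution (pandas `.kurt()` reports excess kurtosis, for which a normal distribution scores 0, so any clearly positive value already signals heavy tails). This reflects a sharp peak and heavy tails, meaning there are more extreme values (outliers) compared to a normal distribution.
- try to apply transformations like reflection transform, power transform, Box-Cox or Yeo-Johnson transform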

# construction

In [None]:
df['construction'].head()

In [None]:
df['construction'].isna().sum()

In [None]:
df['construction'].value_counts()

In [None]:
eda_helper_functions.cat_univar_plots(df, "construction")

In [None]:
df['construction'].notna().sum()

In [None]:
df[df['construction'] == 'less than 5 years'].shape

In [None]:
df[df['construction'] == 'under construction'].shape

In [None]:
df[df['construction'] == 'new construction'].shape

In [None]:
# Number of listings whose construction age falls in any bucket older than 5 years
older_than_5_years = ['5 to 10 years', '10 to 15 years', 'above 20 years', '15 to 20 years']
int(df['construction'].isin(older_than_5_years).sum())

### observation 
- 1063 missing values
- 3302 properties are more than 5 years old
- 1508 properties are less than 5 years old
- 4558 are under construction properties
- 1467 are new construction properties
- 42% properties are under construction and all other are completed properties
- make above 10 years as separate category 

# overlooking 

In [None]:
df['overlooking'].head()

In [None]:
df['overlooking'].isna().sum()

In [None]:
df['overlooking'].value_counts()

In [None]:
# Remove the placeholder 'not available' from the comma-separated 'overlooking' values.
overlooking_cleaned = (
    df['overlooking']
    .str.replace('not available', '', regex=False)
    # collapse the separators left behind, e.g. 'a, , b' -> 'a, b'
    .str.replace(r'\s*,(?:\s*,)*\s*', ', ', regex=True)
    # trim separators left at either end, e.g. ', pool' -> 'pool'
    .str.strip(', ')
)

# A value that consisted only of 'not available' is now an empty string; store it as
# missing so that '' does not appear as a category of its own in later counts and plots.
df['overlooking'] = overlooking_cleaned.replace('', np.nan)

In [None]:
df['overlooking'].value_counts().sort_index().plot(kind='bar')

In [None]:
df['overlooking'].value_counts(normalize=True).head(7).plot(kind='pie',autopct='%0.2f%%')

### observation
- 3969 missing values
- main road and garden/park are the most common "overlooking" categories
- make 5th category as other(pool) which can have this combination garden/park, pool or main road, pool

# ownership

In [None]:
df['ownership'].head()

In [None]:
df['ownership'].value_counts()

In [None]:
df['ownership'].isna().sum()

In [None]:
df['ownership'].value_counts().sort_index().plot(kind='bar')

In [None]:
df['ownership'].value_counts(normalize=True).plot(kind='pie',autopct='%0.2f%%')

- freehold - owner has complete ownership of both the building and the land it is built on.
- leasehold - buyer owns the building but not the land it is built on.the land is leased from a landlord (often referred to as the freeholder) for a specific period, typically ranging from 30 to 99 years or more.
- Co-operative society in property ownership refers to a housing arrangement where the property (e.g., an apartment building) is owned and managed by a co-operative society
- Power of Attorney (PoA) in property allows a person (the principal) to authorize another (the agent) to manage, sell, lease, or transfer property on their behalf, without granting ownership.

### observation

- 3611 missing values
- most of the properties are from freehold properties 71.45% followed by the co-op society 

# extra_rooms

In [None]:
df['extra_rooms'].head()

In [None]:
df['extra_rooms'].value_counts()

In [None]:
df['extra_rooms'].isna().sum()

In [None]:
df['extra_rooms'].value_counts().sort_index().plot(kind='bar')

In [None]:
df['extra_rooms'].value_counts(normalize=True).plot(kind='pie',autopct='%0.2f%%')

In [None]:
# Count how often each individual extra room occurs across all listings.
# The counts are derived directly from df['extra_rooms'] instead of from hand-copied
# value_counts() figures, so they stay correct if the data changes.
room_counts_df = (
    df['extra_rooms']
    .dropna()
    .str.split(',')        # 'puja, store' -> ['puja', ' store']
    .explode()             # one row per individual room
    .str.strip()           # drop the space left after each comma
    .value_counts()        # already sorted from most to least frequent
    .rename_axis('room')
    .reset_index(name='total_count')
)

# Displaying the final counts for each room
room_counts_df


### observation
- 5300 missing values
- 45.54% properties dont have extra rooms
- most properties have store room followed by puja,study,servant
- feature engineering for this column eg: puja,servant so consider this as 2 rooms 

# builder

In [None]:
df['builder'].head()

In [None]:
df['builder'].value_counts()

In [None]:
df['builder'].value_counts().shape

In [None]:
df['builder'].isna().sum()

In [None]:
# Cumulative share of listings by builder (most frequent first): shows how many builders
# account for a given fraction of the data.
df['builder'].value_counts(normalize = True).cumsum()

In [None]:
df['builder'].value_counts(normalize = True).cumsum().head(49)

In [None]:
# Number of listings per builder (value_counts ignores rows with a missing builder)
project_counts = df['builder'].value_counts()

# Frequency distribution for builders, bucketed by listing volume.
# The bins are exhaustive: builders with more than 100 listings get their own bucket
# instead of being silently left out, so the counts add up to the number of unique builders.
frequency_bins = {
    "Very High (>100)": int((project_counts > 100).sum()),
    "High (50-100)": int(((project_counts >= 50) & (project_counts <= 100)).sum()),
    "Average (10-49)": int(((project_counts >= 10) & (project_counts < 50)).sum()),
    "Low (2-9)": int(((project_counts > 1) & (project_counts < 10)).sum()),
    "Very Low (1)": int((project_counts == 1).sum())
}
frequency_bins

In [None]:
# Bar chart of the 11 most frequent builders (head(11)).
# NOTE(review): the earlier comment said "top 10" - confirm whether 10 or 11 is intended.
df['builder'].value_counts().head(11).plot(kind='bar')

### Observation
- High cardinality feature 
- total 1146 unique builders   
- The top 49 builders have 50 percent of the projects and the rest 50 percent of the properties come under the remaining 1097 builders
- while doing the train-test split, use stratified sampling so that train and test keep the same proportion of high-volume and low-volume builders, to prevent bias during training and testing:
  - High (50-100): 12 builders have done 50 to 100 projects 
  - Average (10-49): 124 builders have done 10 to 49 projects 
  - Low (2-9): 499 builders have done 2 to 9 projects 
  - Very Low (1): 501 builders have done only 1 project
  - these four bins cover 1136 of the 1146 builders; the remaining 10 builders have more than 100 each, which is above the upper limit of the "High" bin
- Total 4281 missing values 

# furnish 

In [None]:
df['furnish'].head()

In [None]:
df['furnish'].value_counts()

In [None]:
df['furnish'].isna().sum()

In [None]:
df['furnish'].value_counts().sort_index().plot(kind='bar')

In [None]:
df['furnish'].value_counts(normalize=True).plot(kind='pie',autopct='%0.2f%%')

### observation 
- 39 missing values
- unfurnished(63.10%) properties are more followed by semi-furnished(26.99%) and furnished(9.91%)

# area

In [None]:
df['area'].head()

In [None]:
df['area'].isna().sum()

In [None]:
df['area'].value_counts()

In [None]:
df['area'].describe()

### observation 
- 5 missing values
- min area is 100 and max is 40852
- mean area is 879.45 and median is 730
- 75% properties have area less than 1036 and max area is 40852
- Standard deviation is 697.36; high variability. 

In [None]:
sns.histplot(df['area'],kde=True,bins=50)

### observation 
- The distribution is right-skewed, with most properties having lower area.
- A significant number of properties have an area in the 1–1500 range.

In [None]:
sns.boxplot(df['area'],color='lightgreen')
plt.grid()

### observation 
- only 1 outlier make the distribution heavily right-skewed
- the IQR (green box) lies between area 540 and 1036
- the median area is 730
- area above 1780 are outliers, with few reaching the max area 40852
- The distribution is right-skewed, as evident from the longer whisker on the upper side.
- Most data points are concentrated in the lower area range (1–1500).

In [None]:
# Skewness and kurtosis of area.
# pandas' .kurt() returns *excess* kurtosis (a normal distribution scores 0, not 3).
# The result is stored as `kurtosis_value` so it does not shadow the
# scipy.stats `kurtosis` function imported at the top of the notebook.
skewness = df['area'].skew()
kurtosis_value = df['area'].kurt()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis_value}")

### observation 
- Skewness (18.55):  
  - Highly positively skewed.  
  - Indicates extreme values (outliers) on the higher end.  
- Kurtosis (927.63):  
  - Extremely high kurtosis.  
  - Suggests heavy tails with many extreme outliers.
  - leptokurtic distribution (high peak and heavy tails).
- Next Steps:  
  - Apply transformations (e.g., log or Box-Cox) to reduce skewness.  
  - Investigate and handle outliers to improve data quality.  

In [None]:
# Outlier limits for area, computed on the non-missing values only
cleaned_data_area = df['area'].dropna()

# Both quartiles in one call (NumPy's default linear interpolation)
Q1, Q3 = np.percentile(cleaned_data_area, [25, 75])

# Spread of the middle 50% of the data
IQR = Q3 - Q1

# 1.5 * IQR limits: points beyond them are what a boxplot draws as outliers
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR

# Report the quartiles, the IQR and both limits
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


In [None]:
# Quantile Analysis
quantiles = df['area'].quantile([0.75,0.90,0.95,0.98,0.99,1.00])

quantiles

### observation 
- large jump between the 99th and 100th percentiles suggests extreme outliers in the data.

In [None]:
# Histogram of log1p(area): the log compresses the long right tail of the raw values
sns.histplot(np.log1p(df['area']), kde=True, bins=50, color='lightgreen')
plt.title('Distribution of area (Log Transformed)')
plt.xlabel('log1p(area)')  # the axis shows the transformed value, not the raw area
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

### observation
- The distribution is roughly bell-shaped, centered around log-transformed values of 6.5–7.
- Most data points lie between 5.5 and 8, with a slight right-skew.
- The highest frequency bin has around 1400 observations.
- The KDE curve fits the histogram well, indicating a smooth distribution.
- A few outliers are present in the right tail (log-transformed values > 9).

In [None]:
# Side-by-side boxplots of area before and after the log1p transform.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 6))

# The vector is passed as x= so the box is horizontal in every seaborn version and
# the x-axis label describes the plotted values. A boxplot has no frequency axis,
# so no y-label is set.
sns.boxplot(x=df['area'], color='skyblue', ax=ax_raw)
ax_raw.set_title('Distribution of area (Original)')
ax_raw.set_xlabel('area')

# Same plot on the log1p scale
sns.boxplot(x=np.log1p(df['area']), color='lightgreen', ax=ax_log)
ax_log.set_title('Distribution of area (Log Transformed)')
ax_log.set_xlabel('Log(area)')

fig.tight_layout()
plt.show()

### observation

- Original Data:
  - The area data is highly skewed with extreme outliers visible at the upper end.
  - Most values are clustered near the lower range.
- Log-Transformed Data:
  - The log transformation reduces skewness, condensing the range of values.
  - Outliers are still present but are less extreme and more interpretable.
  - The interquartile range (IQR) is more centralized, making the distribution more symmetric.

In [None]:
# Skewness and kurtosis of log1p(area); the transform is computed once and reused.
# pandas' .kurt() returns *excess* kurtosis (a normal distribution scores 0, not 3).
# The result is stored as `kurtosis_value` so it does not shadow the
# scipy.stats `kurtosis` function imported at the top of the notebook.
log_area = np.log1p(df['area'])
skewness = log_area.skew()
kurtosis_value = log_area.kurt()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis_value}")

### observation
- after log transformation skewness and kurtosis also get reduced
- A skewness value close to 0 indicates that the data is nearly symmetric.
- Kurtosis drops below 3, showing fewer extreme outliers; note that pandas `.kurt()` reports excess kurtosis, so a normal distribution corresponds to 0 rather than 3.

# property_type

In [None]:
df['property_type'].head()

In [None]:
df['property_type'].isna().sum()

In [None]:
df['property_type'].value_counts()

In [None]:
df['property_type'].value_counts().sort_index().plot(kind='bar')

In [None]:
df['property_type'].value_counts(normalize=True).plot(kind='pie',autopct='%0.2f%%')

### observation 
- 3 missing values 
- 56.50% data is resale
- 43.43% data is new property
- and remaining 0.069% is other,nan and rent properties 

# status

In [None]:
df['status'].head()

In [None]:
df['status'].value_counts()

In [None]:
df['status'].isna().sum()

In [None]:
df['status'].value_counts().sort_index().plot(kind='bar')

In [None]:
df['status'].value_counts(normalize=True).plot(kind='pie',autopct='%0.2f%%')

### observation 
- 83 missing values
- 61.39% are ready to move properties
- 38.58% under construction properties
- the remaining 0.03% are ongoing, immediately and nan properties 

# lift 

In [None]:
df['lift'].head()

In [None]:
df['lift'].value_counts()

In [None]:
df['lift'].isna().sum()

In [None]:
df['lift'].value_counts().sort_index().plot(kind='bar')

In [None]:
df['lift'].value_counts(normalize=True).plot(kind='pie',autopct='%0.2f%%')

In [None]:
df['lift'].value_counts(normalize=True).cumsum()

### observation
- 5444 missing values 
- 36.97% properties have 2 lifts
- 84% properties have less than 4 lifts
- very few properties have more than 4 lifts
- the number of lifts ranges from 1 to 10

# flat_on_floor

In [None]:
df['flat_on_floor'].head()

In [None]:
df['flat_on_floor'].isna().sum()

In [None]:
df['flat_on_floor'].value_counts()

In [None]:
df['flat_on_floor'].describe()

### observation 
- 2292 missing values
- min is -2 and max is 75(below ground there is -1 and -2 and ground floor is 0 and so on till 75th floor we have properties)
- mean is 10.86 and median is 8
- 75% properties are on below 15th floor and max is 75
- std dev is 10.13; high variability

In [None]:
df['flat_on_floor'].value_counts(normalize=True).cumsum()

### observation 
- 90% of the properties are on floors 0 to 23 or on floor 25 

In [None]:
sns.histplot(df['flat_on_floor'],kde=True,bins=50)

### observation 
- Most flats are concentrated on lower floors (near 0–10).
- The distribution is right-skewed, with fewer flats on higher floors.
- There is a sharp peak around floor 1, indicating a large number of flats on the first floor.
- The data has outliers on higher floors.

In [None]:
sns.boxplot(df['flat_on_floor'],color='lightgreen')
plt.grid()

### Observations:
- most flats are located on floors within a range of approximately 0–15.
- There are significant outliers above the upper whisker, indicating some flats are on very high floors.
- The interquartile range (IQR) is between floor 4 and floor 15.
- The median floor level is around 8.
- presence of outliers in higher floors.

In [None]:
# Skewness and kurtosis of flat_on_floor.
# pandas' .kurt() returns *excess* kurtosis (a normal distribution scores 0, not 3).
# The result is stored as `kurtosis_value` so it does not shadow the
# scipy.stats `kurtosis` function imported at the top of the notebook.
skewness = df['flat_on_floor'].skew()
kurtosis_value = df['flat_on_floor'].kurt()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis_value}")

### Observations:  
- Skewness (1.86): The data is strongly right-skewed (skewness > 1), indicating a longer tail towards higher floor levels.  
- Kurtosis (4.73): pandas reports excess kurtosis (normal = 0), so the distribution is clearly leptokurtic — heavier tails than a normal distribution, with significant outliers on higher floors.  

In [None]:
# Outlier limits for flat_on_floor, computed on the non-missing values only
cleaned_data_flat_on_floor = df['flat_on_floor'].dropna()

# Both quartiles in one call (NumPy's default linear interpolation)
Q1, Q3 = np.percentile(cleaned_data_flat_on_floor, [25, 75])

# Spread of the middle 50% of the data
IQR = Q3 - Q1

# 1.5 * IQR limits: points beyond them are what a boxplot draws as outliers
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR

# Report the quartiles, the IQR and both limits
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


In [None]:
def signed_log1p(x):
    """Return log1p(|x|) carrying the sign of x, so negative inputs are supported."""
    magnitude = np.log1p(abs(x))
    return np.sign(x) * magnitude

In [None]:
# signed_log1p is built from NumPy ufuncs and abs(), so it accepts the whole column at once;
# this avoids a Python-level call per row and yields the same values as element-wise .apply().
transformed_flat_on_floor = signed_log1p(df['flat_on_floor'])

A signed log transformation is a variation of the regular logarithmic transformation that allows us to handle both positive and negative values. It is typically used when we want to preserve the sign (positive or negative) of the values while applying a log-like transformation.

In [None]:
# Histogram of the signed-log-transformed flat_on_floor values
sns.histplot(transformed_flat_on_floor,kde=True,bins=50,color='lightgreen')
plt.title('Distribution of flat_on_floor (Signed Log Transformed)')
plt.xlabel('Signed Log(flat_on_floor)')  # the axis shows the transformed value, not the raw floor
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

### observation 
- Distribution is more symmetric, with reduced skewness.  
- Negative values handled effectively.  
- Outliers impact reduced.  
- Compressed range for better interpretation.  

In [None]:
# Side-by-side boxplots of flat_on_floor before and after the signed log transform.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 6))

# The vector is passed as x= so the box is horizontal in every seaborn version and
# the x-axis label describes the plotted values. A boxplot has no frequency axis,
# so no y-label is set.
sns.boxplot(x=df['flat_on_floor'], color='skyblue', ax=ax_raw)
ax_raw.set_title('Distribution of flat_on_floor (Original)')
ax_raw.set_xlabel('flat_on_floor')

# Same plot on the signed-log scale
sns.boxplot(x=transformed_flat_on_floor, color='lightgreen', ax=ax_log)
ax_log.set_title('Distribution of flat_on_floor (Signed Log Transformed)')
ax_log.set_xlabel('Signed Log(flat_on_floor)')

fig.tight_layout()
plt.show()


### observation 

- Left Plot (Original):  
  - Highly skewed distribution.  
  - Many upper outliers.  
  - Small interquartile range (IQR).  
- Right Plot (Signed Log Transformed):  
  - Reduced skewness, more symmetric.  
  - Fewer outliers, compressed range.  
  - Suitable for analyses requiring normality.  

In [None]:
# Skewness and kurtosis of the signed-log-transformed flat_on_floor values.
# pandas' .kurt() returns *excess* kurtosis (a normal distribution scores 0, not 3).
# The result is stored as `kurtosis_value` so it does not shadow the
# scipy.stats `kurtosis` function imported at the top of the notebook.
skewness = transformed_flat_on_floor.skew()
kurtosis_value = transformed_flat_on_floor.kurt()
print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis_value}")


### observation:  
- Skewness: -0.556 (slightly negatively skewed, close to symmetric).  
- Kurtosis: 0.597 (close to normal kurtosis, indicating a moderate tail thickness).  
These values suggest the transformed data is relatively well-behaved for statistical analysis.

# total_floor

In [None]:
df['total_floor'].head()

In [None]:
df['total_floor'].value_counts()

In [None]:
df['total_floor'].isna().sum()

In [None]:
df['total_floor'].describe()

### observation 
- column for how many floor building has 
- 2059 missing values
- min is 1 and max is 92
- mean is 22.18 and median is 19
- 75% of the properties are in buildings with 30 floors or fewer; the max is 92
- std dev is 16.21; high variability

In [None]:
df['total_floor'].value_counts(normalize=True).cumsum()

In [None]:
# Running share of listings per total_floor value, from most to least frequent
cumulative_sum = df['total_floor'].value_counts(normalize=True).cumsum()

# Keep the floor counts whose running share is still within 90%
within_90_percent = cumulative_sum.loc[cumulative_sum.le(0.9)]
filtered_numbers = within_90_percent.index.tolist()

# Show them in ascending floor order rather than frequency order
sorted_numbers = sorted(filtered_numbers)

print(sorted_numbers)

### observation 
- 90% of the projects consist of buildings with 3 to 40 floors, along with specific buildings having 42, 45, 50, and 60 floors.

In [None]:
sns.histplot(df['total_floor'],kde=True,bins=50)

### observation 
- Most projects are concentrated on lower floors (near 1–25).
- The distribution is right-skewed, with fewer projects are of higher floors.
- There is a sharp peak around 7 floor building, indicating most of the buildings are of 7 floor
- The data has outliers for higher floors.

In [None]:
sns.boxplot(df['total_floor'],color='lightgreen')
plt.grid()

### Observations:
- most buildings have roughly 0–30 floors.
- There are significant outliers above the upper whisker, indicating some buildings have a very large number of floors.
- The interquartile range (IQR) is between 8 and 30 floors.
- The median building height is around 19 floors.
- outliers are present among the tallest buildings.

In [None]:
# Skewness and kurtosis of total_floor.
# pandas' .kurt() returns *excess* kurtosis (a normal distribution scores 0, not 3).
# The result is stored as `kurtosis_value` so it does not shadow the
# scipy.stats `kurtosis` function imported at the top of the notebook.
skewness = df['total_floor'].skew()
kurtosis_value = df['total_floor'].kurt()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis_value}")

### Observations:

- Skewness: 1.1165  
  - The data is positively skewed (just above the threshold of 1 usually taken as highly skewed): most buildings have fewer floors, with a few very tall buildings causing a longer right tail.  
- Kurtosis: 0.9350  
  - pandas `.kurt()` reports excess kurtosis (normal distribution = 0), so 0.935 is mildly leptokurtic: the tails are slightly heavier than a normal distribution, not lighter.  

In [None]:
# Outlier limits for total_floor, computed on the non-missing values only
cleaned_data_total_floor = df['total_floor'].dropna()

# Both quartiles in one call (NumPy's default linear interpolation)
Q1, Q3 = np.percentile(cleaned_data_total_floor, [25, 75])

# Spread of the middle 50% of the data
IQR = Q3 - Q1

# 1.5 * IQR limits: points beyond them are what a boxplot draws as outliers
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR

# Report the quartiles, the IQR and both limits
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


In [None]:
# Quantile Analysis
quantiles = df['total_floor'].quantile([0.10,0.75,0.80,0.90,0.95,0.98,0.99,1.00])

quantiles

### observation 
- 80% of the projects have less than 35 floors

In [None]:
# Histogram of log1p(total_floor): the log compresses the right tail of tall buildings
sns.histplot(np.log1p(df['total_floor']), kde=True, bins=50, color='lightgreen')
plt.title('Distribution of total_floor (Log Transformed)')
plt.xlabel('log1p(total_floor)')  # the axis shows the transformed value, not the raw floor count
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

### observation 
- Distribution is more symmetric, with reduced skewness.  
- total_floor has no negative values (min is 1), so a plain log1p transform is sufficient here.  
- Outliers impact reduced.  
- Compressed range for better interpretation.  

In [None]:
# Side-by-side boxplots of total_floor before and after the log1p transform.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 6))

# The vector is passed as x= so the box is horizontal in every seaborn version and
# the x-axis label describes the plotted values. A boxplot has no frequency axis,
# so no y-label is set.
sns.boxplot(x=df['total_floor'], color='skyblue', ax=ax_raw)
ax_raw.set_title('Distribution of total_floor (Original)')
ax_raw.set_xlabel('total_floor')

# Same plot on the log1p scale
sns.boxplot(x=np.log1p(df['total_floor']), color='lightgreen', ax=ax_log)
ax_log.set_title('Distribution of total_floor (Log Transformed)')
ax_log.set_xlabel('Log(total_floor)')

fig.tight_layout()
plt.show()

### observation 

- Left Plot (Original):  
  - Highly skewed distribution.  
  - Many upper outliers.   
- Right Plot:  
  - Reduced skewness, more symmetric.  
  - Fewer outliers, compressed range.  
  - Suitable for analyses requiring normality.  

In [None]:
# Skewness and kurtosis of log1p(total_floor); the transform is computed once and reused.
# pandas' .kurt() returns *excess* kurtosis (a normal distribution scores 0, not 3).
# The result is stored as `kurtosis_value` so it does not shadow the
# scipy.stats `kurtosis` function imported at the top of the notebook.
log_total_floor = np.log1p(df['total_floor'])
skewness = log_total_floor.skew()
kurtosis_value = log_total_floor.kurt()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis_value}")

### observation:

- Skewness (-0.25): Data is slightly negatively skewed, with a minor left tail, close to symmetric.  
- Kurtosis (-0.73): Platykurtic distribution with flatter peaks and lighter tails.  

# facing

In [None]:
df['facing'].head()

In [None]:
df['facing'].value_counts()

In [None]:
df['facing'].isna().sum()

In [None]:
df['facing'].value_counts().sort_index().plot(kind='bar')

In [None]:
df['facing'].value_counts(normalize=True).plot(kind='pie',autopct='%0.2f%%')

### observation 
- 3738 missing values
- 58.04% properties have east facing followed by north-east(15.49%) and west(11.23%)
- all other facing are less than 10%

# lattitude 

In [None]:
df['lattitude'].head()

In [None]:
df['lattitude'].value_counts()

In [None]:
df['lattitude'].isna().sum()

In [None]:
df['lattitude'].describe()

### observation 
- 466 missing values
- Mean: 19.15, close to the median, indicating the bulk of the data is roughly symmetric.
- std-dev 0.735, showing low variability within most of the data.
- Min: 9.21, indicating a small subset of data significantly lower than the mean.
- Max: 72.88, showing extreme outliers far above the upper whisker.
- 25th Percentile (Q1): 19.06, tightly packed with the 75th Percentile (Q3): 19.21, confirming low IQR.
- Data is heavily skewed due to the extreme max value.

In [None]:
sns.histplot(df['lattitude'],kde=True,bins=50)

### observation
- The latitude distribution is highly concentrated between 19 and 20.  
- The data is heavily skewed, with minimal values beyond 20.  
- Indicates the dataset is focused on a specific geographical region.

In [None]:
sns.boxplot(df['lattitude'],color='lightgreen')
plt.grid()

### observation 
- Values outside the whiskers are considered outliers.
- The latitude data is concentrated within a very narrow interquartile range (IQR), roughly 19.06–19.21.
- The median latitude lies close to 19.

In [None]:
# Skewness and kurtosis of lattitude.
# pandas' .kurt() returns *excess* kurtosis (a normal distribution scores 0, not 3).
# The result is stored as `kurtosis_value` so it does not shadow the
# scipy.stats `kurtosis` function imported at the top of the notebook.
skewness = df['lattitude'].skew()
kurtosis_value = df['lattitude'].kurt()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis_value}")

### observation 
- Skewness (67.71): Extremely positively skewed distribution with a long right tail.  
- Kurtosis (4986.76): Highly leptokurtic, with a sharp peak and heavy tails.  
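- Most values are concentrated between 19° and 20°, with a few extreme outliers above 50°.  
- The maximum latitude (72.88) is close to the typical longitude of this dataset (about 72.9), so the outliers may be rows with latitude and longitude swapped; check and correct those rows (outlier treatment) before considering transformations such as log or Box-Cox.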

In [None]:
# Outlier limits for lattitude, computed on the non-missing values only
cleaned_data_lattitude = df['lattitude'].dropna()

# Both quartiles in one call (NumPy's default linear interpolation)
Q1, Q3 = np.percentile(cleaned_data_lattitude, [25, 75])

# Spread of the middle 50% of the data
IQR = Q3 - Q1

# 1.5 * IQR limits: points beyond them are what a boxplot draws as outliers
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR

# Report the quartiles, the IQR and both limits
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


# longitude 

In [None]:
df['longitude'].head()

In [None]:
df['longitude'].value_counts()

In [None]:
df['longitude'].isna().sum()

In [None]:
df['longitude'].describe()

### observation 
- mean close to median indicating data is symmetrically distributed
- std-dev 0.716, showing relatively low variability within the dataset.
- Min: 19.02, an extreme outlier far below the majority of values.
- Max: 74.02, close to the mean, suggesting no extreme positive outliers.
- 25th Percentile (Q1): 72.84 and 75th Percentile (Q3): 72.96, with a narrow IQR of approximately 0.12, indicating tightly packed data.
- The outlier at the lower bound (19.02) significantly skews the distribution.

In [None]:
sns.histplot(df['longitude'],kde=True,bins=50)

### observation 
- Majority of longitude values are concentrated near 70.
- Sparse distribution of values below 70.
- Outliers or minimal data points in lower longitude ranges.

In [None]:
sns.boxplot(df['longitude'],color='lightgreen')
plt.grid()

### observation 
- Most longitude values lie within a narrow range near the upper end (around 70).
- A few outliers are present, significantly lower than the main cluster (around 20).

In [None]:
# Skewness and kurtosis of longitude.
# pandas' .kurt() returns *excess* kurtosis (a normal distribution scores 0, not 3).
# The result is stored as `kurtosis_value` so it does not shadow the
# scipy.stats `kurtosis` function imported at the top of the notebook.
skewness = df['longitude'].skew()
kurtosis_value = df['longitude'].kurt()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis_value}")

### observation 
- Skewness (-73.89):  
  - The negative skewness indicates a strong left-skewed distribution.
  - This aligns with the presence of outliers at the lower longitude range, as seen in the boxplot.
- Kurtosis (5543.30):
  - The extremely high kurtosis value suggests the distribution has heavy tails and a sharp peak.
  - This indicates most of the data is tightly concentrated near the central value (around 70), with a few extreme outliers contributing to the heavy tails.
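- The minimum longitude (19.02) is close to the typical latitude of this dataset (about 19.1), so the outliers may be rows with latitude and longitude swapped; check and correct those rows (outlier treatment) before considering transformations such as log or Box-Cox.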

In [None]:
# Outlier limits for longitude, computed on the non-missing values only
cleaned_data_longitude = df['longitude'].dropna()

# Both quartiles in one call (NumPy's default linear interpolation)
Q1, Q3 = np.percentile(cleaned_data_longitude, [25, 75])

# Spread of the middle 50% of the data
IQR = Q3 - Q1

# 1.5 * IQR limits: points beyond them are what a boxplot draws as outliers
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR

# Report the quartiles, the IQR and both limits
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


# seller

In [None]:
df['seller'].head()

In [None]:
df['seller'].value_counts()

In [None]:
df['seller'].isna().sum()

In [None]:
df['seller'].value_counts().sort_index().plot(kind='bar')

In [None]:
df['seller'].value_counts(normalize=True).plot(kind='pie',autopct='%0.2f%%')

### Observations:
- The column represents the entity responsible for selling the property.  
- There are no missing values in this column.  
- 68.19% of properties are listed by estate agents acting as brokers to facilitate sales.  
- 28.79% of properties are being sold directly by individuals (could be owners or estate agents as well).  
- 3.03% of properties are listed for sale by organizations.  

# emi

In [None]:
df['emi'].head()

In [None]:
df['emi'].value_counts()

In [None]:
df['emi'].isna().sum()

In [None]:
df['emi'].describe()

### observation 
- EMI values for flat purchases, measured in lakhs.
- 2 missing values
- EMIs range from ₹745 (min) to ₹36.08 lakh (max).
- Mean EMI is ₹1.31 lakh; median is ₹0.80 lakh.
- 75% of borrowers have an EMI of ₹1.44 lakh (₹1,44,000) or less.
- High standard deviation (₹1.91 lakh) indicates wide variation.
- Data is right-skewed, with some very high EMIs.

In [None]:
sns.histplot(df['emi'],kde=True,bins=50)

### observation
- Most borrowers pay lower EMIs, likely below ₹2 lakh.
- The tail includes outliers, such as the maximum EMI of ₹36.08 lakh.
- A log transformation or outlier handling might be useful for further analysis.

In [None]:
sns.boxplot(df['emi'],color='lightgreen')
plt.grid()

### observation 
- significant right skewness in the data.
- outliers are beyond the upper whisker, including extreme values like ₹36.08 lakh.
- The box (IQR) is narrow, indicating that the middle 50% of data points (25th to 75th percentile) are tightly packed between ₹0.43 lakh and ₹1.44 lakh.
- median EMI (₹0.80 lakh) lies closer to the lower quartile, reflecting the skewed distribution.

In [None]:
# Skewness and kurtosis of emi.
# pandas' .kurt() returns *excess* kurtosis (a normal distribution scores 0, not 3).
# The result is stored as `kurtosis_value` so it does not shadow the
# scipy.stats `kurtosis` function imported at the top of the notebook.
skewness = df['emi'].skew()
kurtosis_value = df['emi'].kurt()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis_value}")

### observation 
- Skewness (6.21): Highly right-skewed distribution with a long tail of high EMI values.
- Kurtosis (61.13): Leptokurtic distribution with a sharp peak and heavy tails, indicating many outliers.

In [None]:
# Outlier limits for emi, computed on the non-missing values only
cleaned_data_emi = df['emi'].dropna()

# Both quartiles in one call (NumPy's default linear interpolation)
Q1, Q3 = np.percentile(cleaned_data_emi, [25, 75])

# Spread of the middle 50% of the data
IQR = Q3 - Q1

# 1.5 * IQR limits: points beyond them are what a boxplot draws as outliers
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR

# Report the quartiles, the IQR and both limits
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


In [None]:
# Quantile Analysis
quantiles = df['emi'].quantile([0.10,0.75,0.80,0.90,0.95,0.98,0.99,1.00])

quantiles

In [None]:
# Histogram of log1p(emi): the log compresses the long right tail of very high EMIs
sns.histplot(np.log1p(df['emi']), kde=True, bins=50, color='lightgreen')
plt.title('Distribution of emi (Log Transformed)')
plt.xlabel('log1p(emi)')  # the axis shows the transformed value, not the raw EMI
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

### observation 
- After applying a log transformation, the EMI distribution becomes closer to normal, reducing skewness.
- The transformed data shows a more balanced spread, with fewer extreme values dominating the tail.

In [None]:
# Side-by-side boxplots of emi before and after the log1p transform.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 6))

# The vector is passed as x= so the box is horizontal in every seaborn version and
# the x-axis label describes the plotted values. A boxplot has no frequency axis,
# so no y-label is set.
sns.boxplot(x=df['emi'], color='skyblue', ax=ax_raw)
ax_raw.set_title('Distribution of emi (Original)')
ax_raw.set_xlabel('emi')

# Same plot on the log1p scale
sns.boxplot(x=np.log1p(df['emi']), color='lightgreen', ax=ax_log)
ax_log.set_title('Distribution of emi (Log Transformed)')
ax_log.set_xlabel('Log(emi)')

fig.tight_layout()
plt.show()

### observation 
**Left: Original EMI Boxplot**
- The original EMI data exhibits significant skewness with many outliers beyond the upper whisker.  
- Extreme values like ₹36.08 lakh are visible as outliers, affecting the overall distribution.  

**Right: Log-Transformed EMI Boxplot**
- The log transformation reduces the number of visible outliers, compressing the scale of high EMI values.  
- The median and IQR are now more representative of the data, providing a clearer picture of central tendency and variability.  

In [None]:
# Skewness and kurtosis of log1p(emi); the transform is computed once and reused.
# pandas' .kurt() returns *excess* kurtosis (a normal distribution scores 0, not 3).
# The result is stored as `kurtosis_value` so it does not shadow the
# scipy.stats `kurtosis` function imported at the top of the notebook.
log_emi = np.log1p(df['emi'])
skewness = log_emi.skew()
kurtosis_value = log_emi.kurt()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis_value}")

### observation 
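- Skewness (1.58): Reduced after log transformation, but still right-skewed (above 1).
- Kurtosis (3.56): Much lower than before; since pandas reports excess kurtosis (normal = 0), the tails are still heavier than a normal distribution, though outlier influence is reduced.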

# project_in_acres

In [None]:
df['project_in_acres'].head()

In [None]:
df['project_in_acres'].value_counts()

In [None]:
df['project_in_acres'].isna().sum()

In [None]:
df['project_in_acres'].describe()

### observation 
- project built on the land in acres
- 5758 missing values
- ranges from 0 to 55000
- mean is 27.38 acre and median is 3 acre
- 75% properties have built on land 8.49acre or less
- high std-dev indicates high variation
- data is right skewed with extreme high acres values 

In [None]:
sns.histplot(df['project_in_acres'],kde=True,bins=50)

### observation 
- Highly right-skewed.  
  Most projects have small acre land.  
- few projects with extremely high acre land (>10,000 acres).  
- Large variation in project sizes.

In [None]:
sns.boxplot(df['project_in_acres'],color='lightgreen')
plt.grid()

### observation 
- Extreme values above 10,000 acres 
- most data concentrated near lower values.  
- High range with a few projects occupying vast land.

In [None]:
# Skewness and kurtosis of project_in_acres, computed in one aggregation.
# NOTE(review): assigning to `kurtosis` rebinds the name imported from scipy.stats in the first cell.
skewness, kurtosis = df['project_in_acres'].agg(['skew', 'kurt']).to_numpy()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis}")

### observation 
- Skewness: 70.83, highly right-skewed distribution.  
- Kurtosis: 5283.91, sharp peaks and heavy tails due to outliers.  
- Data has extreme outliers, requiring transformation or outlier handling.  

In [None]:
# Quartiles and 1.5*IQR fences for project_in_acres.
# Missing values are removed first; np.percentile would otherwise return NaN.
cleaned_data_project_in_acres = df['project_in_acres'].dropna()

# Q1 and Q3 from a single percentile call
Q1, Q3 = np.percentile(cleaned_data_project_in_acres, [25, 75])

# Inter-quartile range, then the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR
IQR = Q3 - Q1
lower_whisker, upper_whisker = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Report the results
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


In [None]:
# Quantile Analysis
quantiles = df['project_in_acres'].quantile([0.10,0.75,0.80,0.90,0.95,0.98,0.99,1.00])

quantiles

In [None]:
# Histogram (with KDE overlay) of log1p-transformed project_in_acres
sns.histplot(np.log1p(df['project_in_acres']), kde=True, bins=50, color='lightgreen')
plt.title('Distribution of project_in_acres (Log Transformed)')
# The plotted values are log1p(acres), not raw acres, so the x-axis is labelled accordingly
plt.xlabel('Log(project_in_acres)')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

### observation 
- Reduces skewness, making distribution more normal-like.  
- Majority of projects fall in lower acreage categories after transformation.  

In [None]:
# Side-by-side box plots: raw vs. log1p-transformed project_in_acres
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 6))

# Raw values. The Series is passed as `x=` so the box is always horizontal and the
# value axis really is the x-axis; a bare positional Series is interpreted
# differently across seaborn versions. A box plot has no frequency axis, so the
# former 'Frequency' y-label is dropped.
sns.boxplot(x=df['project_in_acres'], color='skyblue', ax=ax_raw)
ax_raw.set_title('Distribution of project_in_acres (Original)')
ax_raw.set_xlabel('project_in_acres')

# log1p-transformed values
sns.boxplot(x=np.log1p(df['project_in_acres']), color='lightgreen', ax=ax_log)
ax_log.set_title('Distribution of project_in_acres (Log Transformed)')
ax_log.set_xlabel('Log(project_in_acres)')

plt.tight_layout()
plt.show()

### observation 
Box Plot - Log Transformed Data:
- Reduced outlier impact after log transformation.  
- More compact distribution with fewer extreme deviations.  

In [None]:
# Skewness and kurtosis of log1p(project_in_acres), taken from a single transformed Series.
# NOTE(review): assigning to `kurtosis` rebinds the name imported from scipy.stats in the first cell.
skewness, kurtosis = np.log1p(df['project_in_acres']).agg(['skew', 'kurt']).to_numpy()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis}")

### observation 
- Skewness: 1.33, moderately right-skewed.  
- Kurtosis: 2.80 (excess kurtosis, where a normal distribution scores 0), so the distribution is still leptokurtic, i.e. heavier-tailed than normal.  
- Log transformation significantly reduces skewness and tail effects, improving distribution.  

# flooring_cluster

In [None]:
df['flooring_cluster'].head()

In [None]:
df['flooring_cluster'].value_counts()

In [None]:
# flooring_cluster was cast to str near the top of the notebook (astype(str)), which turns
# genuine NaNs into the literal string 'nan'; a plain .isna() on the column would then always
# report 0. Converting back to numeric first restores real missing values for the count.
pd.to_numeric(df['flooring_cluster'], errors='coerce').isna().sum()

In [None]:
# flooring_cluster holds strings after the astype(str) cast near the top of the notebook, so
# comparing it with the integer 0 never matches. Compare on the numeric value instead, which
# works whether the label is stored as '0' or '0.0'.
df[pd.to_numeric(df['flooring_cluster'], errors='coerce') == 0].shape

In [None]:
df['flooring_cluster'].value_counts().sort_index().plot(kind='bar')

In [None]:
df['flooring_cluster'].value_counts(normalize=True).plot(kind='pie',autopct='%0.2f%%')

In [None]:
df['flooring_cluster'].value_counts(normalize=True).cumsum()

### observation 
- The flooring cluster indicates the types of flooring used in a room. The types of flooring include: ['ceramic tiles', 'granite', 'marble', 'marbonite', 'mosaic', 'normal tiles/kotah stone', 'unknown', 'vitrified', 'wooden'].   
- A value of 1 (38.60%) means it can represent either a single flooring type or a combination of several types from the list. For example, flooring_cluster = 1 could mean just 'ceramic tiles' or a mix like ['ceramic tiles', 'granite']. The same applies to values 2 and 3, as the grouping was done during data cleaning.

# flooring_score

In [None]:
df['assigned_flooring_score'].head()

In [None]:
df['assigned_flooring_score'].value_counts()

In [None]:
df['assigned_flooring_score'].value_counts(normalize=True).cumsum()

In [None]:
df['assigned_flooring_score'].isna().sum()

In [None]:
df['assigned_flooring_score'].describe()

### observation 
- The flooring column was revisited, this time defining a weightage for each flooring type.
- A lower flooring score indicates fewer and less expensive flooring types, while a higher score suggests a greater variety of flooring, potentially at a higher cost.
- 4453 missing values
- Average weightage is 12.93.
- High variability (standard deviation 10.36).
- Outlier at 56.
- Flooring score ranges from 4 to 56.
- 90% of the data have one of these flooring scores: 4, 6, 7, 8, 9, 10, 13, 15, 16, 17, 18, 19, 22, 23, 25, 26, 56

In [None]:
sns.histplot(df['assigned_flooring_score'],kde=True,bins=50)

### observation
- The flooring scores are highly skewed, with most values concentrated around 7.
- Significant peaks at 7 reflect the popular flooring choices (e.g. vitrified). 
- A few higher scores (above 20) indicate outliers.

In [None]:
sns.boxplot(df['assigned_flooring_score'],color='lightgreen')
plt.grid()

### observation
- Flooring score is centered at 7
- Most data falls between 6 and 10, indicating these are the most common scores.
- A significant number of outliers exist above the upper whisker, with the maximum value reaching 56.
- The presence of many outliers suggests a right-skewed distribution.

In [None]:
# Skewness and kurtosis of assigned_flooring_score, computed in one aggregation.
# NOTE(review): assigning to `kurtosis` rebinds the name imported from scipy.stats in the first cell.
skewness, kurtosis = df['assigned_flooring_score'].agg(['skew', 'kurt']).to_numpy()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis}")

### observation 
- Skewness (2.29): Indicates a highly positively skewed distribution with a long right tail.  
- Kurtosis (5.48): Suggests a leptokurtic distribution with heavy tails and a sharp peak.  
- Data is concentrated on the left with extreme values in the right tail.

In [None]:
# Quartiles and 1.5*IQR fences for assigned_flooring_score.
# Missing values are removed first; np.percentile would otherwise return NaN.
cleaned_data_assigned_flooring_score = df['assigned_flooring_score'].dropna()

# Q1 and Q3 from a single percentile call
Q1, Q3 = np.percentile(cleaned_data_assigned_flooring_score, [25, 75])

# Inter-quartile range, then the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR
IQR = Q3 - Q1
lower_whisker, upper_whisker = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Report the results
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


In [None]:
# Histogram (with KDE overlay) of log1p-transformed assigned_flooring_score
sns.histplot(np.log1p(df['assigned_flooring_score']), kde=True, bins=50, color='lightgreen')
plt.title('Distribution of assigned_flooring_score (Log Transformed)')
# The plotted values are log1p(score), not the raw score, so the x-axis is labelled accordingly
plt.xlabel('Log(assigned_flooring_score)')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

### observation 
- The logarithmic transformation has effectively compressed the data range, reducing the impact of extreme values.
- The distribution is still skewed to the right but appears more normalized compared to the original data.

In [None]:
# Side-by-side box plots: raw vs. log1p-transformed assigned_flooring_score
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 6))

# Raw values. The Series is passed as `x=` so the box is always horizontal and the
# value axis really is the x-axis; a bare positional Series is interpreted
# differently across seaborn versions. A box plot has no frequency axis, so the
# former 'Frequency' y-label is dropped.
sns.boxplot(x=df['assigned_flooring_score'], color='skyblue', ax=ax_raw)
ax_raw.set_title('Distribution of assigned_flooring_score (Original)')
ax_raw.set_xlabel('assigned_flooring_score')

# log1p-transformed values
sns.boxplot(x=np.log1p(df['assigned_flooring_score']), color='lightgreen', ax=ax_log)
ax_log.set_title('Distribution of assigned_flooring_score (Log Transformed)')
ax_log.set_xlabel('Log(assigned_flooring_score)')

plt.tight_layout()
plt.show()

### observation  
Log-Transformed Distribution:
- After applying a logarithmic transformation, the data distribution appears more normalized and symmetric.
- The transformation reduces the effect of outliers, as they are now within a more acceptable range.
- The spread of the data is more balanced compared to the original distribution.

In [None]:
# Skewness and kurtosis of log1p(assigned_flooring_score), taken from a single transformed Series.
# NOTE(review): assigning to `kurtosis` rebinds the name imported from scipy.stats in the first cell.
skewness, kurtosis = np.log1p(df['assigned_flooring_score']).agg(['skew', 'kurt']).to_numpy()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis}")

### observation 
- Skewness reduced to 1.16, indicating moderate positive skewness remains.
- Kurtosis decreased to 0.40, showing reduced outliers and a more normal-like distribution.
- Log transformation effectively brought the data closer to normality but slight skewness persists.

In [None]:
df.groupby("flooring_cluster")["assigned_flooring_score"].mean()

In [None]:
# Spread of the flooring score within each flooring cluster.
# seaborn is already imported as `sns` in the first cell, so no re-import is needed here.
sns.boxplot(x=df["flooring_cluster"], y=df["assigned_flooring_score"])


### observation 
- Cluster 0 → No flooring (unknown).
- Cluster 1 → Low-scoring flooring, many outliers.
- Cluster 2 → Mid-range flooring, balanced spread.
- Cluster 3 → High-end flooring, wide variance.
- Clear separation → K-Means captured flooring patterns well.

# assigned_amenities_score

In [None]:
df['assigned_amenities_score'].head()

In [None]:
df['assigned_amenities_score'].value_counts()

In [None]:
df['assigned_amenities_score'].isna().sum()

In [None]:
df[df['assigned_amenities_score'] == 0].shape

In [None]:
df['assigned_amenities_score'].describe()


### Observations:
- 5108 missing values
- A lower amenities score indicates fewer and less expensive amenities, while a higher score suggests more amenities, potentially at a higher cost.
- The mean (145.91) is higher than the median (113), indicating a right-skewed distribution.
- High variability with a standard deviation of 100.52.
- The maximum score is much higher than the 75th percentile, indicating the presence of outliers in the data.
- Amenities scores start from a minimum of 5.

In [None]:
sns.histplot(df['assigned_amenities_score'],kde=True,bins=50)

### Observations:
- The histogram confirms a right-skewed distribution, with most values concentrated at lower scores.
- The highest frequency is observed around 50–100 amenities score.
- Sparse values are seen beyond 400, indicating potential outliers.
- The Kernel Density Estimate (KDE) aligns with the histogram, further highlighting the skewness and the presence of a long tail.

In [None]:
sns.boxplot(df['assigned_amenities_score'],color='lightgreen')
plt.grid()

### Observations:
- IQR ranges from 70 (25th percentile) to 207 (75th percentile).
- The median (113) is closer to the lower quartile, indicating skewness.
- Outliers above 400.
- Whiskers extend from 5 to just above 400.

In [None]:
# Skewness and kurtosis of assigned_amenities_score, computed in one aggregation.
# NOTE(review): assigning to `kurtosis` rebinds the name imported from scipy.stats in the first cell.
skewness, kurtosis = df['assigned_amenities_score'].agg(['skew', 'kurt']).to_numpy()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis}")

### observation 
- Skewness 1.01, indicating a moderately right-skewed distribution.  
- Kurtosis 0.35 (excess kurtosis), suggesting tails only slightly heavier than those of a normal distribution.  

In [None]:
# Quartiles and 1.5*IQR fences for assigned_amenities_score.
# Missing values are removed first; np.percentile would otherwise return NaN.
cleaned_data_assigned_amenities_score = df['assigned_amenities_score'].dropna()

# Q1 and Q3 from a single percentile call
Q1, Q3 = np.percentile(cleaned_data_assigned_amenities_score, [25, 75])

# Inter-quartile range, then the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR
IQR = Q3 - Q1
lower_whisker, upper_whisker = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Report the results
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


In [None]:
# Histogram (with KDE overlay) of log1p-transformed assigned_amenities_score
sns.histplot(np.log1p(df['assigned_amenities_score']), kde=True, bins=50, color='lightgreen')
plt.title('Distribution of assigned_amenities_score (Log Transformed)')
# The plotted values are log1p(score), not the raw score, so the x-axis is labelled accordingly
plt.xlabel('Log(assigned_amenities_score)')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

### observation 
- The distribution is approximately normal, with a peak (highest frequency) around a log-transformed score of 5 (on the log1p scale).  
- There is a left skew, indicating some lower scores are less frequent.  
- The data appears smoothed using a kernel density estimation curve, matching the general shape of the histogram.  

In [None]:
# Side-by-side box plots: raw vs. log1p-transformed assigned_amenities_score
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 6))

# Raw values. The Series is passed as `x=` so the box is always horizontal and the
# value axis really is the x-axis; a bare positional Series is interpreted
# differently across seaborn versions. A box plot has no frequency axis, so the
# former 'Frequency' y-label is dropped.
sns.boxplot(x=df['assigned_amenities_score'], color='skyblue', ax=ax_raw)
ax_raw.set_title('Distribution of assigned_amenities_score (Original)')
ax_raw.set_xlabel('assigned_amenities_score')

# log1p-transformed values
sns.boxplot(x=np.log1p(df['assigned_amenities_score']), color='lightgreen', ax=ax_log)
ax_log.set_title('Distribution of assigned_amenities_score (Log Transformed)')
ax_log.set_xlabel('Log(assigned_amenities_score)')

plt.tight_layout()
plt.show()

### observation

- Original Data (Left) The "assigned_amenities_score" has a right-skewed distribution with several outliers above the upper whisker.  
- Log-Transformed Data (Right) The log transformation reduces skewness, making the distribution more symmetric and minimizing the impact of outliers.

In [None]:
# Skewness and kurtosis of log1p(assigned_amenities_score), taken from a single transformed Series.
# NOTE(review): assigning to `kurtosis` rebinds the name imported from scipy.stats in the first cell.
skewness, kurtosis = np.log1p(df['assigned_amenities_score']).agg(['skew', 'kurt']).to_numpy()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis}")

### observation
- Skewness (-0.403): The log-transformed data is slightly left-skewed but close to symmetric (a perfectly symmetric distribution has a skewness of 0).    
- Kurtosis (0.044): The distribution has a kurtosis close to 0, indicating it is similar to a normal distribution in terms of tail behavior and peakedness.  

# education 

In [None]:
df['education'].head(10)

### observation 
- Extract the list of distances for each property and then apply log1p (the logarithm of 1 + value) to the distances and then calculate the average

In [None]:
# Function to extract the numeric distances that precede 'km'
def extract_km_values(row):
    """Return every distance (in km, as float) mentioned in a text cell.

    Parameters
    ----------
    row : str or missing value
        Raw cell text, e.g. ``"School A 1.5 km, College B 3 km"``.

    Returns
    -------
    list of float, or None
        ``None`` when the cell is missing (``pd.isnull``); otherwise the
        distances in order of appearance (an empty list when none are found).
    """
    if pd.isnull(row):
        return None
    # Accept whole numbers ("3 km") and bare fractions (".5 km") as well as
    # decimals ("1.5 km"). The previous pattern required a decimal point and
    # silently dropped integer distances, biasing the per-row average.
    return [float(val) for val in re.findall(r'(\d+(?:\.\d+)?|\.\d+)\s*km', row)]

# Function to average the log1p-transformed distances of one row
def log1p_and_average(distances):
    """Mean of ``log1p(d)`` over the distances of a single row.

    Returns ``None`` when ``distances`` is ``None`` or empty.
    """
    if distances is not None and len(distances) > 0:
        return np.mean([np.log1p(km) for km in distances])
    return None

# Build two derived columns from the raw 'education' text:
#   education_extracted_km - per-row output of extract_km_values (list of km floats, or None)
#   education_log1p_avg_km - per-row output of log1p_and_average applied to that list
df['education_extracted_km'] = df['education'].apply(extract_km_values)
df['education_log1p_avg_km'] = df['education_extracted_km'].apply(log1p_and_average)

In [None]:
df['education_extracted_km'].head(20)

In [None]:
df['education_log1p_avg_km'].head(20) 

In [None]:
df['education_log1p_avg_km'].isna().sum()

In [None]:
df['education_log1p_avg_km'].describe()

In [None]:
df[df['education_log1p_avg_km'] <= 0.5].shape

In [None]:
df[df['education_log1p_avg_km'] < 1].shape

### observation 
- There are 4,266 missing values.
- A value close to 0 indicates that the school or college is near the flat, while a higher value means it is farther away from the flat.
- Standard deviation is 0.69, showing moderate variability

In [None]:
sns.histplot(df['education_log1p_avg_km'],kde=True,bins=50)

### observation 
- histogram confirms a right-skewed distribution, with most values concentrated at lower log1p distances. 
- The highest frequency is observed around 1.0–1.5 education_log1p_avg_km, indicating that most distances fall within this range
- Sparse values are seen beyond 3.0 education_log1p_avg_km, suggesting potential outliers

In [None]:
sns.boxplot(df['education_log1p_avg_km'],color='lightgreen')
plt.grid()

### observation 
- IQR ranges from 0.95 (25th percentile) to 1.63 (75th percentile).
- Outliers are present above 2.6 education_log1p_avg_km.

In [None]:
# Skewness and kurtosis of education_log1p_avg_km, computed in one aggregation.
# NOTE(review): assigning to `kurtosis` rebinds the name imported from scipy.stats in the first cell.
skewness, kurtosis = df['education_log1p_avg_km'].agg(['skew', 'kurt']).to_numpy()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis}")

### observation 
- Skewness: 1.72, indicating a right-skewed distribution.
- Kurtosis: 3.47, suggesting the distribution is leptokurtic: it has heavier tails and is more peaked than a normal distribution.

In [None]:
# Quartiles and 1.5*IQR fences for education_log1p_avg_km.
# Missing values are removed first; np.percentile would otherwise return NaN.
cleaned_data_education_log1p_avg_km = df['education_log1p_avg_km'].dropna()

# Q1 and Q3 from a single percentile call
Q1, Q3 = np.percentile(cleaned_data_education_log1p_avg_km, [25, 75])

# Inter-quartile range, then the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR
IQR = Q3 - Q1
lower_whisker, upper_whisker = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Report the results
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


# transport

In [None]:
df['transport'].head()

### observation 
- Extract the list of distances for each property and then apply log1p (the logarithm of 1 + value) to the distances and then calculate the average

In [None]:
# Function to extract the numeric distances that precede 'km'
def extract_km_values(row):
    """Return every distance (in km, as float) mentioned in a text cell.

    Parameters
    ----------
    row : str or missing value
        Raw cell text, e.g. ``"Metro 1.5 km, Bus stop 3 km"``.

    Returns
    -------
    list of float, or None
        ``None`` when the cell is missing (``pd.isnull``); otherwise the
        distances in order of appearance (an empty list when none are found).
    """
    if pd.isnull(row):
        return None
    # Accept whole numbers ("3 km") and bare fractions (".5 km") as well as
    # decimals ("1.5 km"). The previous pattern required a decimal point and
    # silently dropped integer distances, biasing the per-row average.
    return [float(val) for val in re.findall(r'(\d+(?:\.\d+)?|\.\d+)\s*km', row)]

# Function to average the log1p-transformed distances of one row
def log1p_and_average(distances):
    """Mean of ``log1p(d)`` over the distances of a single row.

    Returns ``None`` when ``distances`` is ``None`` or empty.
    """
    if distances is not None and len(distances) > 0:
        return np.mean([np.log1p(km) for km in distances])
    return None

# Build two derived columns from the raw 'transport' text:
#   transport_extracted_km - per-row output of extract_km_values (list of km floats, or None)
#   transport_log1p_avg_km - per-row output of log1p_and_average applied to that list
df['transport_extracted_km'] = df['transport'].apply(extract_km_values)
df['transport_log1p_avg_km'] = df['transport_extracted_km'].apply(log1p_and_average)

In [None]:
df['transport_extracted_km'].head(20)

In [None]:
df['transport_log1p_avg_km'].head(20) 

In [None]:
df['transport_log1p_avg_km'].isna().sum()

In [None]:
df['transport_log1p_avg_km'].describe()

In [None]:
df[df['transport_log1p_avg_km'] <= 0.5].shape

In [None]:
df[df['transport_log1p_avg_km'] < 1].shape

### observation 
- There are 6260 missing values.
- A value close to 0 indicates that the transport facility is near the flat, while a higher value means it is farther away from the flat.
- Standard deviation is 0.67, showing moderate variability

In [None]:
sns.histplot(df['transport_log1p_avg_km'],kde=True,bins=50)

### observation 
- histogram confirms a right-skewed distribution, with most values concentrated at lower log1p distances. 
- The highest frequency is observed around 0.8–1.4 transport_log1p_avg_km, indicating that most distances fall within this range
- Sparse values are seen beyond 2.5 transport_log1p_avg_km, suggesting potential outliers

In [None]:
sns.boxplot(df['transport_log1p_avg_km'],color='lightgreen')
plt.grid()

### observation 
- IQR ranges from 0.87 (25th percentile) to 1.42 (75th percentile).
- Outliers are present above 2.23 transport_log1p_avg_km.

In [None]:
# Skewness and kurtosis of transport_log1p_avg_km, computed in one aggregation.
# NOTE(review): assigning to `kurtosis` rebinds the name imported from scipy.stats in the first cell.
skewness, kurtosis = df['transport_log1p_avg_km'].agg(['skew', 'kurt']).to_numpy()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis}")

### observation 
- Skewness: 0.69, data is slightly skewed to the right
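- Kurtosis: 1.44 (excess kurtosis, where a normal distribution scores 0), so the data has somewhat heavier tails than a normal distribution (more extreme values, not fewer).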

In [None]:
# Quartiles and 1.5*IQR fences for transport_log1p_avg_km.
# Missing values are removed first; np.percentile would otherwise return NaN.
cleaned_data_transport_log1p_avg_km = df['transport_log1p_avg_km'].dropna()

# Q1 and Q3 from a single percentile call
Q1, Q3 = np.percentile(cleaned_data_transport_log1p_avg_km, [25, 75])

# Inter-quartile range, then the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR
IQR = Q3 - Q1
lower_whisker, upper_whisker = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Report the results
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)

# shopping_centre

In [None]:
df['shopping_centre'].head()

### observation 
- Extract the list of distances for each property and then apply log1p (the logarithm of 1 + value) to the distances and then calculate the average

In [None]:
# Function to extract the numeric distances that precede 'km'
def extract_km_values(row):
    """Return every distance (in km, as float) mentioned in a text cell.

    Parameters
    ----------
    row : str or missing value
        Raw cell text, e.g. ``"Mall A 1.5 km, Market B 3 km"``.

    Returns
    -------
    list of float, or None
        ``None`` when the cell is missing (``pd.isnull``); otherwise the
        distances in order of appearance (an empty list when none are found).
    """
    if pd.isnull(row):
        return None
    # Accept whole numbers ("3 km") and bare fractions (".5 km") as well as
    # decimals ("1.5 km"). The previous pattern required a decimal point and
    # silently dropped integer distances, biasing the per-row average.
    return [float(val) for val in re.findall(r'(\d+(?:\.\d+)?|\.\d+)\s*km', row)]

# Function to average the log1p-transformed distances of one row
def log1p_and_average(distances):
    """Mean of ``log1p(d)`` over the distances of a single row.

    Returns ``None`` when ``distances`` is ``None`` or empty.
    """
    if distances is not None and len(distances) > 0:
        return np.mean([np.log1p(km) for km in distances])
    return None

# Build two derived columns from the raw 'shopping_centre' text:
#   shopping_centre_extracted_km - per-row output of extract_km_values (list of km floats, or None)
#   shopping_centre_log1p_avg_km - per-row output of log1p_and_average applied to that list
df['shopping_centre_extracted_km'] = df['shopping_centre'].apply(extract_km_values)
df['shopping_centre_log1p_avg_km'] = df['shopping_centre_extracted_km'].apply(log1p_and_average)

In [None]:
df['shopping_centre_extracted_km'].head(20)

In [None]:
df['shopping_centre_log1p_avg_km'].head(20) 

In [None]:
df['shopping_centre_log1p_avg_km'].isna().sum()

In [None]:
df['shopping_centre_log1p_avg_km'].describe()

In [None]:
df[df['shopping_centre_log1p_avg_km'] <= 0.5].shape

In [None]:
df[df['shopping_centre_log1p_avg_km'] < 1].shape

### observation 
- There are 3887 missing values.
- A value close to 0 indicates that the shopping centre is near the flat, while a higher value means it is farther away from the flat.
- Standard deviation is 0.52, showing moderate variability

In [None]:
sns.histplot(df['shopping_centre_log1p_avg_km'],kde=True,bins=50)

### observation 
- histogram confirms a right-skewed distribution, with most values concentrated at lower log1p distances. 
- The highest frequency is observed around 0.8–1.55 shopping_centre_log1p_avg_km, indicating that most distances fall within this range
- Sparse values are seen beyond 2.7 shopping_centre_log1p_avg_km, suggesting potential outliers

In [None]:
sns.boxplot(df['shopping_centre_log1p_avg_km'],color='lightgreen')
plt.grid()

### observation 
- IQR ranges from 0.8 (25th percentile) to 1.55 (75th percentile).
- The median (0.97) is closer to the lower quartile, indicating right skewness.
- Outliers are present above 2.7 shopping_centre_log1p_avg_km.

In [None]:
# Skewness and kurtosis of shopping_centre_log1p_avg_km, computed in one aggregation.
# NOTE(review): assigning to `kurtosis` rebinds the name imported from scipy.stats in the first cell.
skewness, kurtosis = df['shopping_centre_log1p_avg_km'].agg(['skew', 'kurt']).to_numpy()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis}")

### observation 
- Skewness: 0.77, indicating a moderate right-skewed distribution.
- Kurtosis: 0.96, suggests that the distribution is slightly more peaked and has heavier tails compared to a normal distribution.

In [None]:
# Quartiles and 1.5*IQR fences for shopping_centre_log1p_avg_km.
# Missing values are removed first; np.percentile would otherwise return NaN.
cleaned_data_shopping_centre_log1p_avg_km = df['shopping_centre_log1p_avg_km'].dropna()

# Q1 and Q3 from a single percentile call
Q1, Q3 = np.percentile(cleaned_data_shopping_centre_log1p_avg_km, [25, 75])

# Inter-quartile range, then the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR
IQR = Q3 - Q1
lower_whisker, upper_whisker = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Report the results
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


# commercial_hub

In [None]:
df['commercial_hub'].head(10)

### observation 
- Extract the list of distances for each property and then apply log1p (the logarithm of 1 + value) to the distances and then calculate the average

In [None]:
# Function to extract the numeric distances that precede 'km'
def extract_km_values(row):
    """Return every distance (in km, as float) mentioned in a text cell.

    Parameters
    ----------
    row : str or missing value
        Raw cell text, e.g. ``"Tech park 1.5 km, Business hub 3 km"``.

    Returns
    -------
    list of float, or None
        ``None`` when the cell is missing (``pd.isnull``); otherwise the
        distances in order of appearance (an empty list when none are found).
    """
    if pd.isnull(row):
        return None
    # Accept whole numbers ("3 km") and bare fractions (".5 km") as well as
    # decimals ("1.5 km"). The previous pattern required a decimal point and
    # silently dropped integer distances, biasing the per-row average.
    return [float(val) for val in re.findall(r'(\d+(?:\.\d+)?|\.\d+)\s*km', row)]

# Function to average the log1p-transformed distances of one row
def log1p_and_average(distances):
    """Mean of ``log1p(d)`` over the distances of a single row.

    Returns ``None`` when ``distances`` is ``None`` or empty.
    """
    if distances is not None and len(distances) > 0:
        return np.mean([np.log1p(km) for km in distances])
    return None

# Build two derived columns from the raw 'commercial_hub' text:
#   commercial_hub_extracted_km - per-row output of extract_km_values (list of km floats, or None)
#   commercial_hub_log1p_avg_km - per-row output of log1p_and_average applied to that list
df['commercial_hub_extracted_km'] = df['commercial_hub'].apply(extract_km_values)
df['commercial_hub_log1p_avg_km'] = df['commercial_hub_extracted_km'].apply(log1p_and_average)

In [None]:
df['commercial_hub_extracted_km'].head(20)

In [None]:
df['commercial_hub_log1p_avg_km'].head(20) 

In [None]:
df['commercial_hub_log1p_avg_km'].isna().sum()

In [None]:
df['commercial_hub_log1p_avg_km'].describe()

In [None]:
df[df['commercial_hub_log1p_avg_km'] <= 0.5].shape

In [None]:
df[df['commercial_hub_log1p_avg_km'] < 1].shape

### observation 
- There are 5149 missing values.
- A value close to 0 indicates that the commercial_hub is near the flat, while a higher value means it is farther away from the flat.
- Standard deviation is 0.72, showing moderate variability

In [None]:
sns.histplot(df['commercial_hub_log1p_avg_km'],kde=True,bins=50)

### observation 
- histogram confirms a right-skewed distribution, with most values concentrated at lower log1p distances. 
- The highest frequency is observed around 1.13–2.0 commercial_hub_log1p_avg_km, indicating that most distances fall within this range
- Sparse values are seen beyond 3.4 commercial_hub_log1p_avg_km, suggesting potential outliers

In [None]:
sns.boxplot(df['commercial_hub_log1p_avg_km'],color='lightgreen')
plt.grid()

### observation 
- IQR ranges from 1.13 (25th percentile) to 2.0 (75th percentile).
- Outliers are present above 3.4 commercial_hub_log1p_avg_km.

In [None]:
# Skewness and kurtosis of commercial_hub_log1p_avg_km, computed in one aggregation.
# NOTE(review): assigning to `kurtosis` rebinds the name imported from scipy.stats in the first cell.
skewness, kurtosis = df['commercial_hub_log1p_avg_km'].agg(['skew', 'kurt']).to_numpy()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis}")

### observation 
- Skewness: 0.50, indicating a moderate right-skewed distribution.
- Kurtosis: 0.23, suggests that the distribution is slightly more peaked and has slightly heavier tails compared to a normal distribution.

In [None]:
# Quartiles and 1.5*IQR fences for commercial_hub_log1p_avg_km.
# Missing values are removed first; np.percentile would otherwise return NaN.
cleaned_data_commercial_hub_log1p_avg_km = df['commercial_hub_log1p_avg_km'].dropna()

# Q1 and Q3 from a single percentile call
Q1, Q3 = np.percentile(cleaned_data_commercial_hub_log1p_avg_km, [25, 75])

# Inter-quartile range, then the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR
IQR = Q3 - Q1
lower_whisker, upper_whisker = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Report the results
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


# hospital

In [None]:
df['hospital'].head(10)

### observation 
- Extract the list of distances for each property and then apply log1p (the logarithm of 1 + value) to the distances and then calculate the average

In [None]:
# Function to extract the numeric distances that precede 'km'
def extract_km_values(row):
    """Return every distance (in km, as float) mentioned in a text cell.

    Parameters
    ----------
    row : str or missing value
        Raw cell text, e.g. ``"Hospital A 1.5 km, Clinic B 3 km"``.

    Returns
    -------
    list of float, or None
        ``None`` when the cell is missing (``pd.isnull``); otherwise the
        distances in order of appearance (an empty list when none are found).
    """
    if pd.isnull(row):
        return None
    # Accept whole numbers ("3 km") and bare fractions (".5 km") as well as
    # decimals ("1.5 km"). The previous pattern required a decimal point and
    # silently dropped integer distances, biasing the per-row average.
    return [float(val) for val in re.findall(r'(\d+(?:\.\d+)?|\.\d+)\s*km', row)]

# Function to average the log1p-transformed distances of one row
def log1p_and_average(distances):
    """Mean of ``log1p(d)`` over the distances of a single row.

    Returns ``None`` when ``distances`` is ``None`` or empty.
    """
    if distances is not None and len(distances) > 0:
        return np.mean([np.log1p(km) for km in distances])
    return None

# Build two derived columns from the raw 'hospital' text:
#   hospital_extracted_km - per-row output of extract_km_values (list of km floats, or None)
#   hospital_log1p_avg_km - per-row output of log1p_and_average applied to that list
df['hospital_extracted_km'] = df['hospital'].apply(extract_km_values)
df['hospital_log1p_avg_km'] = df['hospital_extracted_km'].apply(log1p_and_average)

In [None]:
df['hospital_extracted_km'].head(20)

In [None]:
df['hospital_log1p_avg_km'].head(20) 

In [None]:
df['hospital_log1p_avg_km'].isna().sum()

In [None]:
df['hospital_log1p_avg_km'].describe()

In [None]:
df[df['hospital_log1p_avg_km'] <= 0.5].shape

In [None]:
df[df['hospital_log1p_avg_km'] < 1].shape

### observation 
- There are 8708 missing values.
- A value close to 0 indicates that the hospital is near the flat, while a higher value means it is farther away from the flat.
- Standard deviation is 0.25, showing moderate variability

In [None]:
sns.histplot(df['hospital_log1p_avg_km'],kde=True,bins=50)

### observation 
- histogram confirms a right-skewed distribution, with most values concentrated at lower log1p distances. 
- The highest frequency is observed around 0.53–0.83 hospital_log1p_avg_km, indicating that most distances fall within this range
- fewer observations as the values increase beyond 0.8.

In [None]:
sns.boxplot(df['hospital_log1p_avg_km'],color='lightgreen')
plt.grid()

### observation 
- IQR ranges from 0.53 (25th percentile) to 0.83 (75th percentile).
- Outliers are present above 1.28 hospital_log1p_avg_km.

In [None]:
# Skewness and kurtosis of hospital_log1p_avg_km, computed in one aggregation.
# NOTE(review): assigning to `kurtosis` rebinds the name imported from scipy.stats in the first cell.
skewness, kurtosis = df['hospital_log1p_avg_km'].agg(['skew', 'kurt']).to_numpy()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis}")

### observation
- Skewness(-0.075): The data is slightly negatively skewed (close to symmetric).  
- Kurtosis(0.011): The data has nearly zero excess kurtosis, indicating a shape close to a normal distribution.

In [None]:
# Quartiles and 1.5*IQR fences for hospital_log1p_avg_km.
# Missing values are removed first; np.percentile would otherwise return NaN.
cleaned_data_hospital_log1p_avg_km = df['hospital_log1p_avg_km'].dropna()

# Q1 and Q3 from a single percentile call
Q1, Q3 = np.percentile(cleaned_data_hospital_log1p_avg_km, [25, 75])

# Inter-quartile range, then the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR
IQR = Q3 - Q1
lower_whisker, upper_whisker = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Report the results
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)


# tourist

In [None]:
df['tourist'].head()

### observation 
- Extract the list of distances for each property and then apply log1p (the logarithm of 1 + value) to the distances and then calculate the average

In [None]:
# Function to extract the numeric distances that precede 'km'
def extract_km_values(row):
    """Return every distance (in km, as float) mentioned in a text cell.

    Parameters
    ----------
    row : str or missing value
        Raw cell text, e.g. ``"Museum 1.5 km, Fort 3 km"``.

    Returns
    -------
    list of float, or None
        ``None`` when the cell is missing (``pd.isnull``); otherwise the
        distances in order of appearance (an empty list when none are found).
    """
    if pd.isnull(row):
        return None
    # Accept whole numbers ("3 km") and bare fractions (".5 km") as well as
    # decimals ("1.5 km"). The previous pattern required a decimal point and
    # silently dropped integer distances, biasing the per-row average.
    return [float(val) for val in re.findall(r'(\d+(?:\.\d+)?|\.\d+)\s*km', row)]

# Function to average the log1p-transformed distances of one row
def log1p_and_average(distances):
    """Mean of ``log1p(d)`` over the distances of a single row.

    Returns ``None`` when ``distances`` is ``None`` or empty.
    """
    if distances is not None and len(distances) > 0:
        return np.mean([np.log1p(km) for km in distances])
    return None

# Build two derived columns from the raw 'tourist' text:
#   tourist_extracted_km - per-row output of extract_km_values (list of km floats, or None)
#   tourist_log1p_avg_km - per-row output of log1p_and_average applied to that list
df['tourist_extracted_km'] = df['tourist'].apply(extract_km_values)
df['tourist_log1p_avg_km'] = df['tourist_extracted_km'].apply(log1p_and_average)

In [None]:
df['tourist_extracted_km'].head(20)

In [None]:
df['tourist_log1p_avg_km'].head(20) 

In [None]:
df['tourist_log1p_avg_km'].isna().sum()

In [None]:
df['tourist_log1p_avg_km'].describe()

In [None]:
df[df['tourist_log1p_avg_km'] <= 0.5].shape

In [None]:
df[df['tourist_log1p_avg_km'] < 1].shape

### observation 
- There are 10396 missing values.
- A value close to 0 indicates that the tourist place is near the flat, while a higher value means it is farther away from the flat.
- Standard deviation is 0.14, showing moderate variability

In [None]:
sns.histplot(df['tourist_log1p_avg_km'],kde=True,bins=50)

### observation 
- histogram confirms a left-skewed distribution, with most values concentrated at higher log1p distances. 
- The highest frequency is observed around 0.6–0.7 tourist_log1p_avg_km, indicating that most distances fall within this range

In [None]:
sns.boxplot(df['tourist_log1p_avg_km'],color='lightgreen')
plt.grid()

### observation 
- IQR ranges from 0.64 (25th percentile) to 0.73 (75th percentile).
- The median (0.72) is closer to the upper quartile, indicating left skewness.
- Outliers are present below 0.50 tourist_log1p_avg_km and above 0.86.

In [None]:
# Skewness and kurtosis of tourist_log1p_avg_km, computed in one aggregation.
# NOTE(review): assigning to `kurtosis` rebinds the name imported from scipy.stats in the first cell.
skewness, kurtosis = df['tourist_log1p_avg_km'].agg(['skew', 'kurt']).to_numpy()

print(f"skewness : {skewness}")
print(f"kurtosis : {kurtosis}")

### observation 

- Skewness (-0.577): The data is moderately negatively skewed.  
- Kurtosis(1.121) : The data has positive kurtosis, indicating a slightly heavier-tailed distribution compared to a normal distribution.

In [None]:
# Quartiles and 1.5*IQR fences for tourist_log1p_avg_km.
# Missing values are removed first; np.percentile would otherwise return NaN.
cleaned_data_tourist_log1p_avg_km = df['tourist_log1p_avg_km'].dropna()

# Q1 and Q3 from a single percentile call
Q1, Q3 = np.percentile(cleaned_data_tourist_log1p_avg_km, [25, 75])

# Inter-quartile range, then the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR
IQR = Q3 - Q1
lower_whisker, upper_whisker = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Report the results
print(f"Q1 (25th percentile): {Q1}")
print(f"Q3 (75th percentile): {Q3}")
print(f"IQR: {IQR}")

print("Lower Whisker:", lower_whisker)
print("Upper Whisker:", upper_whisker)

In [None]:
# Drop the six raw amenity-distance text columns together with their intermediate
# *_extracted_km list columns.
df = df.drop(
    columns=[
        'education', 'education_extracted_km',
        'transport', 'transport_extracted_km',
        'shopping_centre', 'shopping_centre_extracted_km',
        'commercial_hub', 'commercial_hub_extracted_km',
        'hospital', 'hospital_extracted_km',
        'tourist', 'tourist_extracted_km',
    ]
)

In [None]:
#df.to_csv('mg_eda_univariate_analysis_dataset.csv',index=False)

In [None]:
# Write the analysed frame to an Excel workbook; index=False leaves the row index out of the sheet.
df.to_excel('univariate_analysis_dataset.xlsx',index=False)