<a href="https://colab.research.google.com/github/Almonfrey/MAI-Course/blob/main/class6_practical_activity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing libraries

In [None]:
# Standard imports for data analysis
import pandas as pd  # Data processing
import numpy as np  # Numerical computing

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Matplotlib configuration for inline display (Jupyter only)
%matplotlib inline

Load data

In [None]:
# Read the raw US house-sales listings from the CSV file into `df`,
# then print a preview of the first 5 rows to check the load worked.
df = pd.read_csv(filepath_or_buffer="data/us_house_sales.csv")
print(df.head(n=5))

      Price                              Address         City  Zipcode State  \
0  $554,217    5926 Oak Ave, San Diego, CA 65383    San Diego    65383    CA   
1  $164,454       9583 Oak Ave, Fresno, IL 79339       Fresno    79339    IL   
2       NaN   8224 Oak Ave, Sacramento, TX 87393   Sacramento    87393    TX   
3  $189,267        232 Oak Ave, Fresno, TX 38666       Fresno    38666    TX   
4  $465,778  5446 Pine Rd, Los Angeles, CA 23989  Los Angeles    23989    CA   

  Bedrooms Bathrooms Area (Sqft)   Lot Size  Year Built  Days on Market  \
0    1 bds      3 ba         NaN  4757 sqft      1959.0             101   
1    1 bds      1 ba         NaN  3615 sqft      1969.0              46   
2    6 bds      1 ba   3630 sqft  9369 sqft      1990.0              59   
3    2 bds      1 ba    605 sqft  8804 sqft      1958.0             119   
4    3 bds      2 ba   1711 sqft  9260 sqft      2020.0              26   

  Property Type   MLS ID               Listing Agent    Status  \
0 

Data formatting

In [None]:
# Convert the text-formatted listing columns of `df` to numeric dtypes.
# `pd` comes from the notebook's import cell, so it is not re-imported here.

# Display current data types before formatting
print('Data types BEFORE formatting:')
print(df.dtypes)

print(df.head())

# 1. Price - remove $ and commas, convert to float.
#    Values that cannot be parsed (including missing ones) become NaN via errors='coerce'.
df['Price'] = pd.to_numeric(
    df['Price'].astype(str).str.replace(r'[\$,]', '', regex=True),
    errors='coerce'
).astype(float)

print(df.head())

# 2-3. Bedrooms / Bathrooms - keep the first run of digits (e.g. "3 bds" -> 3).
#      The nullable 'Int64' dtype is used instead of plain int: plain int cannot
#      hold missing values and raises as soon as one entry fails to parse.
#      astype(str) mirrors the Price conversion and lets the cell be re-run after
#      the column is already numeric (the .str accessor needs text input).
for count_col in ('Bedrooms', 'Bathrooms'):
    df[count_col] = pd.to_numeric(
        df[count_col].astype(str).str.extract(r'(\d+)')[0],
        errors='coerce'
    ).astype('Int64')

# 4-5. Area (Sqft) / Lot Size - strip everything except digits and the decimal
#      point (units, commas), then convert to float; unparsable entries become NaN.
for size_col in ('Area (Sqft)', 'Lot Size'):
    df[size_col] = pd.to_numeric(
        df[size_col].astype(str).str.replace(r'[^\d.]', '', regex=True),
        errors='coerce'
    ).astype(float)

# Verify formatting results
print('\nData types AFTER formatting:')
print(df[['Price', 'Bedrooms', 'Bathrooms', 'Area (Sqft)', 'Lot Size']].dtypes)

Data types BEFORE formatting:
Price              object
Address            object
City               object
Zipcode             int64
State              object
Bedrooms           object
Bathrooms          object
Area (Sqft)        object
Lot Size           object
Year Built        float64
Days on Market      int64
Property Type      object
MLS ID             object
Listing Agent      object
Status             object
Listing URL        object
dtype: object
      Price                              Address         City  Zipcode State  \
0  $554,217    5926 Oak Ave, San Diego, CA 65383    San Diego    65383    CA   
1  $164,454       9583 Oak Ave, Fresno, IL 79339       Fresno    79339    IL   
2       NaN   8224 Oak Ave, Sacramento, TX 87393   Sacramento    87393    TX   
3  $189,267        232 Oak Ave, Fresno, TX 38666       Fresno    38666    TX   
4  $465,778  5446 Pine Rd, Los Angeles, CA 23989  Los Angeles    23989    CA   

  Bedrooms Bathrooms Area (Sqft)   Lot Size  Year Built  Da

Data cleaning

Missing value treatment

In [None]:
# Missing-value treatment: report the gaps, impute what can be imputed,
# drop the rows that cannot be repaired, then report again.
print('Missing values BEFORE cleaning:')
print(df.isna().sum())

# Imputation step:
#   - 'Year Built' gaps receive the column median.
#   - 'Lot Size' gaps receive the same row's 'Area (Sqft)' value
#     (NOTE(review): this treats lot size as equal to living area - confirm intent).
median_year = df['Year Built'].median()
df.fillna(
    value={'Year Built': median_year, 'Lot Size': df['Area (Sqft)']},
    inplace=True,
)

# Removal step: a row is discarded when any of these columns is still missing.
df.dropna(
    subset=['Price', 'Area (Sqft)', 'Property Type'],
    how='any',
    inplace=True,
)

print('Missing values AFTER cleaning:')
print(df.isna().sum())

Missing values BEFORE cleaning:
Price             13
Address            0
City               0
Zipcode            0
State              0
Bedrooms           0
Bathrooms          0
Area (Sqft)       15
Lot Size          19
Year Built        27
Days on Market     0
Property Type     27
MLS ID             0
Listing Agent      0
Status             0
Listing URL        0
dtype: int64
Missing values AFTER cleaning:
Price             0
Address           0
City              0
Zipcode           0
State             0
Bedrooms          0
Bathrooms         0
Area (Sqft)       0
Lot Size          0
Year Built        0
Days on Market    0
Property Type     0
MLS ID            0
Listing Agent     0
Status            0
Listing URL       0
dtype: int64


Removing outliers

In [None]:
# Remove price outliers with the 1.5 * IQR rule:
# keep only rows whose Price lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Q1 = df['Price'].quantile(q=0.25)  # first quartile of Price
Q3 = df['Price'].quantile(q=0.75)  # third quartile of Price
IQR = Q3 - Q1                      # interquartile range

price_lower_bound = Q1 - IQR * 1.5
price_upper_bound = Q3 + IQR * 1.5

print(f'\nPrice bounds for outlier detection: Lower{price_lower_bound:,.2f}, Upper{price_upper_bound:,.2f}')

# between() is inclusive on both ends by default, so the bounds themselves are kept
df = df.loc[df['Price'].between(price_lower_bound, price_upper_bound)]


Price bounds for outlier detection: Lower-562,417.00, Upper2,192,415.00
