In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Plot configuration: apply seaborn's 'whitegrid' style to the figures drawn below.
sns.set_style('whitegrid')

In [None]:
# Load the raw housing table from the CSV file next to the notebook.
# Every later cell reads from (and several write to) this `df` object.
df = pd.read_csv(
    'AmesHousing.csv',
)

In [None]:
# Preview 10 randomly chosen rows. A fixed random_state keeps the preview
# identical across Restart Kernel -> Run All executions; without it every
# run shows a different sample.
df.sample(10, random_state=42)

In [None]:
# Full structural summary of the raw frame (one line per column).
df.info()

In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple.
df.shape

In [None]:
# Descriptive statistics for the raw frame, as computed by DataFrame.describe().
df.describe()

`df.info(verbose=False)` prints a concise summary of a DataFrame, omitting the detailed information for each column. This is especially useful for large DataFrames with many columns, where the full output of `df.info()` can be overwhelming and hard to read.

In [None]:
# Concise summary of the raw frame: verbose=False leaves out the per-column listing.
df.info(verbose=False)

- **Min (Minimum):** The smallest value in a dataset. It can help identify the lower bound of your data and can sometimes reveal data entry errors (e.g., a negative value for age).
- **Mean (Average):** The sum of all values divided by the number of values in the dataset. It is a measure of the central tendency of the data but is sensitive to outliers, which can skew the result.
- **Max (Maximum):** The largest value in a dataset. It establishes the upper boundary of your data and, like the minimum, can help identify outliers.

In [None]:
# Descriptive statistics for the target column only.
df['SalePrice'].describe() # Check the mean, min, and max of the house prices.

In [None]:
# Histogram of SalePrice (50 bins) with a kernel-density curve overlaid,
# drawn on an explicitly created 10x6 inch figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['SalePrice'], kde=True, bins=50, ax=ax)
ax.set_title('Distribution of House Sale Prices')
plt.show()

### 2. Cleaning the Data

In [None]:
# Task 3: Quantify Missingness
# Calculate and sort missing percentages.
#
# Step 1: absolute number of missing entries in each column
# (a Series indexed by column name, in the frame's own column order).
total_missing = df.isna().sum()

In [None]:
# Display the number of missing entries per column.
df.isnull().sum()

In [None]:
# Display the number of entries per column in the boolean isnull() frame;
# this is the denominator used for the missing-value ratio computed next.
df.isnull().count()

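`df.isnull().sum() / df.isnull().count()`: `df.isnull()` is a boolean frame that itself contains no missing entries, so `.count()` returns the total number of rows for every column. Dividing the count of missing values by that total gives the proportion of missing data for each column. The result is a pandas Series where the index is the column name and the value is the fraction of missing data (a decimal between 0 and 1; multiply by 100 to get a percentage).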

In [None]:
# Share of missing entries per column, expressed as a fraction between 0 and 1
# (the mean of the boolean mask equals missing count / row count),
# ordered from the most-missing column to the least.
percent_missing = df.isna().mean().sort_values(ascending=False)

In [None]:
# Put the absolute counts ('Total') and the missing fractions ('Percent',
# values between 0 and 1) side by side, one row per column of df.
#
# total_missing is in df's original column order while percent_missing is
# sorted, and pd.concat(axis=1) aligns the two on their index, so the combined
# frame does not inherit percent_missing's ordering. Sort explicitly so that
# missing_data.head(n) really shows the n columns with the most missing data.
missing_data = (
    pd.concat([total_missing, percent_missing], axis=1, keys=['Total', 'Percent'])
    .sort_values('Percent', ascending=False)
)

In [None]:
# Show the first 10 rows of the missing-data summary table.
missing_data.head(10)

#### Missing Data Imputation Strategy

##### 1. Drop columns with too much missing data 
First, remove the columns that contain a high percentage of missing values (more than 50% here), to avoid introducing too much bias during imputation.

In [None]:
# Names of the columns in which more than half of the entries are missing
# (percent_missing holds fractions, so 0.50 means 50%).
columns_to_drop = percent_missing.loc[percent_missing.gt(0.50)].index

In [None]:
# New frame without the high-missingness columns; `df` itself is left untouched.
# NOTE(review): the imputation cells visible further down keep operating on
# `df`, not on `df_cleaned` -- confirm which frame is meant to carry forward.
df_cleaned = df.drop(columns_to_drop, axis='columns')

In [None]:
# Display the frame that remains after removing the high-missingness columns.
df_cleaned

##### 2. Fill categorical columns with 'None' 
For categorical features where the missing value represents the absence of a feature (e.g., no garage), you can fill NaN values with the string 'None'. 

In [None]:
# Categorical columns whose missing entries will be replaced by the string 'None'.
# NOTE(review): the fill cell skips any name that is not in df.columns, so
# confirm these spellings match the CSV header exactly (e.g. spacing).
cols_to_fill_none = [
    # Garage* columns
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    # Bsmt* columns
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    # MasVnr* column
    'MasVnrType',
]



In [None]:
# Split the requested columns into those that exist in df and those that do
# not, so that a name mismatch is reported instead of being skipped silently.
none_fill_present = [col for col in cols_to_fill_none if col in df.columns]
none_fill_absent = [col for col in cols_to_fill_none if col not in df.columns]

for col in none_fill_present:
    # Assign the filled Series back to the frame. Calling
    # df[col].fillna(..., inplace=True) is chained in-place assignment on an
    # intermediate object: recent pandas warns about it, and with Copy-on-Write
    # enabled it leaves df unchanged.
    df[col] = df[col].fillna('None')

if none_fill_present:
    print("Filled categorical columns with 'None'.")
if none_fill_absent:
    print(f"Columns not found in df (nothing filled): {none_fill_absent}")


##### 3. Fill numerical columns with median or zero 
For numerical columns, the choice of imputation depends on the data's distribution. The median is robust against outliers, making it a good choice for skewed data like LotFrontage. Filling with 0 is appropriate when the absence of a value truly means a zero value (e.g., in some cases for GarageYrBlt or MasVnrArea). 

In [None]:
# Fill missing LotFrontage values with the column median.
if 'LotFrontage' in df.columns:
    # Assign back instead of fillna(inplace=True) on df['LotFrontage']: the
    # chained in-place form warns in recent pandas and does not modify df
    # when Copy-on-Write is enabled.
    df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())
else:
    print("Column 'LotFrontage' not found in df; nothing filled.")

In [None]:
# Fill GarageYrBlt with 0, as houses without a garage might have a missing year.
# NOTE(review): 0 is far outside the range of real years, so it may distort the
# numeric correlation computed later in this notebook -- confirm this is intended.
if 'GarageYrBlt' in df.columns:
    # Assign back instead of the chained fillna(inplace=True), which warns in
    # recent pandas and does not modify df when Copy-on-Write is enabled.
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)
else:
    print("Column 'GarageYrBlt' not found in df; nothing filled.")

In [None]:
# Example for other numerical columns that can be filled with 0.
if 'MasVnrArea' in df.columns:
    # Assign back instead of the chained fillna(inplace=True), which warns in
    # recent pandas and does not modify df when Copy-on-Write is enabled.
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
else:
    print("Column 'MasVnrArea' not found in df; nothing filled.")

print("Filled numerical columns with median or 0.")

##### 4. Fill categorical columns with mode
For categorical columns with only a few missing values that aren't meaningfully absent, use the mode (most frequent value). 

In [None]:
# Fill the 'Electrical' column with its mode (most frequent value).
if 'Electrical' in df.columns:
    # mode() returns a Series (possibly with several tied values, possibly
    # empty when the column holds no non-missing value); use its first entry.
    electrical_modes = df['Electrical'].mode()
    if not electrical_modes.empty:
        # Assign back instead of the chained fillna(inplace=True), which warns
        # in recent pandas and does not modify df when Copy-on-Write is enabled.
        df['Electrical'] = df['Electrical'].fillna(electrical_modes.iloc[0])
        print("Filled categorical column 'Electrical' with mode.")
    else:
        print("Column 'Electrical' has no non-missing values; nothing filled.")
else:
    print("Column 'Electrical' not found in df; nothing filled.")


##### 5. Drop High-Null Columns

We are dropping the columns that were nearly all null, as these are not useful for analysis.

In [None]:
# Identify columns to drop (based on a typical EDA where missing percentage is > 50-80%).
# NOTE(review): `columns_to_drop`, computed earlier from percent_missing > 0.50,
# is not used here -- confirm that this hard-coded list is the intended set.
cols_to_drop = ['Alley', 'Fence']

# Drop only the columns that are still present. This keeps the cell re-runnable
# (a second run would otherwise raise KeyError because the columns are already
# gone) and matches the `if col in df.columns` guards of the other cleaning cells.
dropped_cols = [col for col in cols_to_drop if col in df.columns]
df = df.drop(columns=dropped_cols)

# Note: We are keeping the Garage and Basement columns for now, as missingness here means 'None' (absence of the feature).
print(f"Dropped columns: {dropped_cols}")

In [None]:
# Concise summary of df after the fill and drop cells above.
df.info(verbose=False)

### Phase 3: Bivariate Analysis (Feature vs. SalePrice)

##### Task 5: Numerical Correlation Analysis 🔢

Since numerical data is easy to correlate, let's see the linear relationship between all numerical features and SalePrice.

In [None]:
# Pairwise correlation matrix over the numeric columns of df.
corrmat = df.corr(numeric_only=True)

# Take the SalePrice column of that matrix and order it from the highest
# coefficient to the lowest. The column includes SalePrice's correlation
# with itself.
corr_with_saleprice = corrmat.loc[:, 'SalePrice'].sort_values(ascending=False)

# First 10 entries: the largest coefficients.
print(corr_with_saleprice.iloc[:10])
# Last 10 entries: the smallest coefficients (weakest or negative relationships).
print(corr_with_saleprice.iloc[-10:])