In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Execute utils.py and load its top-level names into this notebook's namespace.
# NOTE(review): x_y_scatterplots is not defined in the visible cells, so it is
# presumably provided by this file — confirm.
%run utils.py

In [None]:
# Load the datasets
# Both paths are relative to the directory the notebook is run from, and both
# files are parsed with pandas' default CSV settings.
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

In [None]:
# Name of the target column: passed to x_y_scatterplots and kept out of the
# quantitative/categorical feature lists built later in this notebook.
target = 'SalePrice'

In [None]:
# Display basic information about the training frame
# (per-column non-null counts and dtypes, plus memory usage).
train_df.info()

In [None]:
# Report (rows, columns) for each split, one line per split.
for split_label, split_frame in (("Train", train_df), ("Test", test_df)):
    print(f'{split_label} set shape: {split_frame.shape}')

In [None]:
# Preview the first rows of the training set.
train_df.head()

## NaN inspection

#### Columns with NaN values

In [None]:
# Boolean mask over columns: True where the column holds at least one NaN.
has_nan_by_column = train_df.isna().any(axis=0)

# Keep only the columns flagged above (all rows retained).
nan_df = train_df.loc[:, has_nan_by_column]

nan_df.head()

In [None]:
# Placeholder used for columns whose NaN meaning has no entry recorded here.
NO_EXP = "No explaination provided."

# Maps each column that contains NaN to what a missing value stands for.
# Entries prefixed with "(?)" are unconfirmed interpretations.
nan_explainations = {
    "LotFrontage": NO_EXP,
    "Alley": "No alley access",
    "MasVnrType": "(?) None --> No veneer",
    "MasVnrArea": NO_EXP,
    # Basement-related columns: NaN means the house has no basement.
    "BsmtQual": "No Basement",
    "BsmtCond": "No Basement",
    "BsmtExposure": "No Basement",
    "BsmtFinType1": "No Basement",
    "BsmtFinType2": "No Basement",
    "Electrical": NO_EXP,
    "FireplaceQu": "No Fireplace",
    # Garage-related columns: NaN means the house has no garage.
    # (GarageFinish/GarageQual/GarageCond were previously mislabelled "No Basement".)
    "GarageType": "No Garage",
    "GarageYrBlt": "No Garage --> No year when garage was built",
    "GarageFinish": "No Garage",
    "GarageQual": "No Garage",
    "GarageCond": "No Garage",
    "PoolQC": "No Pool",
    "Fence": "No Fence",
    "MiscFeature": "(?) None --> No other miscellaneous features",
}

In [None]:
# One row per NaN-containing column, holding how many values are missing.
nan_explaination_df = nan_df.isna().sum().to_frame(name="NaN Count")

# Attach the explanations by column-name label. Wrapping the dict in a Series
# makes the label alignment explicit instead of depending on how pandas coerces
# a raw dict on column assignment; columns with no dict entry become NaN.
nan_explaination_df["NaN Explaination"] = pd.Series(nan_explainations)

nan_explaination_df

In [None]:
# Rows of the training set where the Electrical value is missing.
electrical_missing = train_df["Electrical"].isna()
train_df.loc[electrical_missing]

In [None]:
# Show the rows lacking MasVnrArea with every column visible. The column-count
# limit is lifted only inside the `with` block and is restored afterwards.
masvnr_area_missing = train_df["MasVnrArea"].isna()

with pd.option_context('display.max_columns', None):
    display(train_df.loc[masvnr_area_missing])

## Scatter plots of X against Y

In [None]:
# Called without `cols`, so the helper's default column selection applies.
# NOTE(review): x_y_scatterplots is not defined in the visible cells — presumably
# it comes from utils.py and draws one plot per feature against `target`; confirm.
x_y_scatterplots(train_df, target)

## Nominal, Ordinal, Interval & Scale features identification

- **Ordinal-scale predictors cannot be included in a linear model directly**  
  - Computing a mean or adding values is not a valid operation on ordinal variables

### Recoding scheme

1. **Approach 1** - recode into binary  
2. **Approach 2** - treat as interval / discrete ratio  
    1. **Recode using mid-points**  
        1. *e.g.* Duration: 1-3 weeks - 2, 4-6 weeks - 5, 7-9 weeks - 8  
        1. Problem: open-ended intervals (*e.g.* "10+ weeks") have no mid-point
            1. At most 20% of the data points are allowed to fall in the open interval  
          
   
   2. **Code using integers making the assumption of equal intervals**
        1. *e.g.* Likert scale questionnaire
        2. *e.g.* Education: primary - 0, secondary - 1, higher - 2  


In [None]:
# Dtype-driven split of the columns into quantitative and categorical features.
# Numeric columns that are really codes/labels are treated as categorical.
non_quant_numeric_cols = ['MSSubClass', 'MoSold']
# Columns left out of both feature groups.
exception_cols = ['Id', 'MiscVal', target]

# Quantitative = numeric dtype, minus the numeric-coded categoricals and the exceptions.
numeric_col_names = pd.Series(train_df.select_dtypes(include=['int64', 'float64']).columns)
not_quantitative = non_quant_numeric_cols + exception_cols
quantitative_vars = numeric_col_names[~numeric_col_names.isin(not_quantitative)]

# Categorical = every remaining column that is neither quantitative nor an exception.
all_col_names = train_df.columns
is_quant_or_exception = all_col_names.isin(quantitative_vars) | all_col_names.isin(exception_cols)
categorical_vars = pd.Series(all_col_names[~is_quant_or_exception])

# Summary counts; the last line is the number of columns assigned to either group.
n_quantitative = len(quantitative_vars)
n_categorical = len(categorical_vars)
print(f"Number of columns: {len(train_df.columns)}")
print(f"Number of Quantitative variables: {n_quantitative}")
print(f"Number of Categorical variables: {n_categorical}")
print(n_quantitative + n_categorical)

### Plot Quantitative features 

In [None]:
# Restrict the plots to the column names held in quantitative_vars.
# NOTE(review): x_y_scatterplots is defined outside the visible cells; how it
# consumes `cols` cannot be verified from this notebook.
x_y_scatterplots(train_df, target, cols=quantitative_vars)

### Plot Categorical features 

In [None]:
# Restrict the plots to the column names held in categorical_vars.
# NOTE(review): x_y_scatterplots is defined outside the visible cells; how it
# consumes `cols` cannot be verified from this notebook.
x_y_scatterplots(train_df, target, cols=categorical_vars)

## Histogram

## Outliers

## Correlation between X and Y

## 