# Lab 3: Data Exploration

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from scipy import stats

# Hex colour constants for plots.
TIFFANY_BLUE = '#7bf2da'
CARTIER_RED = '#801B2B'
VALENTINO_PINK = '#F261EE'
HERMES_GOLD = '#f37021'
# Backward-compatible alias: this constant was originally spelled in mixed
# case, unlike its UPPER_SNAKE_CASE siblings. Prefer HERMES_GOLD in new code.
Hermes_GOLD = HERMES_GOLD

In [None]:
# Read ../data/train.csv (path is relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../data/train.csv')

### 1. Missing Values

In [None]:
# Share of null entries per column, expressed as a percentage;
# only columns that actually contain nulls are kept, largest share first.
missing_data = data.isnull().mean().mul(100)
missing_data = missing_data.loc[missing_data.gt(0)].sort_values(ascending=False)

# Bar chart of the per-column missing percentage (wide, short canvas).
fig, ax = plt.subplots(figsize=(12, 2))
sns.barplot(x=missing_data.index, y=missing_data, color=TIFFANY_BLUE, ax=ax)
ax.set_title('Percentage of Missing Data by Feature', fontsize=15)
ax.set_xlabel('Features', fontsize=10)
ax.set_ylabel('% Missing', fontsize=10)
# Tilt the feature names so neighbouring labels do not overlap.
plt.xticks(rotation=40, ha='right')
plt.show()

In [None]:
# The three features we use ('GrLivArea', 'GarageArea', '1stFlrSF') have no missing values

### 2. Outliers
- 三個標準差
- 68–95–99.7 rule
- 注意: 下方程式實際使用的門檻是一倍標準差 (任一特徵 |z| > 1 即視為 outlier), 比三個標準差嚴格許多

![](images/04.png)


In [None]:
# Features screened for outliers.
outlier_cols = ['GrLivArea', 'GarageArea', '1stFlrSF']
# A row is dropped when ANY screened feature lies more than this many
# standard deviations from its column mean. 1 is a very strict cut
# (the usual rule of thumb is 3); it is kept as-is because the lab's
# closing question discusses the 1-sigma result.
Z_THRESHOLD = 1

features = data[outlier_cols]
# Depending on the SciPy version, stats.zscore may hand back a plain ndarray
# (which has no .abs() method) instead of a DataFrame. Wrapping the result
# keeps the index/column labels and makes the following lines work either way.
z_scores = pd.DataFrame(
    stats.zscore(features),
    index=features.index,
    columns=features.columns,
).abs()
outlier_mask = (z_scores > Z_THRESHOLD).any(axis=1)
cleaned_data = data[~outlier_mask]
print(f'raw data qty: {data.shape[0]}, cleaned_data:{cleaned_data.shape[0]}')

### 3. Regression Model

In [None]:
# Split the cleaned data into train / test partitions (80% / 20%);
# random_state is fixed so the split is reproducible.
data_tr, data_ts = train_test_split(cleaned_data, test_size=0.2, random_state=4)
print(f'training data size:{data_tr.shape},  testing data size:{data_ts.shape}')

# Fit a linear regression of SalePrice on the selected features.
# NOTE(review): 'OverallQual' is used here but was not part of the z-score
# outlier screen applied to the other three features.
use_cols = ['GrLivArea', 'GarageArea', '1stFlrSF', 'OverallQual']
model = LinearRegression()
# .values passes plain arrays, so the model is fitted without feature names.
model.fit(data_tr[use_cols].values, data_tr['SalePrice'])

# Predictions and ground truth for both partitions.
pred_tr = model.predict(data_tr[use_cols].values)
pred_ts = model.predict(data_ts[use_cols].values)
true_tr = data_tr['SalePrice']
true_ts = data_ts['SalePrice']

# scikit-learn metrics take (y_true, y_pred). R^2 is NOT symmetric in its
# arguments, so passing the predictions first reports a different (wrong) score.
print(f'training data r2 score: {r2_score(true_tr, pred_tr):.3f}')
print(f'testing data r2 score: {r2_score(true_ts, pred_ts):.3f}')


# mean_absolute_error is MAE, not MSE, so the printed label says so.
print(f'training data mae score: {mean_absolute_error(true_tr, pred_tr):.3f}')
print(f'testing data mae score: {mean_absolute_error(true_ts, pred_ts):.3f}')


In [None]:
# Living area vs. sale price for the training partition.
sns.scatterplot(data=data_tr, x='GrLivArea', y='SalePrice', color=TIFFANY_BLUE)

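### Q?
- 為什麼資料變很乾淨的時候 (只留一倍標準差內的資料), 反而 r2-score 變差了?
   - 問題變簡單了, 所以對模型的要求就變高了
       - 想像如果是小學時數學考95分會是全班第一名
       - 但如果是大學生考小學數學題, 考95分應該會是全班最後一名
   - 換句話說: $R^2 = 1 - SS_{res} / SS_{tot}$。去掉極端值後, SalePrice 本身的變異 ($SS_{tot}$) 也變小了, 所以即使絕對誤差下降, $R^2$ 仍可能變低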