# Tutorial home sales

In [1]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import minmax_scale, scale
# Grid Search

## 1. 데이터 살펴보기
pandas의 read_csv 함수를 사용해 데이터를 읽어오고, 각 변수들이 나타내는 의미를 살펴보겠습니다.
1. ID : 집을 구분하는 번호
2. date : 집을 구매한 날짜
3. price : 타겟 변수인 집의 가격
4. bedrooms : 침실의 수
5. bathrooms : 화장실의 수
6. sqft_living : 주거 공간의 평방 피트
7. sqft_lot : 부지의 평방 피트
8. floors : 집의 층 수
9. waterfront : 집의 전방에 강이 흐르는지 유무 (a.k.a. 리버뷰)
10. view : Has been viewed
11. condition : 집의 전반적인 상태
12. grade : King County grading 시스템 기준으로 매긴 집의 등급
13. sqft_above : 지하실을 제외한 평방 피트
14. sqft_basement : 지하실의 평방 피트
15. yr_built : 집을 지은 년도
16. yr_renovated : 집을 재건축한 년도
17. zipcode : 우편번호
18. lat : 위도
19. long : 경도
20. sqft_living15 : 2015년 기준 주거 공간의 평방 피트(집을 재건축했다면, 변화가 있을 수 있음)
21. sqft_lot15 : 2015년 기준 부지의 평방 피트(집을 재건축했다면, 변화가 있을 수 있음)

In [2]:
# Read the train and test tables from the local data directory.
addr = "./data/"
df_train, df_test = (
    pd.read_csv(os.path.join(addr, file_name))
    for file_name in ('train.csv', 'test.csv')
)

# Column dtypes and non-null counts, followed by a preview of the first rows.
df_train.info()

df_train.head()

df_test.info()

df_test.head()

# EDA

#### 1. Check missing data

def check_missing(df):
    """Print the number of null entries found in every column of ``df``.

    Output is a header line followed by one ``<column> :<count>`` line per
    column, with the column name padded to 20 characters. Returns ``None``.
    """
    print("Number of Missing data at each column")
    # Count nulls for all columns in one pass, then report them in column order.
    null_counts = df.isnull().sum()
    for name, count in null_counts.items():
        print(f"{name:20} :{count}")

# Report null counts for both tables, separated by a blank line.
check_missing(df_train)
print()
check_missing(df_test)

# Remove the living-area and lot-area columns from both tables in place.
for frame in (df_train, df_test):
    frame.drop(columns=['sqft_living', 'sqft_lot'], inplace=True)

df_train.shape, df_test.shape

In [None]:
# Keep only the first eight characters of each date string and store them as an integer.
# NOTE(review): presumably a YYYYMMDD prefix - confirm against the raw CSV.
for frame in (df_train, df_test):
    frame["date"] = frame["date"].map(lambda stamp: int(stamp[:8]))

In [None]:
# Split the integer date into its leading digits (year) and the two digits that follow (month).
for frame in (df_train, df_test):
    frame["year"] = frame["date"] // 10000
    frame["month"] = (frame["date"] % 10000) // 100

In [None]:
# The integer date has been split into year/month, so remove it from both tables.
for frame in (df_train, df_test):
    frame.drop(columns=["date"], inplace=True)

In [None]:
def lambda_fn(row):
    """Return the row's ``year`` minus ``yr_renovated``, or minus ``yr_built`` when ``yr_renovated`` is falsy (e.g. 0)."""
    if row['yr_renovated']:
        return row['year'] - row['yr_renovated']
    return row['year'] - row['yr_built']


# Row-wise age feature for both tables.
df_train["age"] = df_train.apply(lambda_fn, axis=1)
df_test["age"] = df_test.apply(lambda_fn, axis=1)

def apply_func(func, df, *args):
    """Overwrite each named column of ``df`` with ``func`` applied to that column.

    Parameters
    ----------
    func : callable
        Receives one column (``df[name]``) and returns its replacement.
    df : DataFrame
        Modified in place.
    *args : column labels to transform, processed in the order given.
    """
    for column_name in args:
        transformed = func(df[column_name])
        df[column_name] = transformed

# Rescale numeric features column by column.
# NOTE(review): each table is scaled using its own statistics, so the same raw
# value can map to different scaled values in train and test - confirm this is intended.

# Min-max scaling for the engineered features.
for frame in (df_train, df_test):
    apply_func(minmax_scale, frame, "age", "month")

# Min-max scaling for the area and construction-year columns.
target_attributes = ["sqft_living15", "sqft_lot15", "sqft_above", "sqft_basement", "yr_built"]
for frame in (df_train, df_test):
    apply_func(minmax_scale, frame, *target_attributes)

# Standardisation for the coordinates.
target_attributes = ["lat", "long"]
for frame in (df_train, df_test):
    apply_func(scale, frame, *target_attributes)

# NOTE(review): 'year' is removed from the training table only; df_test keeps it.
df_train.drop(columns=['year'], inplace=True)

In [None]:
# Pairwise correlation between the columns of the training table.
corr = df_train.corr()

In [None]:
# Diverging colormap (hue 220 on one end, hue 10 on the other) shared by the heatmaps.
cmap = sns.diverging_palette(220,10, as_cmap=True)

In [None]:
# Heatmap of the correlation matrix on a large square canvas.
fig, ax = plt.subplots(1, 1, figsize=(20, 20))
# center=0 pins the midpoint of the diverging palette to zero correlation, so the
# two colour ends separate negative from positive values (same setting as the
# later heatmap in this notebook).
sns.heatmap(corr, cmap=cmap, square=True, ax=ax, center=0)

In [None]:
# Columns to remove from the training table before modelling.
drop_list = ['month', 'yr_built', 'sqft_living', 'sqft_lot']
# 'sqft_living' and 'sqft_lot' are already dropped earlier in the notebook, so a
# strict drop raises KeyError here. errors='ignore' skips labels that are no
# longer present and also makes this cell safe to re-run.
df_train.drop(columns=drop_list, errors='ignore', inplace=True)

In [None]:
# Recompute the correlations after the column drop and redraw the zero-centred heatmap.
corr = df_train.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(data=corr, ax=ax, center=0, square=True, cmap=cmap)

In [None]:
# Separate the feature matrix from the `price` target.
df_train2 = df_train.drop(columns=['price'])
y_tr = df_train["price"]

In [None]:
# Show which feature columns remain, then preview the first rows.
print(df_train2.columns)
df_train2.head()

In [None]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
# Cross-validation
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
# GridSearch
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale, minmax_scale


In [None]:
# Degree-2 polynomial expansion -> PCA -> ridge regression.
model = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=2)),
    ('pca', PCA()),
    ('linear', linear_model.Ridge(alpha=0.01)),
])

In [None]:
# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
# Search grid: ridge penalty x number of retained principal components.
params = {
    'linear__alpha': [10, 100, 1000],
    'pca__n_components': [110, 130, 150],
}
grid = GridSearchCV(estimator=model, param_grid=params, cv=kfold)

In [None]:
# List every tunable parameter of the pipeline (names use the `<step>__<param>` form used in `params`).
model.get_params()

In [None]:
# Run the cross-validated grid search on the training features and target.
grid.fit(df_train2.values, y_tr.values)

In [None]:
# Best mean cross-validated score from the search and the parameters that produced it.
print(grid.best_score_, grid.best_params_)

# Per-fold R^2 of the pipeline as configured in `model` (not the tuned estimator), using the same folds.
cross_val_score(estimator=model,X=df_train2.values, y=y_tr.values, scoring='r2',cv=kfold)

In [None]:
# Display the best cross-validated score on its own.
grid.best_score_

In [None]:
# Fit the pipeline held in `model` (with its own alpha / PCA settings) on the full training data.
model.fit(df_train2.values, y_tr.values)

In [None]:
# In-sample check: predict on the same rows the search was fitted on and print the RMSE.
pred = grid.predict(df_train2.values)
print(mean_squared_error(y_tr.values, pred) ** 0.5)

# Predictions next to the true targets, for plotting.
a = pd.DataFrame(dict(pred=pred, target=y_tr.values))

In [None]:
# Scatter of predicted value against true target.
sns.relplot(x='pred',y='target', data=a)

# One scatter panel of `price` against each feature, laid out on a 13 x 2 grid.
fig, ax = plt.subplots(13, 2, figsize=(20, 60))

# NOTE(review): the slice skips the first three columns of df_train - confirm
# these are the ones meant to be excluded at this point in the notebook.
for idx, col in enumerate(df_train.columns[3:]):
    i, j = idx // 2, idx % 2
    # sns.relplot is figure-level: it opens its own figure and does not draw on a
    # supplied `ax`. The axes-level scatterplot draws into the grid cell instead.
    sns.scatterplot(x=col, y='price', data=df_train, ax=ax[i][j])

* lat 47.52

# Pairwise scatter matrix over a handful of columns, coloured by the waterfront flag.
interested = [
    "price",
    "bedrooms",
    "bathrooms",
    "sqft_living15",
    "sqft_lot15",
]
sns.pairplot(df_train, vars=interested, hue="waterfront")

In [None]:
# interested = ["price", "month", "floors", "lat", "long","sqft_above","rooms"]
# sns.pairplot(data=df_train, hue="waterfront", vars=interested)

#### one-hot encoding

# Replace `zipcode` with one indicator column per distinct value (prefix "zip").
# NOTE(review): each table is encoded on its own, so a zipcode present in only one
# of them yields different column sets - verify before predicting on df_test.
df_train, df_test = (
    pd.get_dummies(frame, columns=["zipcode"], prefix="zip")
    for frame in (df_train, df_test)
)

# df_train_1 = df_train.loc[df_train['condition']<=2]
# df_train_2 = df_train[df_train['condition']>2]
# df_test_1 = df_train.loc[df_train['condition']<=2]
# df_test_2 = df_train[df_train['condition']>2]