In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

#### Importing data and basic exploration

In [None]:
# Load the house-sales CSV from the local data directory into a DataFrame.
house_data = pd.read_csv("data/kc_house_data.csv")

In [None]:
# Determine the size of the data
print(house_data.shape)

# Determine the columns in the data
print(house_data.columns)

# View information about the data in each column.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" after the report.
house_data.info()

# Check if the data has any missing values (null).
# Printed explicitly: a bare expression is only displayed when it is the last
# statement of a cell, and this cell ends with the plotting loop below.
print(house_data.isna().sum())

# Check for outliers using one boxplot per int64/float64 column
numeric_columns = house_data.select_dtypes(include=["int64", "float64"]).columns
for column in numeric_columns:
    plt.figure()  # fresh figure so each column gets its own plot
    house_data.boxplot(column=[column])

# Although there are some outliers, we are not handling them in this exercise.

#### Choose features to further work on
In our case, let us pick price as the target, and date, bedrooms, bathrooms, sqft_living, floors, waterfront, view, condition and grade as the features.


In [None]:
# Keep only the target (price) and the candidate feature columns.
feature_columns = ["price", "date", "bedrooms",
                   "bathrooms", "sqft_living", "floors",
                   "waterfront", "view", "condition", "grade"]
# .copy() makes this an independent frame rather than a slice of house_data,
# so columns can be added to it later without pandas raising
# SettingWithCopyWarning or the write being ambiguous.
house_data_df = house_data[feature_columns].copy()

### Feature Engineering
1. Let's split date into year and month, since the price could depend on the year and month of sale due to market conditions.
2. Let's treat bedrooms, bathrooms, floors, waterfront, view, condition, grade, year and month as categorical features.


In [None]:
# Extract year and month from the date string: characters 0-3 become "year"
# and characters 4-5 become "month".
# NOTE(review): assumes the date column is text laid out as YYYYMMDD... — confirm
# against the CSV.
sale_date = house_data["date"]

# assign() returns a new frame instead of writing columns into house_data_df
# in place (which warns when that frame is a slice of house_data).
# errors="ignore" on the drop lets this cell be re-run after "date" has
# already been removed, instead of failing with a KeyError.
house_data_df = (
    house_data_df
    .assign(year=sale_date.str[0:4], month=sale_date.str[4:6])
    .drop(columns=["date"], errors="ignore")
)

# Treat these features as categorical: get_dummies replaces each listed column
# with one indicator column per distinct value.
cat_features = ["bedrooms", "bathrooms", "floors", "waterfront",
                "view", "condition", "grade", "year", "month"]
house_data_encoded = pd.get_dummies(house_data_df, columns=cat_features)
print(house_data_encoded.columns)

#### Splitting the data into train and test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split, and every score computed from it below, is the same on each run.
house_train, house_test = train_test_split(
    house_data_encoded, test_size=0.2, random_state=42
)

In [None]:
# Show (rows, columns) of the train and test partitions side by side.
(house_train.shape, house_test.shape)

#### Building a model on the train data and checking how well the model fits

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Every encoded column except the target is used as a model input.
features = house_data_encoded.drop(columns=["price"]).columns

In [None]:
# Name of the column to predict. Kept as a one-element list, so indexing a
# DataFrame with it yields a single-column DataFrame rather than a Series.
target = ["price"]

In [None]:
# Fit an ordinary least-squares model on the training partition.
train_X = house_train[features]
train_y = house_train[target]
model = LinearRegression().fit(train_X, train_y)

# score() on the same rows reports how well the model fits the training data.
model.score(train_X, train_y)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Root-mean-squared error on the training partition: the square root of the
# mean squared error between actual and predicted prices.
train_predictions = model.predict(house_train[features])
train_mse = mean_squared_error(house_train[target], train_predictions)
train_mse ** 0.5

#### Evaluate the model's performance on test data

In [None]:
# Predict prices for the held-out rows using the model fitted on the train set.
test_X = house_test[features]
test_predictions = model.predict(test_X)

In [None]:
# Root-mean-squared error on the held-out partition, for comparison with the
# training RMSE.
test_mse = mean_squared_error(house_test[target], test_predictions)
test_mse ** 0.5