In [None]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
sb.set()

#utility
from helper import utility

In [None]:
#load data
#property tables are read through the project's loader helper
prop_2016, prop_2017 = (
    utility.load_data(csv_path)
    for csv_path in ('data/properties_2016.csv', 'data/properties_2017.csv')
)
#transaction tables are read with pandas directly so that
#'transactiondate' is parsed into a datetime column
train_2016, train_2017 = (
    pd.read_csv(csv_path, parse_dates=["transactiondate"])
    for csv_path in ('data/train_2016_v2.csv', 'data/train_2017.csv')
)

In [None]:
#quick look at the 2016 properties table:
#print its (rows, columns) shape, then show the first rows as the cell output
print(prop_2016.shape)
prop_2016.head()


In [None]:
#quick look at the 2016 transactions table:
#print its (rows, columns) shape, then show the first rows as the cell output
print(train_2016.shape)
train_2016.head()

In [None]:
#attach the property attributes to every transaction row, matching on parcelid;
#a left join keeps transactions that have no matching row in the properties table
merged_train_2016 = train_2016.merge(prop_2016, how='left', on='parcelid')
merged_train_2017 = train_2017.merge(prop_2017, how='left', on='parcelid')
merged_train_2016.head()

In [None]:
#inspect the data type pandas assigned to each column of the merged 2016 frame
merged_train_2016.dtypes

In [None]:
#see the percentage of missing values per feature in the merged 2016 data
#NOTE(review): print_percent_missing lives in helper/utility and is not shown here;
#presumably it prints one percentage per column - confirm against the helper
utility.print_percent_missing(merged_train_2016)

In [None]:
#TODO: correlation coefficients (analysis not implemented yet)

In [None]:
#look at the distribution of the target column 'logerror':
#summary statistics first, then a histogram with the default binning
print(train_2016['logerror'].describe())
train_2016['logerror'].hist()

In [None]:
#cap logerror outliers at the 1st / 99th percentiles for a better distribution
#
#The clipped column is assigned back to the frame in a single step. The previous
#form, train_2016['logerror'].loc[mask] = value, is chained assignment: it
#writes through an intermediate Series, which pandas warns about and which,
#with copy-on-write enabled, leaves train_2016 unchanged.
#Re-running this cell is harmless: clipping already-clipped values is a no-op.
llimit, ulimit = np.percentile(train_2016['logerror'].values, [1, 99])
train_2016['logerror'] = train_2016['logerror'].clip(lower=llimit, upper=ulimit)

#histogram of the capped values
plt.figure(figsize = (12, 8))
train_2016['logerror'].hist(bins=20)
plt.xlabel('logerror', fontsize = 12)
plt.show()

In [None]:
#number of transactions per calendar month

#derive the month number from the parsed transaction date
train_2016['transaction_month'] = train_2016['transactiondate'].dt.month
count = train_2016['transaction_month'].value_counts()

#one bar per month; labels are set on the Axes that barplot draws into
plt.figure(figsize=(12, 6))
month_axes = sb.barplot(x=count.index, y=count.values)
month_axes.set_xlabel('Month of transaction', fontsize=12)
month_axes.set_ylabel('Count', fontsize=12)
plt.show()

In [None]:
#longitude and latitude distribution
#
#sb.jointplot is a figure-level function: it always builds its own figure.
#A preceding plt.figure(figsize=...) therefore only left an empty extra figure
#and its size was ignored. The size is now passed through `height`, and the
#axis labels are set on the returned JointGrid rather than on whichever axes
#happens to be current in pyplot.
lat_lon_grid = sb.jointplot(
    x=prop_2016['latitude'].values,
    y=prop_2016['longitude'].values,
    height=12,
)
lat_lon_grid.set_axis_labels('Latitude', 'Longitude', fontsize=12)
plt.show()

In [None]:
#logerror with respect to latitude and longitude
#
#Only plotnine is imported here. The former `from ggplot import *` pulled in a
#different package whose star-import shadowed the plotnine names imported just
#above it (ggplot, aes, geom_point, scale_color_gradient).
from plotnine import ggplot, aes, labs, geom_point
from plotnine.scales import scale_color_gradient

#The coordinates are read from the properties table elsewhere in this notebook,
#so join them onto the transactions by parcelid before plotting. Selecting only
#the needed columns keeps the join small and avoids column-name clashes.
#Rows without coordinates cannot be drawn and are dropped.
geo_logerror_2016 = (
    train_2016[['parcelid', 'logerror']]
    .merge(prop_2016[['parcelid', 'latitude', 'longitude']], on='parcelid', how='left')
    .dropna(subset=['latitude', 'longitude'])
)

#the parenthesised expression is the cell's last value, so the plot is rendered
(
    ggplot(geo_logerror_2016, aes(x='latitude', y='longitude', color='logerror'))
    + geom_point()
    + scale_color_gradient(low='red', high='blue')
    + labs(x='Latitude', y='Longitude')
)