## Imports

In [1]:
import pandas as pd
import numpy as np

## Functions

In [2]:
def get_data (path):
    df = pd.read_csv(path)
    df.head()
    df.rename(columns = {'price':'buying_price'}, inplace = True)
    return df

def col_price_median(data):
    df = data[['buying_price', 'zipcode']].groupby('zipcode').median().reset_index()
    df.columns = df.columns.str.replace('buying_price', 'price_median')
    df2 = pd.merge(data,df,on='zipcode',how='inner')
    return df2

def col_status(df2):
    for i in range( len(df2) ):
        if (df2.loc[i, 'buying_price'] < df2.loc[i, 'price_median']) & (df2.loc[i, 'condition'] >= 2):
            df2.loc[i, 'status'] = 'buy'
        else:
            df2.loc[i, 'status'] = 'not_buy'
    return df2

def col_season(data):
    data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')
    
    data['season'] = data['date'].apply(lambda x: 
                                        'winter' 
                                        if ts('2014-05-02') <= x <= ts('2014-05-31') else
                                       'summer' 
                                        if ts('2014-06-01') <= x <= ts('2014-11-30') else
                                       'winter')
    return data

def ts(obj):
    return pd.to_datetime(obj)

def col_season_median(data):
    df = data[['season', 'zipcode', 'buying_price']].groupby(['season', 'zipcode']).median()
    df.columns = df.columns.str.replace('buying_price', 'season_median')
    df2 = pd.merge(data,df,on=['zipcode', 'season'],how='inner')
    return df2

def col_selling_price(data):
    data['selling_price'] = float(0)
    for i in range( len(data) ):
        if (data.loc[i, 'buying_price'] < data.loc[i, 'season_median']) & (data.loc[i, 'status'] == 'buy'):
            data.loc[i, 'selling_price'] = float(data.loc[i, 'buying_price']) * 1.30
        if (data.loc[i, 'buying_price'] >= data.loc[i, 'season_median']) & (data.loc[i, 'status'] == 'buy'):
            data.loc[i, 'selling_price'] = float(data.loc[i, 'buying_price']) * 1.10
    return data

def col_profit(data):
    data['profit'] = float(0)
    for i in range(len(data)):
        if data.loc[i, 'selling_price'] != 0:
            data.loc[i, 'profit'] = float(data.loc[i, 'selling_price']) - float(data.loc[i, 'buying_price'])
        else:
            None
    return data

In [42]:
# dropping a line I know is a outlier
data = data.drop(data.index[15870])

In [43]:
# data columns visualization for names
data.columns

Index(['id', 'date', 'buying_price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

# Data Overview
Here I will describe some columns I will be using for the insights and the reports

## Insights Validation
Here i will check the insights veracity and explain its results one by one.

### Hypotesis 01:
Water view properties are, at least, 20% more expensive, on avarege.

**True:** Properties with water view are 212.63% more expensive, on avarege.

In [70]:
# set float configuration
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# group the prices means by waterfront (0 = nowaterview; 1 = has waterview)
price_waterview = data[['buying_price', 'waterfront']].groupby('waterfront').mean().reset_index()
price_waterview

Unnamed: 0,waterfront,buying_price
0,0,531575.29
1,1,1661876.02


In [68]:
# calculate the percentage difference of waterview prices over non-waterview prices
percentage_difference = ((price_waterview.iloc[1, 1] * 100) / price_waterview.iloc[0, 1]) - 100
print(f'Properties with water view are {percentage_difference:.2f}% more expensive, on avarege')

Properties with water view are 212.63% more expensive, on avarege


### Hypotesis 02:
Properties built before 1955 are more than 50% cheaper, on avarege.

**False**: The properties built before 1955 are, actually, 4.46% more expensive, on avarage

In [96]:
# get the prices before 1955
before_year = data.loc[(data['yr_built'] < 1955), ['yr_built', 'buying_price']
        ].groupby('yr_built').mean().reset_index()

# get the mean of the before-1955 prices
mean_before = before_year['buying_price'].mean()

# get the prices after 1955
after_year = data.loc[(data['yr_built'] >= 1955), ['yr_built', 'buying_price']
        ].groupby('yr_built').mean().reset_index()

# get the mean of the before-1955 prices
mean_after = after_year['buying_price'].mean()


In [97]:
# get the percentage difference (result)
percentage = (((mean_after * 100) / mean_before) - 100) * -1

# print the result
print(f' The properties built before 1955 are, actually, {percentage:.2f}% more expensive, on avarage')

 The properties built before 1955 are, actually, 4.46% more expensive, on avarage


### Hypothesis 03:
Properties without a basement have a total area, at least, 40% larger than properties with basement, on avarage.

**False**: Properties without basement are 18.41% larger than the ones with basement, on avarage


In [108]:
# get a copy of the dataset
df = data[['sqft_basement', 'sqft_lot']].copy()

# create 'has_basement' column (boolean)
df['has_basement'] = df['sqft_basement'].apply(lambda x: True if x != 0 else False)
df = df[['sqft_lot', 'has_basement']].groupby('has_basement').mean().reset_index()
df

Unnamed: 0,has_basement,sqft_lot
0,False,16284.75
1,True,13287.15


In [114]:
# calculate the percentage difference
# properties without basement = 100%
# properties with basement = x (81.59%)
x = df.iloc[1 ,1] * 100 / df.iloc[0, 1]

# 100 - 81.59 is the size difference on avarage
print(f'Properties without basement, are {100 - x:.2f}% larger than the ones with basement, on avarage')

Properties without basement, are 18.41% larger than the ones with basement, on avarage


### Hypothesis 04:
The property price growth YoY (Year over Year) is more than 10%.

****