## Import the pandas module; we will want both pandas and numpy

In [1]:
import pandas as pd
import numpy as np

## We can read CSV files, tab-delimited files, Excel files and many other formats

In [2]:
# Load the sample house-price data; the path is relative to the notebook's working directory
house_df = pd.read_csv(filepath_or_buffer='house_price_sample.csv')

In [3]:
# Print a structural summary: column names, non-null counts, dtypes and memory usage
house_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          800 non-null    float64
 1   bedrooms       800 non-null    int64  
 2   bathrooms      800 non-null    float64
 3   sqft_living    800 non-null    int64  
 4   sqft_lot       800 non-null    int64  
 5   floors         800 non-null    float64
 6   waterfront     800 non-null    int64  
 7   view           800 non-null    int64  
 8   condition      800 non-null    int64  
 9   grade          800 non-null    int64  
 10  sqft_above     800 non-null    int64  
 11  sqft_basement  800 non-null    int64  
 12  age.at.sale    800 non-null    int64  
 13  renovated      800 non-null    object 
 14  sqft_living15  800 non-null    int64  
 15  sqft_lot15     800 non-null    int64  
dtypes: float64(3), int64(12), object(1)
memory usage: 100.1+ KB


## Descriptive Statistics

Notice that `renovated` is not included: it holds yes/no values, so its dtype is `object` and `describe()` skips it by default.

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
house_df.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,age.at.sale,sqft_living15,sqft_lot15
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,546650.8,3.4075,2.15875,2120.6675,16502.61875,1.530625,0.0075,0.2925,3.3175,7.74875,1824.78625,295.88125,42.65,2037.83875,14474.1725
std,356698.3,0.959215,0.775355,966.854348,42803.100006,0.557124,0.086331,0.853143,0.580616,1.175734,886.831793,433.282439,29.965186,741.230647,31333.983874
min,104950.0,1.0,0.75,420.0,725.0,1.0,0.0,0.0,2.0,4.0,420.0,0.0,0.0,690.0,964.0
25%,329712.5,3.0,1.75,1427.5,5000.0,1.0,0.0,0.0,3.0,7.0,1200.0,0.0,16.0,1507.5,4999.5
50%,469950.0,3.0,2.25,1900.0,7730.5,1.5,0.0,0.0,3.0,8.0,1565.0,0.0,38.0,1880.0,7659.0
75%,650000.0,4.0,2.5,2606.25,10961.75,2.0,0.0,0.0,4.0,8.0,2292.5,550.0,61.0,2390.0,10053.5
max,3600000.0,10.0,5.25,7420.0,641203.0,3.0,1.0,4.0,5.0,12.0,7420.0,2600.0,115.0,5790.0,325393.0


## Wrangling and cleaning 

examples from McKinney

### See how easy it is to make a dummy variable

In [5]:
# Build an indicator (dummy) variable from the text column `renovated`.
#   prefix='renovated' -> the new column is named renovated_<level>
#   drop_first=True    -> the first level is dropped, leaving one column for a two-level variable
#   dtype=int          -> values are stored as 0/1 integers
# The result is a DataFrame that shares its index with house_df, so it can
# later be placed alongside house_df in place of the original `renovated` column.
dummies = pd.get_dummies(
    house_df['renovated'],
    prefix='renovated',
    drop_first=True,
    dtype=int,
)

# Tip: to learn more about how a function or method works, follow it with '?'
# (for example, pd.get_dummies? shows the manual entry for that function).
dummies

Unnamed: 0,renovated_yes
0,0
1,0
2,0
3,0
4,0
...,...
795,0
796,0
797,0
798,0


Now replace the `renovated` column with the dummy variable column.

In [6]:
# Place the dummy column(s) to the right of the original columns, then
# remove the text `renovated` column that they stand in for.
house_df_dummies = (
    pd.concat([house_df, dummies], axis='columns')
    .drop(columns='renovated')
)

In [7]:
# Display the combined frame: `renovated` is gone and `renovated_yes` is the last column
house_df_dummies

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,age.at.sale,sqft_living15,sqft_lot15,renovated_yes
0,264500.0,3,1.50,1580,14040,1.0,0,0,3,7,1050,530,35,2240,12000,0
1,410000.0,3,1.00,1230,7020,1.0,0,0,3,7,1090,140,91,1390,5850,0
2,625000.0,2,1.50,1490,5750,1.5,0,0,4,7,1190,300,115,1590,4025,0
3,1060000.0,4,5.25,4140,14757,2.0,0,2,3,11,4140,0,10,4440,15523,0
4,307000.0,3,2.00,1790,7259,1.0,0,0,3,7,1390,400,35,1790,7700,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,385000.0,3,2.25,2110,8000,2.0,0,0,3,8,2110,0,40,1740,7270,0
796,435000.0,4,2.50,1700,6380,1.0,0,0,4,7,850,850,75,1380,6380,0
797,205000.0,3,2.25,1250,952,3.0,0,0,3,8,1250,0,7,1250,1030,0
798,528000.0,4,2.75,2050,7171,1.0,0,0,3,8,1540,510,27,1960,7110,0


Write the result to a CSV file.

In [8]:
# index=False keeps the row index out of the saved file, so only the data columns are written
house_df_dummies.to_csv(path_or_buf='house_df_dummies.csv', index=False)