In [1]:
import acquire
import prepare

import pandas as pd
import numpy as np

import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Display floats in a 20-character-wide field with thousands separators and
# two decimal places (this affects how frames are rendered, not stored values).
pd.options.display.float_format = '{:20,.2f}'.format

In [19]:
# Load the working frame through the project-local `acquire` module.
# NOTE(review): presumably the Zillow property data, going by the helper name
# and the columns shown in later outputs -- confirm in acquire.py.
df = acquire.get_zillow()

In [4]:
def summarize(df):
    """Print a quick textual overview of ``df`` to stdout.

    Emits, in order: the first three rows, the shape, the ``DataFrame.info()``
    report, and the ``DataFrame.describe()`` table, separated by dashed rules.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.

    Returns
    -------
    None
        Everything is printed; nothing is returned.
    """
    # Each print below merges several one-line prints; sep='\n' supplies the
    # newline that separated them, so the emitted text is unchanged.
    print('DataFrame head: \n', df.head(3), '--------------', sep='\n')
    print('Shape:   ', df.shape)
    print('---------------', 'Info:    \n', sep='\n')
    # info() writes its report directly to stdout and returns None.
    df.info()
    print('---------------', df.describe(), sep='\n')


In [5]:
# Print head / shape / info / describe for the loaded frame.
summarize(df)

DataFrame head: 

   parcelid  basementsqft            bathrooms             bedrooms  \
0  14297519           NaN                 3.50                 4.00   
1  17052889           NaN                 1.00                 2.00   
2  14186244           NaN                 2.00                 3.00   

     calculatedbathnbr  finishedfloor1squarefeet  \
0                 3.50                       NaN   
1                 1.00                  1,465.00   
2                 2.00                       NaN   

   calculatedfinishedsquarefeet  finishedsquarefeet12  finishedsquarefeet13  \
0                      3,100.00              3,100.00                   NaN   
1                      1,465.00              1,465.00                   NaN   
2                      1,243.00              1,243.00                   NaN   

   finishedsquarefeet15  ...   censustractandblock             logerror  \
0                   NaN  ... 60,590,630,072,012.00                 0.03   
1                   N

In [6]:
def nulls_by_row(df):
    """Tabulate how many rows share each per-row count of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per distinct number of missing columns, with columns
        ``num_cols_missing`` (nulls in a row), ``percent_cols_missing``
        (that count as a percentage of ``df.shape[1]``) and ``count``
        (how many rows have exactly that many nulls).
    """
    num_missing = df.isnull().sum(axis=1)
    prcnt_miss = num_missing / df.shape[1] * 100

    rows_missing = pd.DataFrame({'num_cols_missing': num_missing,
                                 'percent_cols_missing': prcnt_miss})

    # Use size() rather than reset_index().count().rename({'index': 'count'}):
    # the latter only yields a 'count' column when the index is unnamed. With
    # a named index (or a MultiIndex) the tally column kept the index name(s).
    return (
        rows_missing
        .groupby(['num_cols_missing', 'percent_cols_missing'])
        .size()
        .reset_index(name='count')
    )

In [7]:
def nulls_by_col(df):
    """Report, for every column of ``df``, how many rows are missing a value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with two columns:
        ``num_rows_missing`` (null count) and ``percent_rows_missing``
        (null count as a percentage of the number of rows).
    """
    missing_per_column = df.isna().sum(axis=0)
    total_rows = len(df.index)
    return pd.DataFrame({
        'num_rows_missing': missing_per_column,
        'percent_rows_missing': missing_per_column.div(total_rows).mul(100),
    })

In [45]:
# Per-column missing-value counts and percentages for the working frame.
nulls_by_col(df)

Unnamed: 0,num_rows_missing,percent_rows_missing
parcelid,0,0.0
bathrooms,0,0.0
bedrooms,0,0.0
calculatedbathnbr,123,0.24
calculatedfinishedsquarefeet,69,0.13
finishedsquarefeet12,234,0.45
fips,0,0.0
fullbathcnt,123,0.24
latitude,0,0.0
longitude,0,0.0


In [42]:
# Distribution of rows by how many columns each one is missing.
nulls_by_row(df)

Unnamed: 0,num_cols_missing,percent_cols_missing,count
0,18,31.03,6
1,19,32.76,48
2,20,34.48,248
3,21,36.21,426
4,22,37.93,456
5,23,39.66,988
6,24,41.38,4244
7,25,43.1,6897
8,26,44.83,7456
9,27,46.55,16484


In [32]:
def single_family(df):
    """Return only the single-family, single-unit rows of ``df``.

    Keeps rows whose ``propertylandusedesc`` equals
    ``'Single Family Residential'`` and whose ``unitcnt`` is at most 1.
    The input frame is not modified; callers must use the return value,
    e.g. ``df = single_family(df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``propertylandusedesc`` and ``unitcnt`` columns.

    Returns
    -------
    pandas.DataFrame
        The filtered rows (original index labels preserved).
    """
    is_single_family = df.propertylandusedesc == 'Single Family Residential'
    # NaN <= 1 evaluates to False, so rows with a missing unitcnt are dropped.
    # NOTE(review): confirm that is intended; missing unit counts may be
    # common in this data.
    is_single_unit = df.unitcnt <= 1
    # Previously the filtered frame was bound to a local name and never
    # returned, so the function always returned None and the filter was lost.
    return df[is_single_family & is_single_unit]

In [33]:
# Frequency of each land-use description; the recorded output shows a single
# value, 'Single Family Residential', covering all 52320 rows.
df.propertylandusedesc.value_counts()

Single Family Residential    52320
Name: propertylandusedesc, dtype: int64

In [34]:
# NOTE(review): the result of this call is not assigned, so `df` itself is
# left unchanged by this cell (the shape recorded afterwards is still
# (52320, 58)). Rebind the name if the filtered frame is wanted downstream.
single_family(df)

In [37]:
# Check the frame's dimensions after the single-family step.
df.shape
        

(52320, 58)

In [1]:
def handle_missing_values(df, prop_required_column=.5, prop_required_row=.75):
    """Drop sparsely populated columns, then sparsely populated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune; it is not modified in place.
    prop_required_column : float, default .5
        Proportion of rows that must be non-null for a column to be kept.
    prop_required_row : float, default .75
        Proportion of the *remaining* columns that must be non-null for a
        row to be kept.

    Returns
    -------
    pandas.DataFrame
        The frame with columns and rows below the thresholds removed.
    """
    # Column pass: minimum non-null cells per column, from the row count.
    min_filled_per_column = int(round(prop_required_column * df.shape[0], 0))
    dense_columns = df.dropna(axis='columns', thresh=min_filled_per_column)

    # Row pass: threshold is based on the columns that survived the first pass.
    min_filled_per_row = int(round(prop_required_row * dense_columns.shape[1], 0))
    return dense_columns.dropna(axis='index', thresh=min_filled_per_row)

In [2]:
# NOTE(review): the recorded output for this cell is a NameError -- `df` was
# not defined when it ran. The execution counts restart at In [1] just above,
# so the load cell has to be re-run first in the same kernel.
df = handle_missing_values(df)

NameError: name 'df' is not defined

In [1]:
import wrangle_mall
import acquire
import prepare

from sklearn.preprocessing import MinMaxScaler


import pandas as pd
import numpy as np

In [2]:
# Load the customer frame through the project-local `wrangle_mall` module and
# show its dimensions (recorded as (200, 4)).
df = wrangle_mall.get_mallcustomer_data()
df.shape

(200, 4)

In [3]:
# Rebind `df` to the result of the outlier helper; the recorded shape drops
# from 200 to 198 rows.
# NOTE(review): despite the name "detect", the helper appears to remove rows
# -- confirm in wrangle_mall.py.
df = wrangle_mall.detect_outliers(df)
df.shape

(198, 4)

In [4]:
# Apply the dummy-encoding helper; the recorded output shows a 0/1 `Male`
# column alongside age, annual_income and spending_score.
df = wrangle_mall.mall_dummies(df)
df

Unnamed: 0_level_0,age,annual_income,spending_score,Male
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,19,15,39,1
2,21,15,81,1
3,20,16,6,0
4,23,16,77,0
5,31,17,40,0
...,...,...,...,...
194,38,113,91,0
195,47,120,16,0
196,35,120,79,0
197,45,126,28,0


In [5]:
# Split the frame into three parts via the project helper.
# NOTE(review): split proportions and any random seed live in
# wrangle_mall.split_mall -- confirm they are fixed for reproducibility.
train, validate, test = wrangle_mall.split_mall(df)

In [6]:
# Produce scaled versions of the three splits.
# NOTE(review): presumably min-max scaling fit on `train` only (MinMaxScaler is
# imported above and displayed values lie in [0, 1]) -- confirm in
# wrangle_mall.mall_scaler.
s_train, s_validate, s_test = wrangle_mall.mall_scaler(train, validate, test)

In [8]:
# Display the scaled validation split to inspect the result.
s_validate

Unnamed: 0_level_0,age,annual_income,spending_score,Male
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11,0.942308,0.036036,0.132653,1.0
84,0.538462,0.351351,0.438776,0.0
127,0.480769,0.504505,0.346939,1.0
109,0.961538,0.432432,0.428571,1.0
176,0.230769,0.657658,0.867347,0.0
173,0.346154,0.648649,0.091837,1.0
61,1.0,0.279279,0.561224,1.0
115,0.0,0.45045,0.479592,0.0
131,0.557692,0.504505,0.081633,1.0
188,0.192308,0.774775,0.683673,1.0


(array([[0.48076923, 0.2972973 , 0.5       , 0.        ],
        [0.69230769, 0.11711712, 0.13265306, 0.        ],
        [0.61538462, 0.25225225, 0.44897959, 0.        ],
        [0.75      , 0.35135135, 0.51020408, 1.        ],
        [0.26923077, 0.64864865, 0.63265306, 1.        ],
        [0.78846154, 0.7027027 , 0.13265306, 1.        ],
        [0.40384615, 0.56756757, 0.8877551 , 1.        ],
        [0.07692308, 0.37837838, 0.55102041, 0.        ],
        [0.09615385, 0.35135135, 0.52040816, 0.        ],
        [0.11538462, 0.40540541, 0.52040816, 1.        ],
        [0.96153846, 0.3963964 , 0.55102041, 0.        ],
        [0.69230769, 0.77477477, 0.23469388, 0.        ],
        [0.25      , 0.51351351, 0.71428571, 0.        ],
        [0.05769231, 0.        , 0.81632653, 1.        ],
        [0.63461538, 0.26126126, 0.5       , 0.        ],
        [0.32692308, 0.07207207, 0.98979592, 0.        ],
        [0.23076923, 0.75675676, 0.97959184, 1.        ],
        [0.038