In [185]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [186]:
df = pd.read_csv('../data/kc_house_data_cleaned.csv', parse_dates=['date'], index_col=[0])

In [187]:
df['renovated'] = np.where(df.yr_renovated > 0, 1, 0)
df['since_reno'] = np.where(df.yr_renovated < 2015, (1/(2015 - df.yr_renovated)), 0)

In [188]:
df['has_basement'] = np.where(df.sqft_basement > 0, 1, 0)
df['basement_lot_pct'] = df.sqft_basement / df.sqft_living * 100
df['aboveground_lot_pct'] = df.sqft_above / df.sqft_living * 100
df['PctofLot'] = round((df.sqft_living / df.sqft_lot)*100,2)

In [189]:
df['age'] = np.where(df.yr_built < 2015, (1/(2015 - df.yr_built)), 0)

In [190]:
df['bathrooms_adjusted'] = 2*df.bathrooms
df['bathrooms_adjusted'] = df.bathrooms_adjusted.astype('int64')

In [191]:
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'renovated', 'since_reno',
       'has_basement', 'basement_lot_pct', 'aboveground_lot_pct', 'PctofLot',
       'age', 'bathrooms_adjusted'],
      dtype='object')

In [192]:
df.dtypes

id                              int64
date                   datetime64[ns]
price                         float64
bedrooms                        int64
bathrooms                     float64
sqft_living                     int64
sqft_lot                        int64
floors                        float64
waterfront                      int64
view                            int64
condition                       int64
grade                           int64
sqft_above                      int64
sqft_basement                   int64
yr_built                        int64
yr_renovated                    int64
zipcode                         int64
lat                           float64
long                          float64
sqft_living15                   int64
sqft_lot15                      int64
renovated                       int32
since_reno                    float64
has_basement                    int32
basement_lot_pct              float64
aboveground_lot_pct           float64
PctofLot    

In [193]:
discrete = ['waterfront',  'renovated', 'has_basement', 'since_reno', 'bedrooms', 'bathrooms_adjusted', 'floors',
            'view', 'condition', 'grade', 'yr_built', 'yr_renovated', 'age']

In [194]:
continuous = ['id', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'yr_built',
              'sqft_living15', 'sqft_lot15', 'basement_lot_pct', 'aboveground_lot_pct', 
              'PctofLot', 'bathrooms']

In [195]:
X[discrete] = X[discrete].astype('int64')
X[discrete].dtypes

waterfront            int64
renovated             int64
has_basement          int64
since_reno            int64
bedrooms              int64
bathrooms_adjusted    int64
floors                int64
view                  int64
condition             int64
grade                 int64
yr_built              int64
yr_renovated          int64
age                   int64
dtype: object

In [196]:
X[continuous] = X[continuous].astype('float64')
X[continuous].dtypes

KeyError: "['id'] not in index"

In [197]:
from sklearn.feature_selection import mutual_info_regression

In [198]:
feats = [continuous] + [discrete]

In [199]:
X = df.copy()
y = df.pop('price')

for colname in X.select_dtypes("object"):
    X[colname], _ = X[colname].factorize()
    
discrete_features = X.dtypes == int

In [200]:
discrete_features

id                     False
date                   False
price                  False
bedrooms               False
bathrooms              False
sqft_living            False
sqft_lot               False
floors                 False
waterfront             False
view                   False
condition              False
grade                  False
sqft_above             False
sqft_basement          False
yr_built               False
yr_renovated           False
zipcode                False
lat                    False
long                   False
sqft_living15          False
sqft_lot15             False
renovated               True
since_reno             False
has_basement            True
basement_lot_pct       False
aboveground_lot_pct    False
PctofLot               False
age                    False
bathrooms_adjusted     False
dtype: bool

In [201]:
#discrete_features = discrete
X.drop('id', axis=1, inplace=True)
X.drop('date', axis=1, inplace=True)
X.drop('price', axis=1, inplace=True)
X

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,...,sqft_living15,sqft_lot15,renovated,since_reno,has_basement,basement_lot_pct,aboveground_lot_pct,PctofLot,age,bathrooms_adjusted
0,3,1.00,1180,5650,1.0,0,0,3,7,1180,...,1340,5650,0,0.000496,0,0.000000,100.000000,20.88,0.016667,2
1,3,2.25,2570,7242,2.0,0,0,3,7,2170,...,1690,7639,1,0.041667,1,15.564202,84.435798,35.49,0.015625,4
2,2,1.00,770,10000,1.0,0,0,3,6,770,...,2720,8062,0,0.000496,0,0.000000,100.000000,7.70,0.012195,2
3,4,3.00,1960,5000,1.0,0,0,5,7,1050,...,1360,5000,0,0.000496,1,46.428571,53.571429,39.20,0.020000,6
4,3,2.00,1680,8080,1.0,0,0,3,8,1680,...,1800,7503,0,0.000496,0,0.000000,100.000000,20.79,0.035714,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,3,2.50,1530,1131,3.0,0,0,3,8,1530,...,1530,1509,0,0.000496,0,0.000000,100.000000,135.28,0.166667,5
21609,4,2.50,2310,5813,2.0,0,0,3,8,2310,...,1830,7200,0,0.000496,0,0.000000,100.000000,39.74,1.000000,5
21610,2,0.75,1020,1350,2.0,0,0,3,7,1020,...,1020,2007,0,0.000496,0,0.000000,100.000000,75.56,0.166667,1
21611,3,2.50,1600,2388,2.0,0,0,3,8,1600,...,1410,1287,0,0.000496,0,0.000000,100.000000,67.00,0.090909,5


In [202]:
discrete_features = X.dtypes == 'int64'

In [203]:
discrete_features

bedrooms                True
bathrooms              False
sqft_living             True
sqft_lot                True
floors                 False
waterfront              True
view                    True
condition               True
grade                   True
sqft_above              True
sqft_basement           True
yr_built                True
yr_renovated            True
zipcode                 True
lat                    False
long                   False
sqft_living15           True
sqft_lot15              True
renovated              False
since_reno             False
has_basement           False
basement_lot_pct       False
aboveground_lot_pct    False
PctofLot               False
age                    False
bathrooms_adjusted      True
dtype: bool

In [204]:
def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

In [205]:
mi_scores = make_mi_scores(X, y, discrete_features)
mi_scores[::3]

zipcode             0.454231
lat                 0.341712
sqft_lot15          0.238844
sqft_lot            0.122550
bedrooms            0.081188
age                 0.070475
basement_lot_pct    0.036548
condition           0.012562
yr_renovated        0.008014
Name: MI Scores, dtype: float64

In [98]:
mi_scores = make_mi_scores(X, y)
mi_scores[::3]  

TypeError: make_mi_scores() missing 1 required positional argument: 'discrete_features'

In [100]:
mi_scores = mutual_info_regression(X[discrete], y)

MemoryError: Unable to allocate 164. KiB for an array with shape (20982,) and data type int64

In [74]:
mi_scores = make_mi_scores(X, y, discrete_features)
mi_scores

TypeError: invalid type promotion