## III. Imputation and Encoding


### Get data and desired features

In [None]:
# data = pd.read_csv('../data_built_features.csv')

In [None]:
# data.shape

In [None]:
for col in data.columns.values:
    print col

In [None]:
# we should return to this and check which covariates we've selected 

cols = ['inspection_date',
        'facility_type',
        'latitude',
        'longitude',
        'results',
        'risk',
        'business_activity_Consumption of Liquor on Premises',
        'business_activity_Preparation of Food and Dining on Premise With Seating',
        'business_activity_Retail Sale of Tobacco',
        'business_activity_Retail Sales of Packaged Liquor',
        'business_activity_Retail Sales of Perishable Foods',
        'Canvass', # inspection_type
        'License',
        'Canvass Re-Inspection',
        'Complaint',
        'License Re-Inspection',
        'Short Form Complaint',
        'Complaint Re-Inspection',
        'Suspected Food Poisoning',
        'Consultation',
        'License-Task Force',
        'point_crime_count',
        'point_sanit_count',
        'result_binary',
        'TMAX',
        'TMAX_3DayAvg',
        'TMAX_10DayAvg',
        'TMAX_30DayAvg',
        'previous_count',
        'previous_fraction',
        'previous_result',
        'time_since_last_inspection',
        'previous_violations',
        'previous_citations',
        'previous_critical',
        'previous_serious',
        'previous_minor',
        'previous_corrected']

print data[cols].shape
print data.shape

# select specific features
data_sub = data[cols]
# what would happen if we dropped NaNs
print data_sub.dropna(axis=0, how='any').shape

In [None]:
# Rebind data_sub to an explicit copy of itself, so the column assignments made
# in later cells operate on its own data rather than on a selection of `data`.
data_sub = data_sub.copy()

In [None]:
# Temporary: load the file holding the corrected crime and sanitation-complaint
# data so those columns can be copied into the working dataset.
# The path is relative to the directory the notebook is run from.
data_dump = pd.read_csv('../data_with_crime_sanit.csv')

In [None]:
# Copy the corrected crime / sanitation columns from data_dump into data_sub
# (target column in data_sub, source column in data_dump).
# NOTE(review): pandas aligns this assignment on index labels, so it assumes
# data_dump's rows share data_sub's index -- confirm.
for target_name, source_name in (('crime', 'crime'), ('sanit', 'sanitation')):
    data_sub[target_name] = data_dump[source_name].copy()

In [None]:
# convert string to datetime to be safe
data_sub['inspection_date'] = pd.to_datetime(data_sub['inspection_date'])

print data_sub['inspection_date'].min()
print data_sub['inspection_date'].max()


### Imputation and encoding 

In [None]:
# non-KNN approach:
# replace null values with feature mean (quantitative) or feature mode (qualitative)
# for column in data_sub.columns:
#     if (data_sub[column].unique().shape[0] < 8) or (data_sub[column].dtype == np.dtype('object')):
#         data_sub[column].fillna(value = data_sub[column].mode()[0], inplace = True)
#     else:
#         data_sub[column].fillna(value = data_sub[column].mean(), inplace = True)

# check no null values
nan_cnt = 0
for i, column in enumerate(data_sub.columns):
    print i, column, np.array(data_sub[column].isnull()).sum()
    nan_cnt = nan_cnt + np.array(data_sub[column].isnull()).sum()
    
print 'total cells:', data_sub.size
print 'nan_cnt:', nan_cnt
print 'ratio for sam:', nan_cnt / float(data_sub.size)



In [None]:
# how should we handle the historical count data?
# there's no optimal solution: either (1) impute, biasing that they wouldn't have done well 
# or (2) set to 0, biasing that they would've done well? We'll set to 0 for simplicity.

print 'count of obs for which previous_count == 0:', data_sub[data_sub['previous_count'] == 0.0].shape[0]
print 'this corresponds with number of nans for previous_fraction, previous_result, time_since_last_inspection, which is what we\'d expect'

# set all these to 0 then
hist_cols = ['previous_fraction',
            'previous_result',
            'time_since_last_inspection',
            'previous_violations',
            'previous_citations',
            'previous_critical',
            'previous_serious',
            'previous_minor',
            'previous_corrected']

for hist_col in hist_cols:
    data_sub.loc[data_sub['previous_count'] == 0.0, hist_col] = data_sub.loc[data_sub['previous_count'] == 0.0, hist_col].fillna(value = 0)
    
print '\n here are NaN counts after this step: \n'
for i, column in enumerate(data_sub.columns):
    print i, column, np.array(data_sub[column].isnull()).sum()
    
print '37-42 still have a uniform number of null values because they had no text from the inspection, so nothing to scrape'

In [None]:
# obs with no null values
print 'count of obs with no null values:', data_sub.shape[0] - data_sub.isnull().any(axis = 1).sum()
print 'count of features with no null values:', data_sub.shape[1] - data_sub.isnull().any(axis = 0).sum()

In [None]:
# Names of the features with no null values -- the ones usable as predictors
# when imputing the rest. Left as the cell's last expression so it is displayed.
fully_populated = ~data_sub.isnull().any(axis=0)
fully_populated[fully_populated].index.values

In [None]:
# so that the imputer has enough predictors, to work with, fill in risk by mean / mode approach since only 63 obs have nulls

data_sub['risk'].fillna(value = data_sub['risk'].mode()[0], inplace = True)

# same
# data_sub['inspection_type'].fillna(value = data_sub['inspection_type'].mode()[0], inplace = True)

# check
print 'number of null values in risk:', data_sub['risk'].isnull().sum()
# print 'number of null values in inspection_type:', data_sub['inspection_type'].isnull().sum()

In [None]:
# KNN 
from sklearn.neighbors import NearestNeighbors

In [None]:
# have to encode the data first... but facility_type is annoying to do...
print data_sub.facility_type.value_counts()[0:10]

take_types = data_sub.facility_type.value_counts()[0:50].index.values

In [None]:
# Keep only the fifty most frequent facility types (take_types) for now and
# collapse every other value into a single 'Other' level.
#
# This replaces a row-by-row iterrows() loop with one boolean mask, which
# answers the original "a faster approach?" question: the step is now cheap
# enough to rerun. The old progress print is gone; it sat after an
# `else: continue`, so it only ever fired for rows that had just been rewritten.
#
# Missing facility_type values are not in take_types, so they become 'Other'
# too, exactly as the loop treated them.
outside_top_types = ~data_sub['facility_type'].isin(take_types)
data_sub.loc[outside_top_types, 'facility_type'] = 'Other'


# it's encoded in this, so just kidding
# hack_data = data = pd.read_csv('../data_ready.csv')
# hack_data['facility_type']

In [None]:
# check response
print 'Useless response count:', sum((data_sub.results != 'Pass') & (data_sub.results != 'Pass w/ Conditions') & (data_sub.results != 'Fail'))
indices = (data_sub.results == 'Pass') | (data_sub.results == 'Pass w/ Conditions') | (data_sub.results == 'Fail')
data_sub = data_sub[indices]

In [None]:
# risk: keep only rows whose risk is one of the three recognised levels.
recognised_risks = ['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)']
indices = data_sub.risk.isin(recognised_risks)
data_sub = data_sub[indices]

In [None]:
# The results / risk filters dropped rows, leaving gaps in the index labels.
# Renumber the rows and discard the old labels (drop=True) rather than keeping
# them as a column.
data_sub = data_sub.reset_index(drop=True)

In [None]:
# These have to be encoded first: every object-dtype (string) column except the
# raw `results` label is expanded into indicator columns with get_dummies; all
# other columns are carried over unchanged, in their original order.
#
# The pieces are collected and concatenated once. Calling pd.concat inside the
# loop recopies the growing frame on every pass.
encoded_pieces = []
for column in data_sub.columns:
    # alternative rule, disabled: also encode low-cardinality columns
    #     if (data_sub[column].unique().shape[0] < 8) or data_sub[column].dtype == np.dtype('object'):
    is_text = data_sub[column].dtype == np.dtype('object')
    if is_text and column != 'results':
        encoded_pieces.append(pd.get_dummies(data_sub[column]))
    else:
        # disabled variant that cast to float:
        #     data_sub[[column]].astype(float)
        encoded_pieces.append(data_sub[[column]])

# pd.concat raises on an empty list, so fall back to an empty frame.
if encoded_pieces:
    data_sub_encoded = pd.concat(encoded_pieces, axis=1)
else:
    data_sub_encoded = pd.DataFrame({})

In [None]:
# Split on completeness:
#   train = observations with every feature present
#   test  = observations with at least one missing feature (the ones to impute)
row_is_incomplete = data_sub_encoded.isnull().any(axis=1).values
train = data_sub_encoded[~row_is_incomplete]
test = data_sub_encoded[row_is_incomplete]

In [None]:
# Both frames are row subsets, so renumber each one. Without this,
# NearestNeighbors was returning indices that did not match the frame's labels.
train, test = (frame.reset_index(drop=True) for frame in (train, test))

In [None]:
# dont actually have to split x and y since we specify what features we're imputing on in impute_cols
# x_train = train.drop(['result_binary', 'results'], axis = 1) 
# x_test = test.drop(['result_binary', 'results'], axis = 1) 

In [None]:
# Column names of data_sub that contain no null values (displayed as the cell output).
data_sub.columns[data_sub.notnull().all(axis=0).values].values

In [None]:
# Features used to measure distance between observations in the KNN step.
# Should also be using 'inspection_type' and 'facility_type', but I will return
# to this -- was frustrated with the encoding! They have to be encoded to work
# with KNN, but once encoded it is hard to pick their columns back out.
# The three 'Risk ...' names are the risk levels the rows were filtered to
# earlier; presumably they are the indicator columns get_dummies made from
# `risk` -- confirm.

impute_cols = ['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'previous_count', 'previous_fraction',
       'previous_result', 'time_since_last_inspection']

In [None]:
# Build the neighbour index over the complete observations (train), using only
# the impute_cols features; each query will return 5 neighbours.
knn = NearestNeighbors(n_neighbors = 5)
knn.fit(train[impute_cols]) 

In [None]:
# return k nearest neighbors based on features we have for all observations
# then fill in other values using these nearest neighbors 

import math
from scipy import stats

for i, row in test.iterrows(): # for each observation with incomplete features
    ind = knn.kneighbors(X = row[impute_cols].values.reshape(1,-1), return_distance=False)[0] # return indices of nearest neighbors with complete features
    for col in test.columns.values[row.isnull().values]: # for each feature that're NaNs    
        train_vals = np.array(train.loc[ind, col]) # get vals from nearest neighbors for this col
        if train[col].unique().shape[0] <= 2: # if indicator 
            test.loc[i, col] = stats.mode(train_vals)[0][0] # fill w/ mode
        else:
            test.loc[i, col] = train_vals.mean() # fill w/ mean
    if (i % 500 == 0):
        print 'finished iteration:', i

In [None]:
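# this won't work as written: impute_row never returns the row, so apply()
# gets None back for every row (original note: "apply flattens the thing")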
# import math
# from scipy import stats

# def impute_row(row):
#     ind = knn.kneighbors(X = row[impute_cols].values.reshape(1, -1), return_distance=False)[0]
#     for col in test.columns.values[row.isnull().values]: # for each feature that're NaNs    
#         train_vals = np.array(train.loc[ind, col])
#         if train[col].unique().shape[0] <= 2:
#             row[col] = stats.mode(train_vals)[0][0]
#         else:
#             row[col] = train_vals.mean()

# test = test.apply(impute_row, axis = 1)


In [None]:
# Stack the complete rows and the now-imputed rows back into one frame
# (train first, then test) and renumber the combined rows.
data_ready = pd.concat([train, test], axis=0).reset_index(drop=True)

In [None]:
# reorder = ['inspection_date','result_binary','results','point_crime_count', 'point_sanit_count',
#       'latitude', 'longitude','TMAX','TMAX_3DayAvg', 'TMAX_10DayAvg', 'TMAX_30DayAvg',
#      'Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)',
#         'previous_count','previous_fraction', 'previous_result',
#        'time_since_last_inspection', 'previous_violations',
#        'previous_citations', 'previous_critical', 'previous_serious',
#        'previous_minor', 'previous_corrected','inspection_type',
#         "1023 CHILDERN'S SERVICES FACILITY",
#        "1023-CHILDREN'S SERVICES FACILITY", 'ASSISTED LIVING', 'BANQUET',
#        'BANQUET HALL', 'Bakery', 'CAFETERIA', 'CHURCH', 'CHURCH KITCHEN',
#        'CONVENIENCE', 'CONVENIENCE STORE', 'CULINARY SCHOOL', 'Catering',
#        "Children's Services Facility", 'DAYCARE', 'Daycare (2 - 6 Years)',
#        'Daycare (2 Years)', 'Daycare (Under 2 Years)',
#        'Daycare Above and Under 2 Years', 'Daycare Combo 1586',
#        'GAS STATION', 'GAS STATION/MINI MART', 'GROCERY/RESTAURANT',
#        'Golden Diner', 'Grocery Store', 'Hospital', 'KIOSK',
#        'LIVE POULTRY', 'Liquor', 'Long Term Care', 'Mobile Food Dispenser',
#        'Mobile Food Preparer', 'Navy Pier Kiosk', 'Other',
#        'PRIVATE SCHOOL', 'RESTAURANT/BAR', 'RESTAURANT/GROCERY STORE',
#        'ROOF TOPS', 'ROOFTOP', 'Restaurant', 'STADIUM', 'STORE', 'School',
#        'Shared Kitchen', 'Shared Kitchen User (Long Term)',
#        'Shared Kitchen User (Short Term)', 'Shelter', 'Special Event',
#        'TAVERN', 'Wholesale', 'convenience store', 
#        'business_activity_Catering of Liquor To Events',
#        'business_activity_Consumption of Liquor on Premises',
#        'business_activity_Hotel - 7 or More Sleeping Rooms',
#        'business_activity_Operation of a Fuel Filling Station',
#        'business_activity_Preparation of Food and Dining on Premise With Seating',
#        'business_activity_Provides Onsite Amusement or Entertainment',
#        'business_activity_Retail Sale of Tobacco',
#        'business_activity_Retail Sales of Packaged Liquor',
#        'business_activity_Retail Sales of Packaged Liquor on Sundays from 8AM - 11AM | Retail Sales of Packaged Liquor',
#        'business_activity_Retail Sales of Packaged Liquor | Retail Sales of Packaged Liquor on Sundays from 8AM - 11AM',
#        'business_activity_Retail Sales of Perishable Foods',
#        'business_activity_Retail Sales of Tobacco Products',
#        'business_activity_Sale of Food Prepared Onsite With Dining Area',
#        'business_activity_Sale of Food Prepared Onsite Without Dining Area',
#        'business_activity_Sale of Liquor Outdoors on Private Property',
#        'business_activity_Sale of Liquor Until 4am, Monday - Saturday and 5am on Sunday',
#        'business_activity_Supervision of, and Care for, Children 0-6 Years of Age, During the Day Between 6am-9pm',
#        'business_activity_Supervision of, and Care for, Children 2-6 Years of Age, During the Day Between 6am-9pm',
#        'business_activity_Supervision of, and Care for, Children Under 2 Years of Age, During the Day Between 6am-9pm | Supervision of, and Care for, Children 2-6 Years of Age, During the Day Between 6am-9pm',
#        'business_activity_Tavern - Consumption of Liquor on Premise']

# data_ready = data_ready[reorder]

In [None]:
# Persist the imputed, encoded dataset so the cells above do not have to be rerun.
# NOTE(review): this is an absolute path into one user's home directory, while
# the reads in this notebook use relative '../' paths -- it will fail on any
# other machine; consider a relative path.
# NOTE(review): index=False is not passed, so pandas presumably writes the row
# index as an extra first column -- confirm that downstream readers expect it.
data_ready.to_csv('/Users/jeremywelborn1/Documents/Jeremy/Harvard/Classes/III_Junior/1st_CS109a/fggw/data_ready.csv')