## Function for adjusting the test dataset

In [1]:
import pandas as pd
from ipynb.fs.full.Master import distance_from

In [2]:
# Ordinal codes for the categorical columns. Labels that are not listed fall
# back to the highest code, mirroring the original nested conditionals.
_VIEW_CODES = {'NONE': 0, 'FAIR': 1, 'AVERAGE': 2, 'GOOD': 3}
_VIEW_FALLBACK = 4
_CONDITION_CODES = {'Poor': 1, 'Fair': 2, 'Average': 3, 'Good': 4}
_CONDITION_FALLBACK = 5

# Columns removed before returning. 'date' appears exactly once: dropping it
# twice raises KeyError with pandas' default errors='raise'.
_DROPPED_COLUMNS = [
    'date', 'grade', 'sqft_above', 'sqft_living15', 'sqft_lot15', 'id',
    'zipcode', 'lat', 'long', 'condition', 'yr_built', 'sqft_lot',
    'yr_renovated', 'view',
]


def _basement_sqft(value):
    """Return a basement area as an int, treating the '?' placeholder as 0."""
    if value == '?':
        return 0
    # Going through float accepts both '400.0'-style strings and numeric input.
    return int(float(value))


def clean_data(df):
    """Apply this project's cleaning steps to a housing dataframe.

    The frame is modified in place and the same object is returned.

    Steps, in order:
      * fill missing 'view', 'waterfront' and 'yr_renovated' values,
      * encode 'grade', 'waterfront', 'view' and 'condition' as integers,
      * convert 'sqft_basement' to int ('?' becomes 0),
      * split 'date' (month/day/year separated by '/') into 'month_sold'
        and 'year_sold',
      * drop every column listed in ``_DROPPED_COLUMNS``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column referenced above and in
        ``_DROPPED_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after cleaning.

    Raises
    ------
    KeyError
        If an expected column is missing.
    """
    # Fill NaN values with the most common occurrences. Assigning the result
    # back (rather than a chained ``inplace=True`` call on the column) is the
    # supported way to update the frame.
    df['view'] = df['view'].fillna('NONE')
    df['waterfront'] = df['waterfront'].fillna('NO')
    df['yr_renovated'] = df['yr_renovated'].fillna(0.0)

    # Grade labels look like '7 Average' / '10 Very Good': keep the leading number.
    df['grade'] = df['grade'].str.split(n=1).str[0].astype('int64')

    # Remove '?' placeholders and convert sqft_basement to int.
    df['sqft_basement'] = df['sqft_basement'].apply(_basement_sqft)

    # Waterfront: 'YES' -> 1, anything else -> 0.
    df['waterfront'] = (df['waterfront'] == 'YES').astype('int64')

    # View and condition: ordinal encodings with a fallback for other labels.
    df['view'] = df['view'].apply(
        lambda label: _VIEW_CODES.get(label, _VIEW_FALLBACK))
    df['condition'] = df['condition'].apply(
        lambda label: _CONDITION_CODES.get(label, _CONDITION_FALLBACK))

    # Convert the date column into separate month and year columns.
    date_parts = df['date'].str.split('/', expand=True)
    df['month_sold'] = date_parts[0].astype('int64')
    df['year_sold'] = date_parts[2].astype('int64')

    # NOTE: earlier revisions carried commented-out steps here (filtering
    # bedroom/bathroom outliers, rounding bathrooms and floors, and deriving
    # distance_seattle / distance_redmond via distance_from). They are not
    # applied to this dataset.

    # Drop columns we removed from the original dataset. Some of the columns
    # encoded above (grade, view, condition, yr_renovated) are in this list,
    # so their cleaned values are not part of the returned frame.
    df.drop(columns=_DROPPED_COLUMNS, inplace=True)

    return df