In [175]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.impute import MissingIndicator

In [176]:
train = pd.read_csv('data/initial_data/train.csv')
test = pd.read_csv('data/initial_data/test.csv')

In [177]:
def process_train(df, drop):
    df['Price'] = df.apply(lambda x: int((x['Price'][1:]).split(".")[0].replace(",", "")), axis=1)
    df['Price_per_slot'] = df.apply(lambda x: x['Price'] / x['Accommodates'], axis=1)
    df['Large'] = df.apply(lambda x: 1 if x['Accommodates'] >= 8 else 0, axis=1)
    df['Missing'] = [int(sum(x) > 0) for x in MissingIndicator().fit_transform(df)]
    df['Host_is_superhost'] = df.apply(lambda x: 1 if x['Host_is_superhost'] == 'f' else 
                                       (0 if x['Host_is_superhost'] == 't' else x['Host_is_superhost']), axis=1)
    df['Host_identity_verified'] = df.apply(lambda x: 1 if x['Host_identity_verified'] == 'f' else 
                                       (0 if x['Host_identity_verified'] == 't' else x['Host_identity_verified']), axis=1)
    df['Host_is_superhost'] = df.apply(lambda x: 1 if x['Instant_bookable'] == 'f' else 
                                       (0 if x['Instant_bookable'] == 't' else x['Instant_bookable']), axis=1)
    df['Neighbourhood'] = df.apply(lambda x: str(x['Neighbourhood']), axis=1)
    print(df.shape)
    labels = df[['id','Decision']]
    X = df.drop(columns = ['Decision', 'id'], axis=1)

    drop = ['Host_has_profile_pic', 'Property_type',
           'Bathrooms_text', 'Balcony', 'Parking', 'Bedrooms', 'Beds']
    
    col_dtypes = list(zip(X.dtypes.index, X.dtypes.values))
    numeric_cols = [x for x, y in col_dtypes if y.name in ['int64','float64'] and x not in drop]
    print(numeric_cols)
    categorical_cols = [x for x, y in col_dtypes if y.name not in ['int64','float64'] and x not in drop]
    print(categorical_cols)


    numeric_pipe = Pipeline([('standard_scaler', StandardScaler()), ('knn_imputer',  
                            KNNImputer(n_neighbors=5, weights="uniform"))])

    categorical_pipe = Pipeline([('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='missing')),
                                  ('one_hot', OneHotEncoder())])

    all_pipelines = [("drop", 'drop', drop), ("numeric", numeric_pipe, numeric_cols), 
                     ("categorical", categorical_pipe, categorical_cols)]
    pipeline = ColumnTransformer(all_pipelines, remainder='passthrough')
    
    X_transformed = pipeline.fit_transform(X)
    return pd.DataFrame(X_transformed), labels, pipeline

def process_test(df, drop, pipeline):
    df['Price'] = df.apply(lambda x: int((x['Price'][1:]).split(".")[0].replace(",", "")), axis=1)
    df['Price_per_slot'] = df.apply(lambda x: x['Price'] / x['Accommodates'], axis=1)
    df['Large'] = df.apply(lambda x: 1 if x['Accommodates'] >= 8 else 0, axis=1)
    df['Missing'] = [int(sum(x) > 0) for x in MissingIndicator().fit_transform(df)]
    df['Host_is_superhost'] = df.apply(lambda x: 1 if x['Host_is_superhost'] == 'f' else 
                                       (0 if x['Host_is_superhost'] == 't' else x['Host_is_superhost']), axis=1)
    df['Host_identity_verified'] = df.apply(lambda x: 1 if x['Host_identity_verified'] == 'f' else 
                                       (0 if x['Host_identity_verified'] == 't' else x['Host_identity_verified']), axis=1)
    df['Host_is_superhost'] = df.apply(lambda x: 1 if x['Instant_bookable'] == 'f' else 
                                       (0 if x['Instant_bookable'] == 't' else x['Instant_bookable']), axis=1)
    df['Neighbourhood'] = df.apply(lambda x: str(x['Neighbourhood']), axis=1)
    
    labels = df['id']
    X = df.drop(columns = ['id'], axis=1)
    
    X_transformed = pipeline.transform(X)
    return pd.DataFrame(X_transformed), labels

In [178]:
drop = ['Host_has_profile_pic', 'Property_type', 'Bathrooms_text', 
        'Balcony', 'Parking', 'Bedrooms', 'Beds']

train_transformed, train_labels, pipeline = process_train(train, drop)
test_transformed, test_labels = process_test(test, drop, pipeline)

(7471, 25)
['Host_is_superhost', 'Host_identity_verified', 'Accommodates', 'Essentials', 'Cooking', 'Price', 'Number_of_reviews', 'Review_scores_rating', 'Price_per_slot', 'Large', 'Missing']
['Host_response_time', 'Neighbourhood', 'Room_type', 'Instant_bookable', 'Month']


In [179]:
train_transformed.to_csv('data/train.csv',index=False)
train_labels.to_csv('data/train_labels.csv',index=False)
test_transformed.to_csv('data/test.csv',index=False)
test_labels.to_csv('data/test_labels.csv',index=False)