In [97]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin
from sklearn.impute import KNNImputer, SimpleImputer

In [98]:
# Read the raw train / test splits from disk (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/initial_data/train.csv', 'data/initial_data/test.csv')
)


In [99]:
def process(df, is_train = True):
    """Turn a raw listings frame into a transformed feature matrix plus its labels.

    Steps, in order:

    1. Parse ``Price`` strings into whole-number amounts (first character
       dropped, everything after the first ``.`` dropped, ``,`` removed).
    2. Derive ``Price_ratio`` = ``Price`` / ``Accommodates``.
    3. Split off ``id`` (and ``Decision`` for training data) as labels.
    4. Discard the columns in the internal ``drop`` list, standard-scale and
       KNN-impute the ``int64``/``float64`` columns, and mode-impute +
       one-hot encode every other column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. Must contain ``Price`` (strings with a one-character
        prefix, presumably a currency symbol), ``Accommodates``, ``id``,
        every column named in the ``drop`` list below and, when ``is_train``
        is true, ``Decision``. The frame is copied and never modified, so the
        function can be called repeatedly on the same input.
    is_train : bool, default True
        When true, ``Decision`` is removed from the features and returned
        alongside ``id``; otherwise only ``id`` is split off.

    Returns
    -------
    X_transformed
        Whatever ``ColumnTransformer.fit_transform`` produces for the
        feature columns (dense array or sparse matrix, decided by
        scikit-learn).
    labels : pandas.DataFrame or pandas.Series
        ``df[['id', 'Decision']]`` when ``is_train`` is true, else ``df['id']``.

    Notes
    -----
    NOTE(review): the transformer is fitted on every call, so a train frame
    and a test frame are scaled, imputed and one-hot encoded independently
    of each other; their output columns are therefore not guaranteed to line
    up. Sharing one fitted transformer would need a change to this
    function's return value.
    """
    # Work on a copy: the original version rewrote `Price` in the caller's
    # frame, which made a second call on the same frame fail.
    df = df.copy()

    # Vectorised equivalent of int(price[1:].split(".")[0].replace(",", "")).
    # pd.to_numeric keeps the column int64 when every price is present and
    # lets missing prices through as NaN for the imputer instead of crashing.
    whole_part = (
        df['Price']
        .str[1:]
        .str.split(".").str[0]
        .str.replace(",", "", regex=False)
    )
    df['Price'] = pd.to_numeric(whole_part)

    # A zero in `Accommodates` is masked to NaN so the ratio is imputed later
    # rather than becoming a division error / infinite value.
    guests = df['Accommodates'].where(df['Accommodates'] != 0)
    df['Price_ratio'] = df['Price'] / guests

    if is_train:
        labels = df[['id','Decision']]
        X = df.drop(columns = ['Decision', 'id'])
    else:
        labels = df['id']
        X = df.drop(columns = ['id'])

    # Columns discarded outright by the ColumnTransformer.
    drop = ["Neighbourhood", 'Host_has_profile_pic', 'Property_type',
           'Bathrooms_text', 'Balcony', 'Parking']

    # Same split rule as before: exactly int64/float64 count as numeric,
    # anything else (object, bool, ...) goes down the categorical branch.
    numeric_dtype_names = ('int64', 'float64')
    numeric_cols = []
    categorical_cols = []
    for col, dtype in X.dtypes.items():
        if col in drop:
            continue
        if dtype.name in numeric_dtype_names:
            numeric_cols.append(col)
        else:
            categorical_cols.append(col)

    # Scale first, then fill gaps from the 5 nearest rows in scaled space.
    numeric_pipe = Pipeline([('standard_scaler', StandardScaler()), ('knn_imputer',
                            KNNImputer(n_neighbors=5, weights="uniform"))])

    categorical_pipe = Pipeline([('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                                  ('one_hot', OneHotEncoder())])

    all_pipelines = [("drop", 'drop', drop), ("numeric", numeric_pipe, numeric_cols),
                     ("categorical", categorical_pipe, categorical_cols)]
    pipeline = ColumnTransformer(all_pipelines, remainder='passthrough')

    X_transformed = pipeline.fit_transform(X)
    return X_transformed, labels

In [100]:
# Preprocess both splits.
# NOTE(review): `train` / `test` are rebound from the raw frames to the
# transformed matrices here, so this cell cannot be re-run without first
# re-running the loading cell.
train, train_labels = process(train)
test, test_labels = process(test, is_train=False)

In [105]:
# Persist the transformed features and their labels.
# The feature matrices come out of a ColumnTransformer containing a
# OneHotEncoder, so scikit-learn may return them as SciPy sparse matrices;
# pd.DataFrame() is not documented to expand those into a numeric table,
# so convert to a dense array first when needed.
for features, out_path in ((train, 'data/train.csv'), (test, 'data/test.csv')):
    dense = features.toarray() if hasattr(features, 'toarray') else features
    pd.DataFrame(dense).to_csv(out_path, index=False)

train_labels.to_csv('data/train_labels.csv', index=False)
test_labels.to_csv('data/test_labels.csv', index=False)