In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from category_encoders import MEstimateEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
import os
from typing import List, Tuple
import joblib
import cloudpickle
import pickle
from sklearn.impute import SimpleImputer

In [8]:
# Both splits are parsed with identical options: the CSV column named "index"
# becomes the DataFrame index, and low_memory=False turns off chunked parsing
# so each column's dtype is inferred from the whole file.
csv_options = {"index_col": "index", "low_memory": False}
train_data = pd.read_csv("../data/train.csv", **csv_options)
test_data = pd.read_csv("../data/test.csv", **csv_options)

# Columns with more missing values than these limits are discarded below.
MAX_MISSING_NUMERICAL = 19211
MAX_MISSING_CATEGORICAL = 12701

# Start from the dtype-based split of the training columns.
object_columns = train_data.select_dtypes(include="object").columns
numeric_columns = train_data.select_dtypes(include="number").columns

# Numeric-dtype columns that are kept as numerical features. Every other
# numeric-dtype column is handled as a categorical feature instead.
numerical_features_new_set = [
    'easements',
    'lotarea',
    'bldgarea',
    'comarea',
    'resarea',
    'officearea',
    'retailarea',
    'garagearea',
    'strgearea',
    'factryarea',
    'otherarea',
    'numbldgs',
    'numfloors',
    'unitstotal',
    'lotfront',
    'lotdepth',
    'bldgfront',
    'bldgdepth',
    'assessland',
    'assesstot',
    'exemptland',
    'exempttot',
    'builtfar',
    'xcoord',
    'ycoord',
]

extra_categorical_features = [
    name for name in numeric_columns if name not in numerical_features_new_set
]
categorical_candidates = [*object_columns, *extra_categorical_features]

# Keep only the numerical columns whose missing-value count is within the limit.
# `isnull().sum()` returns counts in the same order as the requested columns,
# so the names and counts can be paired positionally.
numerical_missing = train_data[numerical_features_new_set].isnull().sum()
numerical_features = [
    name
    for name, n_missing in zip(numerical_features_new_set, numerical_missing)
    if n_missing <= MAX_MISSING_NUMERICAL
]

# Same filter for the categorical candidates, with their own limit.
categorical_missing = train_data[categorical_candidates].isnull().sum()
categorical_features = [
    name
    for name, n_missing in zip(categorical_candidates, categorical_missing)
    if n_missing <= MAX_MISSING_CATEGORICAL
]

# Columns excluded from the categorical set regardless of missingness.
features_to_drop = ['yearalter1', 'yearalter2']
categorical_features = [
    name for name in categorical_features if name not in features_to_drop
]


# Route each categorical feature to an encoder by its number of distinct values
# in the training data: more than ONE_HOT_MAX_LEVELS -> target encoding,
# otherwise one-hot encoding.
# Note: `unique()` does not drop missing entries, so NaN counts as a level here.
ONE_HOT_MAX_LEVELS = 5

one_hot_encode_columns, target_encode_columns = [], []
for column in categorical_features:
    n_levels = len(train_data[column].unique())
    destination = (
        target_encode_columns if n_levels > ONE_HOT_MAX_LEVELS
        else one_hot_encode_columns
    )
    destination.append(column)

one_hot_encode_columns

['borough', 'splitzone', 'irrlotcode', 'proxcode']

In [9]:
import pluto_pipeline

# Label column; every other column of the training frame is passed as input.
target_column = "target__office"

# NOTE(review): the meaning of `k` and `false_ratio` is defined inside
# pluto_pipeline.PLOTUPipeline and is not visible from this notebook.
pipe = pluto_pipeline.PLOTUPipeline(
    categorical_features,
    numerical_features,
    one_hot_encode_columns,
    target_encode_columns,
    k=25,
    false_ratio=2,
)

# Discard the index loaded from the CSV in favour of a default 0..n-1 index.
train_data = train_data.reset_index(drop=True)

columns = [name for name in train_data.columns if name != target_column]
pipe.fit(train_data[columns], train_data[target_column])



In [10]:
model_file = os.path.join("../models", "PLUTO_Pipeline.pkl")

# Make sure the output directory exists; otherwise open() raises
# FileNotFoundError and the freshly fitted pipeline is never written.
os.makedirs(os.path.dirname(model_file), exist_ok=True)

# Pickle the pluto_pipeline module by value (its code is embedded in the
# pickle) rather than by reference, so loading the file does not require
# pluto_pipeline to be importable.
cloudpickle.register_pickle_by_value(pluto_pipeline)
with open(model_file, "wb") as model_fh:
    cloudpickle.dump(pipe, model_fh)