In [1]:
# ! export AWS_PROFILE='sedatalakestagingreadwrite'

In [24]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from pathlib import Path
import json

from sklearn.base import clone
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay
from sklearn.calibration import calibration_curve, CalibrationDisplay
from sklearn.model_selection import train_test_split

# import altair as alt
import mlflow

from sklearn.metrics import  make_scorer
from lightgbm import LGBMRanker, LGBMClassifier #, __version__
from lightgbm import plot_importance


from sklearn.metrics import make_scorer, roc_auc_score, precision_score, recall_score
average_precision_scorer = make_scorer(average_precision_score, average='macro')  #, average = 'weighted')
roc_auc_scorer = make_scorer(roc_auc_score, average='macro') #average = 'weighted')

# from mlflow_utils import train_log_model

idx = pd.IndexSlice
import pylab as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Data

In [25]:
files = Path('./data/candidates_georef_v4').glob('*.parquet')

In [26]:
data = pd.concat((pd.read_parquet(str(f)) for f in files))

In [27]:
data.columns

Index(['sale_id', 'trxn_id', 'trxn_price', 'listing_price', 'correct_sale_id',
       'lottype_cat', 'unittype_cat', 'spatial', 'days', 'price_diff',
       'price_pct_diff', 'partial_street_score', 'partial_unit_score', 'total',
       'same_zipcode', 'listing_unit', 'closing_unit', 'listing_street',
       'closing_street', 'trxn_zip', 'trxn_city', 'trxn_state', 'listing_zip',
       'listing_city', 'area_id', 'x', 'y', 'distance', 'azimuth',
       'building_in_10', 'same_complex', 'same_building_id', 'same_block',
       'same_bin', 'same_building_bbl', 'same_bbl', 'same_unittype',
       'same_unit', 'candidates', 'trxn_candidates', 'status'],
      dtype='object')

In [28]:
mask = data[['trxn_id','sale_id']].isnull().any(1)
data = data[~mask]

  mask = data[['trxn_id','sale_id']].isnull().any(1)


In [29]:
print(f'Total pairs: {len(data):,d}, matched: {data["correct_sale_id"].nunique():,d} ({data["correct_sale_id"].nunique()/len(data):.2%})')

Total pairs: 1,075,311, matched: 3,645 (0.34%)


In [30]:
data['match'] = (data['sale_id'] == data['correct_sale_id']).astype(int)

In [31]:
data.isnull().mean().sort_values()

sale_id                 0.000000
listing_city            0.000000
area_id                 0.000000
x                       0.000000
y                       0.000000
distance                0.000000
azimuth                 0.000000
building_in_10          0.000000
same_complex            0.000000
same_building_id        0.000000
same_block              0.000000
same_bin                0.000000
same_building_bbl       0.000000
same_bbl                0.000000
same_unittype           0.000000
same_unit               0.000000
candidates              0.000000
trxn_candidates         0.000000
listing_zip             0.000000
trxn_state              0.000000
trxn_city               0.000000
trxn_zip                0.000000
trxn_id                 0.000000
trxn_price              0.000000
listing_price           0.000000
lottype_cat             0.000000
spatial                 0.000000
days                    0.000000
price_pct_diff          0.000000
price_diff              0.000000
partial_un

In [32]:
data.to_parquet('./data/train/data.parquet')

## Features

In [33]:
features = ['spatial', 'days', 'price_pct_diff', 
            'partial_unit_score',
            "partial_street_score",
            'same_unit',
            'same_building_id', 
            'same_bin',
            'same_complex',
            "candidates",
            "same_zipcode",
            "building_in_10",
#             "closing_candidates", 
            'same_unittype', 
#             'same_rd_normalized_unit',
#             'unittype_cat',
            'x', 'y', 'distance', 'azimuth'
            ]

ycol = 'match'

In [34]:

X = data[features]
y = data[ycol]

bool_cols = ['same_unit', 'same_building_id', 'same_bin', 'same_complex', 'same_zipcode', 'building_in_10', 'same_unittype']
X[bool_cols] = X[bool_cols].astype(bool)  #.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[bool_cols] = X[bool_cols].astype(bool)  #.astype(int)


In [35]:
X.to_parquet('./data/train/X.parquet')
y.to_frame().to_parquet('./data/train/y.parquet')

## Validation Set

In [36]:
Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.1, random_state=2024, stratify=y)

In [37]:
f"Training set size {len(ytrain):,d}, matched: {ytrain.mean():.2%}"

'Training set size 967,779, matched: 0.32%'

In [38]:
f"Validation set size {len(yval):,d}, matched: {yval.mean():.2%}"

'Validation set size 107,532, matched: 0.32%'

In [39]:
Xtrain.to_parquet('./data/train/X_train.parquet')
ytrain.to_frame().to_parquet('./data/train/y_train.parquet')

In [40]:
Xval.to_parquet('./data/train/Xval.parquet')
yval.to_frame().to_parquet('./data/train/yval.parquet')

## Dealing with Imbalance

In [41]:
from imblearn.over_sampling import RandomOverSampler
# from imblearn.over_sampling import SMOTE, ADASYN

In [42]:
ros = RandomOverSampler(random_state=2024, sampling_strategy=0.1)
X_resampled_10, y_resampled_10 = ros.fit_resample(Xtrain, ytrain)

In [43]:
X_resampled_10.to_parquet('./data/train/X_resampled_10.parquet')
y_resampled_10.to_frame().to_parquet('./data/train/y_resampled_10.parquet')

## Splitting with Group ID (trxn id) in mind

In [44]:
from sklearn.model_selection import GroupShuffleSplit 

In [45]:
splitter = GroupShuffleSplit(test_size=.1, n_splits=2, random_state = 2024)

In [46]:
# X

In [47]:
split = splitter.split(data, groups=data['trxn_id'])
train_inds, test_inds = next(split)

In [48]:
X2train = X.iloc[train_inds]
X2val = X.iloc[test_inds]

y2train = y.iloc[train_inds]
y2val = y.iloc[test_inds]

In [49]:
X2train.to_parquet('./data/train/X_query_train.parquet')
X2val.to_parquet('./data/train/X_query_val.parquet')

y2train.to_frame().to_parquet('./data/train/y_query_train.parquet')
y2val.to_frame().to_parquet('./data/train/y_query_val.parquet')

In [50]:
train_groups = list(data.iloc[train_inds].groupby('trxn_id')['trxn_id'].count())
val_groups = list(data.iloc[test_inds].groupby('trxn_id')['trxn_id'].count())

In [51]:
trxn_ids_train = data.iloc[train_inds]['trxn_id']
trxn_ids_train.to_frame().to_parquet('./data/train/trxn_id_train.parquet')

trxn_ids_val = data.iloc[test_inds]['trxn_id']
trxn_ids_val.to_frame().to_parquet('./data/train/trxn_id_val.parquet')

In [52]:
with open('./data/train/groups.json', 'w') as f:
    json.dump(
        {
            'train': train_groups,
            'val': val_groups
        }
    , f)