In [35]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
# from lightgbm import LGBMClassifier
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier

# import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/PS-S3/Ep7/train.csv'
file_key_2 = 'Tabular-Playground-Series/PS-S3/Ep7/test.csv'
file_key_3 = 'Tabular-Playground-Series/PS-S3/Ep7/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

In [36]:
train = train.rename(columns = {'id': 'id_train'})
test = test.rename(columns = {'id': 'id_test'})

In [30]:
train.head()

Unnamed: 0,id_train,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,0,2,0,0,2,1,0,0,9,2018,1,14,1,1,11,0,67.5,0,0
1,1,2,0,1,2,0,0,0,117,2018,7,29,0,0,0,0,72.25,0,0
2,2,2,0,0,1,0,0,0,315,2018,12,2,0,0,0,0,52.0,0,0
3,3,1,0,0,2,1,0,0,32,2018,12,1,1,0,0,0,56.0,0,0
4,4,2,0,1,0,0,0,0,258,2018,10,16,0,0,0,0,100.0,0,1


In [13]:
catboost_sub = pd.read_csv('CatBoost_FS_Seed_42_CV_Score.csv')
catboost_sub.head()

Unnamed: 0,id,booking_status
0,42100,0.088827
1,42101,0.102746
2,42102,0.256219
3,42103,0.030596
4,42104,0.604456


In [14]:
train = train.drop(columns = 'id', axis = 1)
# test = test.drop(columns = 'id', axis = 1)

In [37]:
train.columns.tolist()

['id_train',
 'no_of_adults',
 'no_of_children',
 'no_of_weekend_nights',
 'no_of_week_nights',
 'type_of_meal_plan',
 'required_car_parking_space',
 'room_type_reserved',
 'lead_time',
 'arrival_year',
 'arrival_month',
 'arrival_date',
 'market_segment_type',
 'repeated_guest',
 'no_of_previous_cancellations',
 'no_of_previous_bookings_not_canceled',
 'avg_price_per_room',
 'no_of_special_requests',
 'booking_status']

In [38]:
train.columns.tolist()[0:17]

['id_train',
 'no_of_adults',
 'no_of_children',
 'no_of_weekend_nights',
 'no_of_week_nights',
 'type_of_meal_plan',
 'required_car_parking_space',
 'room_type_reserved',
 'lead_time',
 'arrival_year',
 'arrival_month',
 'arrival_date',
 'market_segment_type',
 'repeated_guest',
 'no_of_previous_cancellations',
 'no_of_previous_bookings_not_canceled',
 'avg_price_per_room']

In [15]:
duplicates = pd.merge(train, test, on = train.columns.tolist()[0:17])

In [16]:
duplicates

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,id
0,2,0,0,2,2,0,0,102,2017,10,16,0,0,0,0,109.00,1,0,57409
1,2,0,0,3,0,0,0,263,2018,10,21,0,0,0,0,110.00,0,0,51069
2,2,0,2,1,0,0,0,180,2018,8,28,1,0,0,0,96.30,2,1,65654
3,2,0,1,1,0,0,0,180,2018,5,2,1,0,0,0,100.00,1,1,53459
4,2,0,1,5,0,0,0,349,2018,10,4,0,0,0,0,96.67,0,1,45279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,2,0,0,2,0,0,0,86,2017,9,9,2,0,0,0,100.00,0,1,60396
712,2,0,1,1,0,0,0,180,2018,5,2,0,0,0,0,100.00,1,1,43885
713,2,0,2,3,0,0,0,212,2018,4,20,0,0,0,0,78.00,0,1,49594
714,1,0,0,2,0,0,0,163,2018,10,15,0,0,0,0,115.00,0,0,45238


In [17]:
duplicates = duplicates[['id', 'booking_status']]
duplicates.head()

Unnamed: 0,id,booking_status
0,57409,0
1,51069,0
2,65654,1
3,53459,1
4,45279,1


In [18]:
duplicates['flip_booking_statis'] = 1 - duplicates['booking_status']
duplicates.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates['flip_booking_statis'] = 1 - duplicates['booking_status']


Unnamed: 0,id,booking_status,flip_booking_statis
0,57409,0,1
1,51069,0,1
2,65654,1,0
3,53459,1,0
4,45279,1,0


In [19]:
catboost_sub_new = pd.merge(catboost_sub, duplicates, on = 'id', how = 'left')
catboost_sub_new.head()

Unnamed: 0,id,booking_status_x,booking_status_y,flip_booking_statis
0,42100,0.088827,,
1,42101,0.102746,,
2,42102,0.256219,,
3,42103,0.030596,,
4,42104,0.604456,,


In [20]:
catboost_sub_new['booking_status'] = np.where(np.isnan(catboost_sub_new['flip_booking_statis']), 
                                              catboost_sub_new['booking_status_x'], catboost_sub_new['flip_booking_statis'])
catboost_sub_new.head()

Unnamed: 0,id,booking_status_x,booking_status_y,flip_booking_statis,booking_status
0,42100,0.088827,,,0.088827
1,42101,0.102746,,,0.102746
2,42102,0.256219,,,0.256219
3,42103,0.030596,,,0.030596
4,42104,0.604456,,,0.604456


In [21]:
catboost_sub_new.drop(columns = ['booking_status_x', 'booking_status_y', 'flip_booking_statis'], axis = 1, inplace = True)
catboost_sub_new.head()

Unnamed: 0,id,booking_status
0,42100,0.088827
1,42101,0.102746
2,42102,0.256219
3,42103,0.030596
4,42104,0.604456


In [23]:
catboost_sub_new.to_csv('CatBoost_FS_Seed_42_CV_Score_leakage.csv', index = False)