# Importing Necessary Libraries:

In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Use the fully-qualified option key. The abbreviated 'max_columns' pattern is
# ambiguous in recent pandas (it also matches 'styler.render.max_columns') and
# raises OptionError there.
pd.set_option('display.max_columns', 100)
# Dark seaborn theme for every figure in the notebook.
sns.set_theme(style='dark')

In [None]:
# Directory holding the competition CSVs. The trailing slash is required because
# the file names below are appended by plain string concatenation; without it the
# paths collapse into a single bogus file name.
# NOTE(review): directory name inferred from the previous value
# ('playground-series-s3e4.csv') -- confirm it matches the local data layout.
BASE_DIR = 'playground-series-s3e4/'
train = pd.read_csv(BASE_DIR + 'train.csv')
test = pd.read_csv(BASE_DIR + 'test.csv')
sub = pd.read_csv(BASE_DIR + 'sample_submission.csv')

# Tag every row with its origin so the combined frame can be split again later.
train['Source'] = 'Train'
test['Source'] = 'Test'

# drop=True discards the per-file row numbers. Without it they are kept as an
# extra 'index' column, which would travel along with the real columns and end up
# among the model features (later cells only drop Source/time_category/Class).
all_ = pd.concat([train, test], axis=0).reset_index(drop=True)

# Transactions between Train and Test

#### By converting the `Time` feature into Day and Hour, we find that the train and test sets together comprise the transactions of 2 days.

##### The train set has all Day 1 transactions, and some of Day 2,
##### while the test set has only Day 2 transactions.

In [None]:
# Length of one day in seconds; `Time` is compared against it as a second offset.
seconds_per_day = 24 * 3600

elapsed = all_["Time"]

# Day 1 while the offset is strictly below one day, Day 2 otherwise.
all_["Day"] = elapsed.lt(seconds_per_day).map({True: 1, False: 2})

# 1-based hour within the day: seconds into the day -> whole hours -> +1.
all_["Hour"] = elapsed.mod(seconds_per_day).floordiv(3600).add(1)

In [None]:
# Transactions per day, with one bar per data source (Train / Test).
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(
    data=all_,
    x='Day',
    hue='Source',
    palette=['#86A3B8', "#F55050"],
    ax=ax,
)
ax.set_ylabel('Transactions Count')
ax.set_title('Transaction per Day');

#### If we look further into the Hour distribution of transactions among both datasets, we find that
##### 1. Test set is comprised of all Day 2 transactions starting from Hour 10 (Possibly 10.5)
##### 2. Train set is comprised of all Day 1 transactions, and Day 2 transactions until Hour 10 (Again possibly 10.5)

In [None]:
# Train (left) and Test (right) share the y-axis so bar heights are comparable.
fig, axes = plt.subplots(1, 2, figsize=(25, 5), sharey=True)

# One entry per panel: (Source value, bar colours, title, y-axis label).
panels = [
    ('Train', ['#86A3B8', "#F55050"], 'Transactions per Hour in Train', 'Transactions Count'),
    ('Test', ["#F55050"], 'Transactions per Hour Test', ''),
]

for ax, (source, palette, title, ylabel) in zip(axes, panels):
    # Plot integer hours so the tick labels are taken from the data itself.
    # Overwriting them with a hard-coded range silently mislabels the bars (or
    # raises on a length mismatch) whenever the hours present differ from it.
    hourly = (
        all_.loc[all_['Source'] == source]
        .dropna(subset=['Hour'])  # astype(int) below cannot represent NaN
        .astype({'Hour': int})
    )
    sns.countplot(data=hourly, palette=palette, x='Hour', hue='Day', ax=ax)
    ax.set_title(title)
    ax.set_ylabel(ylabel)

#### Now we know that the Test set comprises the last 14 hours of Day 2, and we have a similar subset in the Train set, which is the last 14 hours of Day 1.

##### Let's categorize the transactions based on this and check the distribution of fraudulent transactions over the Train set.


In [None]:
# Split each day at hour 10: suffix _0 for hours 1-10, suffix _1 for later hours.
first_part = all_['Hour'] <= 10
second_part = all_['Hour'] > 10

for day in (1, 2):
    on_day = all_['Day'] == day
    all_.loc[on_day & first_part, 'time_category'] = f'Day{day}_0'
    all_.loc[on_day & second_part, 'time_category'] = f'Day{day}_1'

In [None]:
# Sum of `Class` per time period, restricted to Train rows (Test has no labels).
train_rows = all_.query('Source == "Train"')
fraud_per_period = train_rows.groupby('time_category').Class.sum()

fraud_per_period.plot(
    kind='barh',
    color='#F55050',
    title='Fraudulent Transactions Over Day Periods',
    xlabel='Count',
    ylabel='Day Period',
    figsize=(10, 6),
);

#### Out of 372 fraud transactions on Day1, 310 appear on the second partition. 

##### So should we still cross validate over the **ENTIRE** Train set?
##### I don't think so, because take a look at the distribution of fraudulent transactions over the Train set.

Since the bulk of the fraudulent transactions happen in Day1_1, the cross-validation should be performed only on Day1_1.

# Comparing CV and LB

### Let's train 2 models (LogisticRegression) using this approach.

##### In the first model, I'll use StratifiedKFold on Day1_1, and I'll add Day1_0 and Day2_0 to the training part of all folds. In the second, I'll do the same while dropping some features. Then I'll blend their results and compare CV with LB.

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm

In [None]:
# Features left out of the second ("reduced") model.
drop_feats = ['V1', 'V3', 'V7', 'V10', 'V16', 'V21', 'V22', 'V23', 'V25', 'V28']

# Bookkeeping / target columns that are removed before modelling.
non_feature_cols = ['Source', 'time_category', 'Class']

# Query each subset once and derive X and y from the same rows.
train_day1_1 = all_.query('Source == "Train" and time_category == "Day1_1"')
train_rest = all_.query('Source == "Train" and time_category in ["Day1_0", "Day2_0"]')

# Day1_1 is the only partition that is cross-validated.
X_day1_1 = train_day1_1.drop(non_feature_cols, axis=1).reset_index(drop=True)
y_day1_1 = train_day1_1['Class'].reset_index(drop=True)

# Day1_0 and Day2_0 are appended to the training part of every fold and are
# never used for validation.
X_rest = train_rest.drop(non_feature_cols, axis=1).reset_index(drop=True)
y_rest = train_rest['Class'].reset_index(drop=True)

X_test = all_.query('Source == "Test"').drop(non_feature_cols, axis=1).reset_index(drop=True)

# Reduced-feature views that are identical in every fold, so build them once.
X_rest_2 = X_rest.drop(drop_feats, axis=1)
X_test_2 = X_test.drop(drop_feats, axis=1)

n_splits = 10

skf = StratifiedKFold(n_splits=n_splits)

# Out-of-fold predictions on Day1_1 (model 1 = all features, model 2 = reduced).
oof_preds1 = np.zeros(len(y_day1_1), dtype=np.float64)
oof_preds2 = np.zeros(len(y_day1_1), dtype=np.float64)

# Test predictions averaged over the folds; sized from the frame that is
# actually predicted on.
oof_test1 = np.zeros(len(X_test), dtype=np.float64)
oof_test2 = np.zeros(len(X_test), dtype=np.float64)

scores1 = []
scores2 = []

# total follows n_splits so the progress bar stays right if the fold count changes.
for i, (train_idx, val_idx) in enumerate(tqdm(skf.split(X_day1_1, y_day1_1), total=n_splits)):
    # Split the data (the splitter yields positional indices, hence iloc)
    X_train, X_val = X_day1_1.iloc[train_idx], X_day1_1.iloc[val_idx]
    y_train, y_val = y_day1_1.iloc[train_idx], y_day1_1.iloc[val_idx]

    # Create the reduced-feature datasets
    X_train_2 = X_train.drop(drop_feats, axis=1)
    X_val_2 = X_val.drop(drop_feats, axis=1)

    # Fit the robust scalers on the Day1_1 training part only, i.e. before the
    # extra Day1_0 / Day2_0 rows are appended
    scaler1 = RobustScaler().fit(X_train)
    scaler2 = RobustScaler().fit(X_train_2)

    # Append Day1_0 and Day2_0
    X_train = pd.concat([X_train, X_rest], axis=0)
    X_train_2 = pd.concat([X_train_2, X_rest_2], axis=0)
    y_train = pd.concat([y_train, y_rest], axis=0)

    # Scale the datasets
    X_train = scaler1.transform(X_train)
    X_val = scaler1.transform(X_val)

    X_train_2 = scaler2.transform(X_train_2)
    X_val_2 = scaler2.transform(X_val_2)

    # Instantiate and fit model 1
    model1 = LogisticRegression(max_iter=1000, random_state=42)
    model1.fit(X_train, y_train)

    # Validate model 1
    val_preds1 = model1.predict_proba(X_val)[:, 1]
    score1 = roc_auc_score(y_val, val_preds1)
    scores1.append(score1)

    # Instantiate and fit model 2
    model2 = LogisticRegression(max_iter=1000, random_state=42)
    model2.fit(X_train_2, y_train)

    # Validate model 2
    val_preds2 = model2.predict_proba(X_val_2)[:, 1]
    score2 = roc_auc_score(y_val, val_preds2)
    scores2.append(score2)

    # OOF preds
    oof_preds1[val_idx] += val_preds1
    oof_preds2[val_idx] += val_preds2

    # Test preds: scale with this fold's scalers, then add this fold's share of
    # the average
    X_test1 = scaler1.transform(X_test)
    X_test2 = scaler2.transform(X_test_2)

    oof_test1 += model1.predict_proba(X_test1)[:, 1] / n_splits
    oof_test2 += model2.predict_proba(X_test2)[:, 1] / n_splits

    print(f'Fold {i} Score with features:', score1)
    print(f'Fold {i} Score without features:', score2)
    print()

print('OOF Score with features:', roc_auc_score(y_day1_1, oof_preds1))
print('OOF Score without features:', roc_auc_score(y_day1_1, oof_preds2))
print('OOF Score blend:', roc_auc_score(y_day1_1, (oof_preds1+oof_preds2)/2))

#### So dropping these features reduced the performance of the model. I now wonder if dropping them would result in the same over the LB, and that would also help me to understand whether this validation scheme is coherent with the LB.

In [None]:
# One submission file per prediction set: full-feature model, reduced-feature
# model, and the equal-weight blend of the two. Written in this order, so `sub`
# holds the blend afterwards.
submissions = {
    'submission1.csv': oof_test1,
    'submission2.csv': oof_test2,
    'submission3.csv': (oof_test1 + oof_test2) / 2,
}

for filename, preds in submissions.items():
    sub['Class'] = preds
    sub.to_csv(filename, index=False)

### Comparing OOF with LB results

In [None]:
# Scores copied by hand from the runs above (OOF) and from the leaderboard (LB).
# NOTE(review): assumed to be listed in submission order 1..3 -- confirm.
submission_names = ['submission1', 'submission2', 'submission3']
oof = [0.783, 0.746, 0.779]
lb = [0.814, 0.778, 0.809]

# Label both series and the axes so the figure is readable on its own.
fig, ax = plt.subplots()
ax.plot(submission_names, oof, color='b', marker='o', label='OOF')
ax.plot(submission_names, lb, color='r', marker='o', label='LB')
ax.set_xlabel('Submission')
ax.set_ylabel('Score')
ax.set_title('OOF vs. LB score per submission')
ax.legend();

### This CV is perfectly correlated with the LB: the three submissions rank in the same order under both.