In [1]:
import pandas as pd
import numpy as np
import os
from dotenv import dotenv_values

In [2]:
# Load the notebook's settings (directory and file paths used below) from the local .env file.
config = dotenv_values(dotenv_path='.env')

In [3]:
# Collect the submission files that will vote in the consensus.
# os.listdir returns entries in arbitrary order and also lists sub-directories
# and hidden files (e.g. .ipynb_checkpoints, .DS_Store). Keep only regular,
# non-hidden files and sort them, so that `size` -- the denominator of the
# vote share computed later -- counts actual submissions and the file order is
# reproducible from run to run.
popular_submissions = sorted(
    entry
    for entry in os.listdir(config["DESPACITO"])
    if not entry.startswith(".")
    and os.path.isfile(os.path.join(config["DESPACITO"], entry))
)
size = len(popular_submissions)
size

19

# Experimenting with the Algorithm

In [4]:
# Start the consensus table from the sample file, keeping every column except
# "prediction" (the votes are computed from the submissions instead).
consensus_df = pd.read_csv(config["SAMPLE_PATH"]).drop(columns="prediction")
consensus_df.head()

Unnamed: 0,customer_ID
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...


In [5]:
# Number of rows in the consensus table.
consensus_df.shape[0]

924621

In [None]:
# Load the first listed submission to prototype the voting logic on.
first_submission_path = config["DESPACITO"] + popular_submissions[0]
sub = pd.read_csv(first_submission_path)
sub.head()

In [None]:
# Split the prediction score at 0.5 into two boolean vote columns:
# strictly above 0.5 is a "default" vote, 0.5 or below is a "no default" vote.
sub['default'] = sub['prediction'].gt(0.5)
sub['no_default'] = sub['prediction'].le(0.5)
sub.head()

In [None]:
# Prototype of the vote counting for a single submission.
# Work on a scratch copy: running this cell directly on consensus_df would
# leave an extra 'no_default_count' column on it, and that column would then
# be written into the CSV files saved further down the notebook.
experiment_df = consensus_df.copy()
experiment_df['default_count'] = 0
experiment_df['no_default_count'] = 0
# Adding a boolean Series to an integer column counts True as 1.
experiment_df['default_count'] += sub['default']
experiment_df['no_default_count'] += ~sub['default']
experiment_df.head()

In [None]:
# Element-wise negation of the boolean "default" votes.
np.logical_not(sub['default'])

# Actual Algorithm

In [6]:
# Count, for every customer, how many submissions predict "default"
# (prediction strictly above 0.5).
consensus_df['default_count'] = 0
for submission_name in popular_submissions:
    submission = pd.read_csv(config["DESPACITO"] + submission_name)
    # Align votes on customer_ID instead of on row position: a submission whose
    # rows are ordered differently from the sample file would otherwise be
    # credited to the wrong customers without any error.
    # Assumes every submission carries a 'customer_ID' column like the sample
    # file does -- a KeyError here means that assumption does not hold.
    votes = submission.set_index("customer_ID")["prediction"].gt(0.5)
    aligned = votes.reindex(consensus_df["customer_ID"])
    # reindex leaves NaN for customers the submission does not cover; fail
    # loudly rather than letting NaN propagate into the counts.
    if aligned.isna().any():
        raise ValueError(
            f"{submission_name} is missing predictions for some customer_IDs"
        )
    # `aligned` is indexed by customer_ID, so add it positionally as an array.
    consensus_df['default_count'] += aligned.astype(int).to_numpy()
consensus_df['default_count']

0          2
1          0
2          2
3          4
4         18
          ..
924616     0
924617    18
924618     6
924619     5
924620     4
Name: default_count, Length: 924621, dtype: int64

In [7]:
# Share of submissions that voted "default" for each customer.
consensus_df['default_pct'] = consensus_df['default_count'].div(size)
consensus_df.head()

Unnamed: 0,customer_ID,default_count,default_pct
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2,0.105263
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0,0.0
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,2,0.105263
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,4,0.210526
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,18,0.947368


## Save the Consensus DataFrame for future reference

In [8]:
# Persist the per-customer vote counts and vote share (without the row index).
consensus_path = config["ENGINEERED_DATA"] + "consensus.csv"
consensus_df.to_csv(consensus_path, index=False)

# Turn Consensus Into Training Labels

In [9]:
# Vote share a class needs before its label is trusted.
THRESHOLD = 0.9
default_share = consensus_df["default_pct"]
# 1 where the "default" vote share is strictly above THRESHOLD, otherwise 0.
default_consensus = np.where(default_share > THRESHOLD, 1, 0)
# 0 where the "default" vote share is strictly below 1 - THRESHOLD, otherwise 1.
no_default_consensus = np.where(default_share < (1 - THRESHOLD), 0, 1)

In [10]:
# Inspect the array that is 1 where default_pct exceeds THRESHOLD and 0 elsewhere.
default_consensus

array([0, 0, 0, ..., 0, 0, 0])

In [11]:
# Inspect the array that is 0 where default_pct is below 1 - THRESHOLD and 1 elsewhere.
no_default_consensus

array([1, 0, 1, ..., 1, 1, 1])

In [12]:
# Keep a label only where the two consensus arrays agree (both 1 or both 0);
# every other row gets the -127 sentinel, meaning "no confident consensus".
consensus_agrees = default_consensus == no_default_consensus
labels = np.where(consensus_agrees, default_consensus, -127)
labels

array([-127,    0, -127, ..., -127, -127, -127])

In [13]:
# Number of rows labelled 1.
int(np.sum(labels == 1))

134356

In [14]:
# Number of rows labelled 0.
int(np.sum(labels == 0))

462715

In [15]:
# Number of rows carrying the -127 sentinel.
int(np.sum(labels == -127))

327550

In [16]:
# Attach the consensus labels and remove the intermediate vote columns.
consensus_df["target"] = labels
# drop(..., errors="ignore") instead of `del`: the cell can be re-run without
# a KeyError once the intermediate columns have already been removed.
consensus_df = consensus_df.drop(
    columns=['default_count', 'default_pct'], errors="ignore"
)
consensus_df

Unnamed: 0,customer_ID,target
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,-127
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,-127
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,-127
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,1
...,...,...
924616,ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...,0
924617,ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...,1
924618,ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...,-127
924619,ffffddef1fc3643ea179c93245b68dca0f36941cd83977...,-127


In [17]:
# Write the labelled table (including the -127 sentinel rows) without the row index.
labels_path = config["ENGINEERED_DATA"] + "test_labels.csv"
consensus_df.to_csv(labels_path, index=False)