### In this notebook, I am doing the following:

In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing


from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from matplotlib.colors import ListedColormap
from scipy.interpolate import interp1d

from fairness_measure import discrimination, all_fairness, statistical_parity, temporal_bias
from modeling import datasets, get_results, weight_estimation, reweighing
from pre_process import pre_process_raw_data, preparing_dataframe
import modeling

%config Completer.use_jedi = False

import warnings


# Silence every warning category for the rest of the session.
# NOTE(review): this also suppresses deprecation warnings raised by later cells.
warnings.simplefilter('ignore')

# Default size (width, height in inches) for matplotlib figures in this notebook.
plt.rcParams['figure.figsize'] = (8, 5)
# plt.style.use('fivethirtyeight')

## Dataset Reader

In [2]:
# Dataset key passed to `datasets(...)` and `modeling.ours_cumulative(...)`;
# the Temporal Bias section branches on 'jigsaw', 'adult' and 'funding'.
name = 'jigsaw'

In [3]:
# Load the dataset selected by `name` through the `datasets` helper imported from `modeling`.
df = datasets(name)

In [4]:
df.shape

(60287, 103)

## Baselines
- (0) Train once + test on entire future (sequentially) (average) (no fairness)
- (1) Train once + test on entire future (sequentially) (average)
- (2) Train on entire past + test on immediate future (+ average)

- Ours (2) + forecasting 

#### (0) Train once + test on entire future (sequentially) (average) (no fairness)

In [5]:
# Run baseline 0 (per the heading: train once, test on each later batch, no fairness).
# `baseline_0_results` holds the summary metrics displayed in the next cell;
# `full_results_0` is indexed by metric name later, in the Temporal Bias section.
baseline_0_results, full_results_0, batches_0 = modeling.baseline_0(df)

Train on 5
Test on 6
Test on 7
Test on 8
Test on 9
Test on 10
Test on 11
Test on 12
Test on 13
Test on 14
Test on 15
Test on 16
Test on 17
Test on 18
Test on 19
Test on 20
Test on 21
Test on 22
Test on 23


In [6]:
baseline_0_results

auc    0.749172
S.P    0.121124
TPR    0.102319
FPR    0.114711
GAP    0.108515
dtype: float64

In [7]:
# Convert the summary metrics to a plain array for the comparison table.
# np.asarray (instead of `.to_numpy()`) keeps this cell safe to re-run: it is a
# no-op when the value is already an ndarray, whereas an ndarray has no `.to_numpy`.
baseline_0_results = np.asarray(baseline_0_results)

#### (1) Train once + test on entire future (sequentially) (average)

In [8]:
# Run baseline 1 (per the heading: train once, test on each later batch).
# `baseline_1_results` holds the summary metrics displayed in the next cell;
# `full_results_1` is indexed by metric name later, in the Temporal Bias section.
baseline_1_results, full_results_1, batches_1 = modeling.baseline_1(df)

Train on 5
Test on 6
Test on 7
Test on 8
Test on 9
Test on 10
Test on 11
Test on 12
Test on 13
Test on 14
Test on 15
Test on 16
Test on 17
Test on 18
Test on 19
Test on 20
Test on 21
Test on 22
Test on 23


In [9]:
baseline_1_results

auc    0.749469
S.P    0.095773
TPR    0.071166
FPR    0.090419
GAP    0.080792
dtype: float64

In [10]:
# Convert the summary metrics to a plain array for the comparison table.
# np.asarray (instead of `.to_numpy()`) keeps this cell safe to re-run: it is a
# no-op when the value is already an ndarray, whereas an ndarray has no `.to_numpy`.
baseline_1_results = np.asarray(baseline_1_results)

#### (2) Train on entire past + test on immediate future (+ average)

In [11]:
# Run the "train on entire past, test on immediate future" baseline described in the heading.
# NOTE(review): the results are stored under the name baseline_2_* but the call is to
# `modeling.baseline_3` — confirm this is the intended function.
# `full_results_2` is indexed by metric name later, in the Temporal Bias section.
baseline_2_results, full_results_2, batches_2 = modeling.baseline_3(df)

[5]
6
[5, 6]
7
[5, 6, 7]
8
[5, 6, 7, 8]
9
[5, 6, 7, 8, 9]
10
[5, 6, 7, 8, 9, 10]
11
[5, 6, 7, 8, 9, 10, 11]
12
[5, 6, 7, 8, 9, 10, 11, 12]
13
[5, 6, 7, 8, 9, 10, 11, 12, 13]
14
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
15
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
16
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
17
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
18
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
19
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
20
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
21
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
22
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
23


In [12]:
baseline_2_results

auc    0.775958
S.P    0.043486
TPR    0.043656
FPR    0.038317
GAP    0.040986
dtype: float64

In [13]:
# Convert the summary metrics to a plain array for the comparison table.
# np.asarray (instead of `.to_numpy()`) keeps this cell safe to re-run: it is a
# no-op when the value is already an ndarray, whereas an ndarray has no `.to_numpy`.
baseline_2_results = np.asarray(baseline_2_results)

## Results

### Baselines

In [14]:
# Column labels for the summary metrics, in the order each baseline reports them.
columns = ['AUC', 'S.P', 'TPR', 'FPR', 'GAP']

# One row per baseline (0, 1, 2), stacked into a single 2-D array for display.
baseline_rows = [baseline_0_results, baseline_1_results, baseline_2_results]
pd.DataFrame(np.array(baseline_rows), columns=columns)

Unnamed: 0,AUC,S.P,TPR,FPR,GAP
0,0.749172,0.121124,0.102319,0.114711,0.108515
1,0.749469,0.095773,0.071166,0.090419,0.080792
2,0.775958,0.043486,0.043656,0.038317,0.040986


### Ours 4 (anticipated)

### Search Space 

In [15]:
# Search grid for alpha: 11 evenly spaced values covering [0, 1] inclusive.
alphas = np.linspace(start=0, stop=1, num=11)

In [16]:
# Round the grid values to two decimal places.
alphas = alphas.round(decimals=2)

In [17]:
print(alphas)

[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]


In [18]:
# One accumulator per summary metric; each receives one value per alpha.
aucs, sps, tprs, fprs, eods = [], [], [], [], []
# Detailed results returned by each run, in the same order as `alphas`.
full_results = []

for alpha in alphas:
    print(alpha)
    results, full_result, _ = modeling.ours_cumulative(name, df, alpha)
    # Positions 0..4 of `results` go to aucs, sps, tprs, fprs, eods respectively.
    for position, bucket in enumerate((aucs, sps, tprs, fprs, eods)):
        bucket.append(results[position])
    full_results.append(full_result)

0.0
[5]
Use estimation from 6
6
[5, 6]
Use estimation from 7
7
[5, 6, 7]
Use estimation from 8
8
[5, 6, 7, 8]
Use estimation from 9
9
[5, 6, 7, 8, 9]
Use estimation from 10
10
[5, 6, 7, 8, 9, 10]
Use estimation from 11
11
[5, 6, 7, 8, 9, 10, 11]
Use estimation from 12
12
[5, 6, 7, 8, 9, 10, 11, 12]
Use estimation from 13
13
[5, 6, 7, 8, 9, 10, 11, 12, 13]
Use estimation from 14
14
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Use estimation from 15
15
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Use estimation from 16
16
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
Use estimation from 17
17
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
Use estimation from 18
18
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
Use estimation from 19
19
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
Use estimation from 20
20
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
Use estimation from 21
21
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
Use estimation from 22
22
[5, 6, 7

In [19]:
pd.DataFrame({'AUC': aucs, 'S.P' : sps, 'TPR': tprs, 'FPR': fprs, 'EoD' : eods})

Unnamed: 0,AUC,S.P,TPR,FPR,EoD
0,0.775049,0.026669,0.054424,0.023954,0.039189
1,0.775157,0.028161,0.050744,0.025451,0.038097
2,0.775247,0.029571,0.048258,0.026885,0.037571
3,0.775314,0.030507,0.045435,0.02795,0.036693
4,0.775436,0.032328,0.044143,0.029654,0.036898
5,0.775536,0.034445,0.041292,0.031855,0.036573
6,0.775625,0.035507,0.041893,0.032239,0.037066
7,0.775706,0.03717,0.041687,0.033556,0.037621
8,0.775782,0.039199,0.040493,0.03457,0.037531
9,0.775882,0.041721,0.042291,0.036653,0.039472


## Temporal Bias

In [28]:
# Position in `full_results` (one entry per value of `alphas`) of the run reported
# as "ours" for each supported dataset.
OURS_RESULT_INDEX = {'jigsaw': 0, 'adult': 9, 'funding': 9}

if name not in OURS_RESULT_INDEX:
    # Fail here with a clear message rather than leaving `d` undefined (or stale
    # from an earlier run) and hitting a NameError / wrong row further down.
    raise ValueError(
        'Unsupported dataset name {!r}; expected one of {}'.format(
            name, sorted(OURS_RESULT_INDEX)
        )
    )
d = OURS_RESULT_INDEX[name]

# Metric whose per-batch values are passed to `temporal_bias` below.
# NOTE: this rebinds `metrics`, shadowing the `from sklearn import metrics` import
# at the top of the notebook; the sklearn module is no longer reachable by that name.
metrics = 'S.P'
def calculate_temporal_Bais():
    """Tabulate temporal-bias measures for the three baselines and our method.

    Reads the notebook-level globals `metrics` (the metric to analyse), `d`
    (which entry of `full_results` to use), `full_results_0`, `full_results_1`,
    `full_results_2` and `full_results`, and applies `temporal_bias` to each
    selected series.

    Returns:
        pandas.DataFrame: the 'MAX-MIN', 'TS' and 'MB' columns, one row per
        method in the order baseline 0, baseline 1, baseline 2, ours.
    """
    per_method = [
        temporal_bias(full_results_0[metrics]),
        temporal_bias(full_results_1[metrics]),
        temporal_bias(full_results_2[metrics]),
        temporal_bias(full_results[d][metrics]),
    ]

    # np.vstack is the supported spelling; np.row_stack is a deprecated alias of it.
    temporal_bias_results = np.vstack(per_method)
    results_table = pd.DataFrame(
        temporal_bias_results,
        columns=['ROC', 'RMSB', 'SD', 'MAX-MIN', 'TS', 'MB', 'MABD', 'AADM', 'MEAN_SUM', 'CUMSUM-Plain'],
    )

    return results_table[['MAX-MIN', 'TS', 'MB']]
    

In [29]:
calculate_temporal_Bais()

Unnamed: 0,MAX-MIN,TS,MB
0,0.179804,0.035328,0.109044
1,0.155096,0.037006,0.106812
2,0.08849,0.029884,0.074003
3,0.072922,0.018008,0.063185


### Load saved funding results

In [30]:
# Load the saved funding result tables: the three baselines first, then our
# method (read from full_results_9.csv), in that order.
funding_baseline_0, funding_baseline_1, funding_baseline_2, our = (
    pd.read_csv(csv_path)
    for csv_path in (
        './results/funding_baseline0.csv',
        './results/funding_baseline1.csv',
        './results/funding_baseline2.csv',
        './results/full_results_9.csv',
    )
)


In [31]:
# Column of the saved result tables to analyse.
# NOTE: rebinding `metrics` shadows the `from sklearn import metrics` import at the
# top of the notebook.
metrics = 'S.P'

# NOTE(review): the first two rows are dropped for baselines 0 and 1 only —
# presumably to align them with the rows present for baseline 2 and ours; confirm.
baseline_0 = temporal_bias(funding_baseline_0[metrics][2:])
baseline_1 = temporal_bias(funding_baseline_1[metrics][2:])
baseline_2 = temporal_bias(funding_baseline_2[metrics])
ours = temporal_bias(our[metrics])

# One row per method: baseline 0, baseline 1, baseline 2, ours.
# np.vstack is the supported spelling; np.row_stack is a deprecated alias of it.
temporal_bias_results = np.vstack([baseline_0, baseline_1, baseline_2, ours])
results_table = pd.DataFrame(
    temporal_bias_results,
    columns=['ROC', 'RMSB', 'SD', 'MAX-MIN', 'TS', 'MB', 'MABD', 'AADM', 'MEAN_SUM', 'CUMSUM-Plain'],
)
results_table[['MAX-MIN', 'TS', 'MB']]

Unnamed: 0,MAX-MIN,TS,MB
0,0.430045,0.035725,0.148267
1,0.261224,0.034469,0.158445
2,0.349051,0.038842,0.218492
3,0.180944,0.029056,0.16352


In [None]:
# Draw the 'CUMSUM-Plain' entry of every row of `results_table` on one set of axes:
# rows 0-2 are the baselines, row 3 is our method (drawn in black).
cumsum_curves = results_table['CUMSUM-Plain']
curve_styles = [
    dict(label='0'),
    dict(label='1'),
    dict(label='2'),
    dict(label='Ours', color='black'),
]
for row_label, style in enumerate(curve_styles):
    plt.plot(cumsum_curves[row_label], **style)
plt.legend()
plt.ylabel('Cumsum')