In [41]:
import pandas as pd 
import os
import json
import numpy as np 

from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss, accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from category_encoders.ordinal import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from scipy.stats import pearsonr, spearmanr

In [2]:
df = pd.read_csv('./fraud_oracle.csv')

In [3]:
df

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15415,Nov,4,Friday,Toyota,Urban,Tuesday,Nov,5,Male,Married,...,6 years,31 to 35,No,No,External,none,no change,1 vehicle,1996,Collision
15416,Nov,5,Thursday,Pontiac,Urban,Friday,Dec,1,Male,Married,...,6 years,31 to 35,No,No,External,more than 5,no change,3 to 4,1996,Liability
15417,Nov,5,Thursday,Toyota,Rural,Friday,Dec,1,Male,Single,...,5 years,26 to 30,No,No,External,1 to 2,no change,1 vehicle,1996,Collision
15418,Dec,1,Monday,Toyota,Urban,Thursday,Dec,2,Female,Married,...,2 years,31 to 35,No,No,External,more than 5,no change,1 vehicle,1996,All Perils


In [4]:

age_groups = {
    '16-17': 'Young',
    '18-20': 'Young',
    '21-25': 'Young',
    '26-30': 'Young',
    '31-35': 'Middle-aged',
    '36-40': 'Middle-aged',
    '41-50': 'Middle-aged',
    '51-65': 'Senior',
    '65+': 'Senior'
}
df['AgeGroup'] = df['AgeOfPolicyHolder'].map(age_groups)
df.drop(columns=['AgeOfPolicyHolder'], inplace=True)


def categorize_range(value):
    if value == 'less than 20000':
        return 'low'
    elif value in ['20000 to 29000', '30000 to 39000']:
        return 'mid'
    elif value in ['40000 to 59000', '60000 to 69000', 'more than 69000']:
        return 'high'
    else:
        return 'mid'


df['VehiclePrice_Cat'] = df['VehiclePrice'].apply(categorize_range)


In [5]:

label_encoder = LabelEncoder()


col_ordering = [
    {'col':'PastNumberOfClaims','mapping':{'none':0 ,'1':1,'2 to 4':3,'more than 4':5 }},
    {'col':'NumberOfSuppliments','mapping':{'none':0,'1 to 2':1,'3 to 5':3,'more than 5':6}},
    {'col':'VehiclePrice','mapping':{'more than 69000':5,'20000 to 29000':1,'30000 to 39000':2,'less than 20000':0,
                                     '40000 to 59000':3,'60000 to 69000':4}},
    {'col':'AgeOfVehicle','mapping':{'new': 0, '2 years': 1, '3 years': 2, '4 years': 3, '5 years': 4, '6 years': 5, '7 years': 6, 'more than 7': 7}},
    {'col':'Year','mapping': {1994: 0, 1995: 1, 1996: 2}},
    {'col':'Days_Policy_Accident','mapping': {'none': 0, '1 to 7': 1,'8 to 15': 2,'15 to 30': 3, 'more than 30': 4}},
    {'col':'Days_Policy_Claim','mapping': {'none': 0, '1 to 7': 1,'8 to 15': 2,'15 to 30': 3, 'more than 30':4 }},
    {'col':'AddressChange_Claim','mapping': {'1 year': 1, 'no change': 0, '4 to 8 years': 4, '2 to 3 years': 2, 'under 6 months': 0.5}},
    {'col':'AgeGroup','mapping': {'Young': 0, 'Middle-aged': 1, 'Senior': 2}},
    {'col':'VehiclePrice_Cat','mapping': {'high': 2, 'mid': 1, 'low': 0}},
    {'col':'NumberOfCars','mapping': {'3 to 4': 3, '1 vehicle': 1, '2 vehicles': 2, '5 to 8': 7, 'more than 8': 9}},
]
ord_encoder = OrdinalEncoder(mapping = col_ordering, return_df=True)

df = ord_encoder.fit_transform(df)




for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = label_encoder.fit_transform(df[col])



  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [6]:
df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy,AgeGroup,VehiclePrice_Cat
0,2,5,6,6,1,6,5,1,0,2,...,0,0,0,0,1,3,0,2,-1.0,2
1,4,3,6,6,1,2,5,4,1,2,...,1,0,0,0,0,1,0,1,-1.0,2
2,10,5,0,6,1,5,10,2,1,1,...,0,0,0,0,0,1,0,1,-1.0,2
3,6,2,2,17,0,1,6,1,1,1,...,1,0,0,6,0,1,0,2,-1.0,1
4,4,5,1,6,1,6,4,2,0,2,...,0,0,0,0,0,1,0,1,-1.0,2


In [7]:
df.rename(columns = {'FraudFound_P':'Fraud'},inplace = True)


In [8]:
df.shape

(15420, 34)

In [None]:

def add_gaussian_noise(df, columns, noise_level=0.01):
    for col in columns:
        noise = np.random.normal(0, noise_level, size=df[col].shape)
        df[col] += noise
    return df

def split_random_(df):

    x_1, x_2 = train_test_split(df[df.Fraud == 1], test_size=0.4, random_state=3)
    x_3, x_4 = train_test_split(df[df.Fraud == 0 ], test_size = 0.6, random_state=33)
    data_1 = pd.concat([x_1,x_3])
    data_2 = pd.concat([x_2, x_4])
    return data_1, data_2

def prepare_data(df, X_test):
    scale = MinMaxScaler()
    smote = SMOTE(random_state=22)

 
    X_train, y_train = df.drop(columns="Fraud"), df.Fraud
    X_train_scale = scale.fit_transform(X_train)
    X_test_scale = scale.transform(X_test)


    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_train_poly = poly.fit_transform(X_train_scale)


    X_smote, y_smote = smote.fit_resample(X_train_poly, y_train)

   
    X_test_poly = poly.transform(X_test_scale)

    return X_smote, y_smote, X_test_poly

def fit_predict(X_train, X_test, y_train, y_test):
    model = LogisticRegression(max_iter=250)
    roc_auc_cv = cross_val_score(model, X_train, y_train, cv=5, scoring="roc_auc")
    print(f"Кросс-валидация ROC-AUC: {roc_auc_cv}")
    print(f"Среднее значение ROC-AUC: {np.mean(roc_auc_cv):.4f}")

    model.fit(X_train, y_train)


    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)


    roc_auc_test = roc_auc_score(y_test, y_pred_proba)
    logloss_test = log_loss(y_test, y_pred_proba)
    accuracy_test = accuracy_score(y_test, y_pred)
    f1_test = f1_score(y_test, y_pred)

    params = model.coef_

    return params, logloss_test, roc_auc_test, accuracy_test, f1_test

---

* Глава 1 
    - Non - iid

---

In [43]:
X_, X_test, y_, y_test = train_test_split(df.drop(columns="Fraud"), df.Fraud, test_size=0.2, random_state=231)

In [44]:
df_clients = pd.concat([X_, y_],axis=1)


In [45]:
df_1, df_2 = split_random_(df_clients)



In [33]:
(df_1.shape, df_2.shape)

((5080, 34), (7256, 34))

In [34]:
col_1 = df_1.drop(columns="Fraud").columns
col_2 = df_2.drop(columns="Fraud").columns

In [46]:
noise_df_2 = add_gaussian_noise(df_2,
                                col_1,
                                noise_level=1.4)
noise_df_1 = add_gaussian_noise(df_1,
                                col_2,
                                noise_level=1.8)

In [36]:
noise_df_1

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy,AgeGroup,VehiclePrice_Cat,Fraud
309,0.734349,4.048807,1.221545,6.882615,2.801271,3.933078,7.038809,0.905393,-0.324119,2.633846,...,-1.883266,2.247889,1.774919,-1.682892,1.774565,0.042548,1.744103,-0.708674,3.041924,1
11981,3.748855,-0.296029,1.104299,12.930857,1.072084,4.679851,9.039946,2.326195,1.833083,3.722608,...,0.288208,-1.425556,-0.440802,-0.300771,0.952763,0.303650,-2.072660,-1.437418,0.088634,1
10412,1.333618,2.878006,-1.438579,7.362529,-1.321906,5.062851,6.902683,2.427890,4.382457,1.582227,...,-0.845976,-0.762951,5.279886,1.245057,0.437348,0.582680,-3.586312,-1.277210,0.908622,1
13425,6.071326,0.660588,1.226287,15.932052,1.314730,0.187784,7.620481,4.759858,2.839697,0.713714,...,0.718202,-1.553362,1.188415,-0.384724,2.575076,0.480611,-0.311926,0.231515,-0.567655,1
13630,6.350177,3.788122,1.541779,6.265255,1.975950,0.733800,12.409587,0.524075,-0.649967,0.222363,...,2.851618,-1.565754,4.176311,-0.839116,0.924756,-0.510392,0.741787,-1.442627,0.398100,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2096,7.798982,2.526046,2.970881,6.893318,-1.129515,0.235625,8.606206,1.576792,-1.770885,2.742173,...,-0.016981,-0.683677,2.852990,-0.898194,-0.479254,-0.022088,0.080650,-3.744192,-0.692975,0
3119,7.327865,-0.376865,2.699678,13.954586,-0.296560,2.372599,8.907336,2.286551,0.063891,0.342788,...,1.685171,3.389431,0.614086,1.337380,-0.832471,0.316784,2.831905,0.312757,0.664271,0
12282,-0.942894,3.684849,5.741542,2.473060,2.029032,-2.260720,6.948571,2.745391,0.005597,1.622210,...,1.060632,1.564786,7.120507,-0.938739,3.216007,3.773955,-3.159097,-2.775773,0.232724,0
2523,3.125004,2.575865,3.681680,7.116656,0.615496,0.548456,8.694933,-0.393825,-0.718873,1.645031,...,1.355267,4.690177,5.148792,1.156489,1.326028,0.181204,6.206048,2.387287,-0.672031,0


In [17]:
X_train_1, y_train_1, X_test_1 = prepare_data(df_1, X_test)


In [19]:
_, loss_1, roc_auc_1, accuracy_1, f1_score_1 = fit_predict(X_train_1, X_test_1, y_train_1, y_test)
(loss_1, roc_auc_1, accuracy_1, f1_score_1)

(0.7717916020834025,
 0.6964398916551406,
 0.44163424124513617,
 0.15255905511811024)

In [20]:
X_train_2, y_train_2, X_test_2 = prepare_data(df_2, X_test)


In [21]:
_, loss_2, roc_auc_2, accuracy_2, f1_score_2 = fit_predict(X_train_2, X_test_2, y_train_2, y_test)
(loss_2, roc_auc_2, accuracy_2, f1_score_2)

(0.7322310144564195,
 0.671855189603504,
 0.5119974059662775,
 0.15591699383062255)

In [47]:
X_train_1_noise, y_train_1_noise, X_test_1_nosie = prepare_data(noise_df_1, X_test)
params, loss_1_noise, roc_auc_1_noise, accuracy_1_noise, f1_score_1_noise = fit_predict(X_train_1_noise, X_test_1_nosie, y_train_1_noise, y_test)
(loss_1_noise, roc_auc_1_noise, accuracy_1_noise, f1_score_1_noise)


Кросс-валидация ROC-AUC: [0.79337719 0.80917677 0.81080814 0.81671745 0.81348585]
Среднее значение ROC-AUC: 0.8087


(0.7939887449465851,
 0.655622978737266,
 0.3949416342412451,
 0.13850415512465375)

In [19]:
params.size

594

In [48]:
X_train_2_noise, y_train_2_noise, X_test_2_nosie = prepare_data(noise_df_2, X_test)
params, loss_2_noise, roc_auc_2_noise, accuracy_2_noise, f1_score_2_noise = fit_predict(X_train_2_noise, X_test_2_nosie, y_train_2_noise, y_test)
(loss_2_noise, roc_auc_2_noise, accuracy_2_noise, f1_score_2_noise)

Кросс-валидация ROC-AUC: [0.85925062 0.87490975 0.86824104 0.87733464 0.8846021 ]
Среднее значение ROC-AUC: 0.8729


(0.6629745025902152,
 0.6688992929313038,
 0.5817120622568094,
 0.16558861578266496)

In [21]:
params.size

594

In [39]:
all_data_noise = pd.concat([noise_df_1, noise_df_2])

In [40]:
X_train_all_noise, y_train_all_noise, X_test_all_nosie = prepare_data(all_data_noise, X_test)
_, loss_all_noise, roc_auc_all_noise, accuracy_all_noise, f1_score_all_noise = fit_predict(X_train_all_noise, X_test_all_nosie, y_train_all_noise, y_test)
(loss_all_noise, roc_auc_all_noise, accuracy_all_noise, f1_score_all_noise)

(0.684003755692351,
 0.6916677374344645,
 0.5664721141374838,
 0.17316017316017315)

* Глава 2 
    - IID

In [170]:
X_1, X_2, y_1, y_2 = train_test_split(df_clients.drop(columns="Fraud"), df_clients.Fraud, test_size=0.5,random_state=2211)

In [174]:
df_1_iid = pd.concat([X_1, y_1],axis=1)

In [175]:
df_2_iid = pd.concat([X_2, y_2],axis=1) 

In [176]:
X_train_1_iid, y_train_1_iid, X_test_1_iid = prepare_data(df_1_iid, X_test)
loss_1, roc_auc_1, accuracy_1, f1_score_1 = fit_predict(X_train_1_iid, X_test_1_iid, y_train_1_iid, y_test)
(loss_1, roc_auc_1, accuracy_1, f1_score_1)

(0.4677620247839864,
 0.7986831489165516,
 0.7519455252918288,
 0.24778761061946902)

In [178]:
X_train_2_iid, y_train_2_iid, X_test_2_iid = prepare_data(df_2_iid, X_test)
loss_2, roc_auc_2, accuracy_2, f1_score_2 = fit_predict(X_train_2_iid, X_test_2_iid, y_train_2_iid, y_test)
(loss_2, roc_auc_2, accuracy_2, f1_score_2)

(0.4614940804301869,
 0.7858171968649148,
 0.7538910505836576,
 0.24477611940298508)

In [230]:
noise_df_1.to_csv('NON_IID_1.csv', index=False)
noise_df_2.to_csv('NON_IID_2.csv', index=False)