In [1]:
import pandas as pd
from fcalc.classifier import BinarizedBinaryClassifier as BBC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder as OHE
from tqdm.notebook import tqdm
import numpy as np

In [2]:
def prepare_data(pdf, target_column):
    """Convert a mixed-type DataFrame into binary features and a target series.

    Processing steps:

    1. Every ``object``-dtype column is one-hot encoded with
       ``OneHotEncoder(handle_unknown='ignore')``; the encoded columns are
       sparse-backed and named by ``get_feature_names_out()``.
    2. The remaining columns are kept, the index is reset to 0..n-1, and the
       encoded columns are appended on the right.
    3. Every column with more than two distinct values is thresholded at its
       median: 1 where the value is strictly greater than the median, else 0.
       This applies to the target column as well if it has more than two
       distinct values.

    The input frame is not modified; all work happens on derived frames, so
    the function can be called repeatedly on the same input.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Source data containing both the features and the target column.
    target_column : str
        Name of the label column. It must not have ``object`` dtype, since
        such columns are replaced by their one-hot encoded counterparts.

    Returns
    -------
    X : pandas.DataFrame
        All processed columns except ``target_column``.
    y : pandas.Series
        The processed ``target_column``.

    Raises
    ------
    ValueError
        If ``target_column`` is not present after encoding.
    """
    obj_columns = pdf.dtypes[pdf.dtypes == 'object'].keys()

    enc = OHE(handle_unknown='ignore')
    encoded = enc.fit_transform(pdf[obj_columns])
    encoded = pd.DataFrame.sparse.from_spmatrix(
        encoded, columns=enc.get_feature_names_out())

    # Non-mutating drop/reset: the caller's frame keeps its columns and index.
    # The reset aligns the rows positionally with the freshly built `encoded`
    # frame, which carries a default 0..n-1 index.
    remaining = pdf.drop(columns=obj_columns).reset_index(drop=True)
    df = pd.concat([remaining, encoded], axis=1)

    if target_column not in df.columns:
        raise ValueError(
            f"target column {target_column!r} is not available after encoding")

    for c in df.columns:
        if df[c].nunique() > 2:
            med = df[c].median()
            # Vectorized equivalent of `1 if x > med else 0` per element.
            df[c] = (df[c] > med).astype(int)

    X = df.drop(columns=[target_column])
    y = df[target_column]
    return X, y

In [14]:
def parameter_testing(X, y, alphas=(0, 0.5, 1, 2, 5), n_folds=5):
    """Cross-validate ``BinarizedBinaryClassifier`` over a grid of alpha values.

    The rows are split, in their existing order (no shuffling), into
    ``n_folds`` contiguous folds. For every alpha each fold is used once as the
    test set while the remaining folds form the training set; the per-fold
    accuracy is averaged and printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; converted with ``to_numpy().astype(int)``.
    y : pandas.Series
        Labels; converted with ``to_numpy().astype(int)``.
    alphas : iterable of numbers, optional
        Values passed to the classifier as ``alpha``. Defaults to the grid
        ``(0, 0.5, 1, 2, 5)``.
    n_folds : int, optional
        Number of cross-validation folds. Defaults to 5.

    Returns
    -------
    dict
        Mapping ``alpha -> mean accuracy`` across the folds (the same numbers
        that are printed).

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 2, if ``X`` and ``y`` have different
        lengths, or if there are fewer rows than folds (which would create
        empty folds).
    """
    if n_folds < 2:
        raise ValueError(f"n_folds must be at least 2, got {n_folds}")

    features = X.to_numpy().astype(int)
    labels = y.to_numpy().astype(int)

    if len(features) != len(labels):
        raise ValueError(
            f"X and y differ in length: {len(features)} != {len(labels)}")
    if len(features) < n_folds:
        raise ValueError(
            f"need at least {n_folds} rows for {n_folds} folds, got {len(features)}")

    X_folds = np.array_split(features, n_folds)
    y_folds = np.array_split(labels, n_folds)

    results = {}
    for alpha in alphas:
        print(f'Testing with alpha = {alpha}')
        acc = 0
        for i in tqdm(range(n_folds)):
            X_train = np.concatenate(
                [fold for j, fold in enumerate(X_folds) if j != i], axis=0)
            y_train = np.concatenate(
                [fold for j, fold in enumerate(y_folds) if j != i], axis=0)

            X_test = X_folds[i]
            y_test = y_folds[i]

            bbc = BBC(X_train, y_train, alpha=alpha)
            bbc.predict(X_test)
            pred = bbc.predictions
            # Any prediction that is not exactly equal to the true label counts
            # as an error.
            # NOTE(review): presumably the classifier can emit a value other
            # than 0/1 for samples it leaves unclassified, which would count as
            # wrong here - confirm against the fcalc documentation.
            acc += (pred == y_test).sum() / len(pred)

        results[alpha] = acc / n_folds
        print(f'Alpha: {alpha}, accuracy: {acc / n_folds}')

    return results

In [4]:
# Load the three raw datasets from the local data_sets/ directory (paths are
# relative to the notebook's working directory).
weather_df = pd.read_csv('data_sets/weatherAUS.csv')
bikes_df = pd.read_csv('data_sets/train.csv')
smoking_df = pd.read_csv('data_sets/smoking_driking_dataset_Ver01.csv')

In [5]:
# Clean the weather data: keep complete rows only, drop the Date and Location
# columns, and encode the Yes/No rain flags as 1/0.
# This cell rebinds `weather_df`, so re-run the loading cell before running it
# again (a second run would no longer find the dropped columns).
weather_df = (
    weather_df
    .dropna()
    .drop(columns=['Date', 'Location'])
    .reset_index(drop=True)
)
for rain_col in ('RainToday', 'RainTomorrow'):
    weather_df[rain_col] = weather_df[rain_col].eq('Yes').astype(int)

# Subsample to keep the experiment fast. The fixed seed makes the subsample
# (and therefore the reported accuracies) reproducible; the size is capped so
# sample() cannot raise when fewer than 2000 complete rows exist.
weather_df = weather_df.sample(n=min(2000, len(weather_df)), random_state=42)

In [6]:
# Build binary features and the 'RainTomorrow' target for the weather data.
X, y = prepare_data(weather_df, 'RainTomorrow')

In [7]:
# Cross-validate the classifier over the alpha grid on the weather data.
parameter_testing(X, y)

Testing with alpha = 0


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 0, accuracy: 0.5365
Testing with alpha = 0.5


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 0.5, accuracy: 0.5365
Testing with alpha = 1


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 1, accuracy: 0.0
Testing with alpha = 2


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 2, accuracy: 0.0
Testing with alpha = 5


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 5, accuracy: 0.0


In [8]:
# Preview the raw bikes data before cleaning.
bikes_df.head()

Unnamed: 0,Id,time,T_min,T_mean,T_max,P_min,P_mean,P_max,U_min,U_mean,...,N_min,N_mean,N_max,VV_min,VV_mean,VV_max,RRR_min,RRR_mean,RRR_max,y
0,0,2021-04-05 00:00:00+03:00,4.1,5.5,8.2,747.4,749.2125,751.9,58,72.125,...,0.6,0.89375,1.0,8.0,9.75,10.0,0.05,1.025,2.0,1
1,1,2021-04-06 00:00:00+03:00,12.4,17.375,22.1,766.3,767.9625,769.4,29,45.25,...,0.0,0.2875,0.6,10.0,10.0,10.0,0.0,0.0,0.0,1
2,2,2021-04-07 00:00:00+03:00,16.4,18.8,22.6,759.3,759.885714,760.6,61,72.428571,...,0.25,0.657143,0.95,10.0,10.0,10.0,0.0,0.0,0.0,0
3,3,2021-04-08 00:00:00+03:00,14.2,17.6375,22.8,758.9,760.975,762.5,42,68.0,...,0.25,0.50625,0.95,10.0,10.0,10.0,0.0,0.15,0.3,0
4,4,2021-04-09 00:00:00+03:00,6.1,7.7875,9.6,750.1,753.6125,757.3,68,80.625,...,0.5,0.8,0.95,4.1,9.2625,10.0,2.0,3.5,5.0,1


In [9]:
# Remove the Id and time columns, then keep only rows without missing values.
bikes_df = bikes_df.drop(columns=['Id', 'time']).dropna()

In [10]:
# Number of rows in the bikes data at this point.
len(bikes_df)

256

In [11]:
# Preview the bikes data again to inspect the current columns.
bikes_df.head()

Unnamed: 0,T_min,T_mean,T_max,P_min,P_mean,P_max,U_min,U_mean,U_max,Ff_min,...,N_min,N_mean,N_max,VV_min,VV_mean,VV_max,RRR_min,RRR_mean,RRR_max,y
0,4.1,5.5,8.2,747.4,749.2125,751.9,58,72.125,87,5,...,0.6,0.89375,1.0,8.0,9.75,10.0,0.05,1.025,2.0,1
1,12.4,17.375,22.1,766.3,767.9625,769.4,29,45.25,65,3,...,0.0,0.2875,0.6,10.0,10.0,10.0,0.0,0.0,0.0,1
2,16.4,18.8,22.6,759.3,759.885714,760.6,61,72.428571,80,1,...,0.25,0.657143,0.95,10.0,10.0,10.0,0.0,0.0,0.0,0
3,14.2,17.6375,22.8,758.9,760.975,762.5,42,68.0,82,2,...,0.25,0.50625,0.95,10.0,10.0,10.0,0.0,0.15,0.3,0
4,6.1,7.7875,9.6,750.1,753.6125,757.3,68,80.625,85,6,...,0.5,0.8,0.95,4.1,9.2625,10.0,2.0,3.5,5.0,1


In [12]:
# Build binary features and the 'y' target for the bikes data.
X, y = prepare_data(bikes_df, 'y')

In [15]:
# Cross-validate the classifier over the alpha grid on the bikes data.
parameter_testing(X, y)

Testing with alpha = 0


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 0, accuracy: 0.4178733031674208
Testing with alpha = 0.5


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 0.5, accuracy: 0.4178733031674208
Testing with alpha = 1


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 1, accuracy: 0.0
Testing with alpha = 2


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 2, accuracy: 0.0
Testing with alpha = 5


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 5, accuracy: 0.0


In [16]:
# Preview the raw smoking/drinking data before cleaning.
smoking_df.head()

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,...,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd,DRK_YN
0,Male,35,170,75,90.0,1.0,1.0,1.0,1.0,120.0,...,126.0,92.0,17.1,1.0,1.0,21.0,35.0,40.0,1.0,Y
1,Male,30,180,80,89.0,0.9,1.2,1.0,1.0,130.0,...,148.0,121.0,15.8,1.0,0.9,20.0,36.0,27.0,3.0,N
2,Male,40,165,75,91.0,1.2,1.5,1.0,1.0,120.0,...,74.0,104.0,15.8,1.0,0.9,47.0,32.0,68.0,1.0,N
3,Male,50,175,80,91.0,1.5,1.2,1.0,1.0,145.0,...,104.0,106.0,17.6,1.0,1.1,29.0,34.0,18.0,1.0,N
4,Male,50,165,60,80.0,1.0,1.2,1.0,1.0,138.0,...,117.0,104.0,13.8,1.0,0.8,19.0,12.0,25.0,1.0,N


In [17]:
# Clean the smoking/drinking data: keep complete rows only and encode the
# 'DRK_YN' label as 1 for 'Y' and 0 otherwise.
# This cell rebinds `smoking_df`, so re-run the loading cell before running it
# again (on a second run the label no longer contains 'Y' and would become 0
# everywhere).
smoking_df = (
    smoking_df
    .dropna()
    .assign(DRK_YN=lambda frame: frame['DRK_YN'].eq('Y').astype(int))
)

# Subsample to keep the experiment fast. The fixed seed makes the subsample
# (and therefore the reported accuracies) reproducible; the size is capped so
# sample() cannot raise when fewer than 2000 complete rows exist.
smoking_df = smoking_df.sample(n=min(2000, len(smoking_df)), random_state=42)

In [18]:
# Build binary features and the 'DRK_YN' target for the smoking/drinking data.
X, y = prepare_data(smoking_df, 'DRK_YN')

In [19]:
# Cross-validate the classifier over the alpha grid on the smoking/drinking data.
parameter_testing(X, y)

Testing with alpha = 0


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 0, accuracy: 0.192
Testing with alpha = 0.5


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 0.5, accuracy: 0.192
Testing with alpha = 1


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 1, accuracy: 0.0
Testing with alpha = 2


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 2, accuracy: 0.0
Testing with alpha = 5


  0%|          | 0/5 [00:00<?, ?it/s]

Alpha: 5, accuracy: 0.0
