In [172]:
%run functions.ipynb

In [173]:
# Feature subset used for modelling: do_everything() restricts the training,
# test and reject frames to exactly these columns.
significant_columns = [
    "known_col_0",
    "known_col_1",
    "known_col_3",
    "known_col_4",
]

In [174]:
def do_everything(dataset, columns=None):
    """Run the supervised baseline pipeline for one dataset.

    Steps, in order: preprocess ``dataset``, build the reject frames (without
    and with id), balance the first frame returned by ``data_preprocessing``,
    split the balanced data with ``test_size=0.2`` / ``random_state=7``,
    restrict the modelling frames to the selected columns, fit the logistic
    regression and compute the test predictions.

    Parameters
    ----------
    dataset
        Passed unchanged to ``data_preprocessing``.
    columns : list of str, optional
        Columns kept in ``X_train``, ``X_test_3``, ``r_dev_mod`` and
        ``r_test_mod``. Defaults to the notebook-level ``significant_columns``.

    Returns
    -------
    tuple
        ``(r_dev_mod, r_test_mod, r_dev_mod_id, r_test_mod_id, X_train,
        X_test, X_test_3, y_train, y_test, pred_test1, logreg)``.
        ``X_test`` keeps all columns; ``X_test_3`` is its column-restricted
        counterpart.
    """
    selected = significant_columns if columns is None else list(columns)

    # Data preprocessing (the second returned frame is not needed here)
    accepted, _, r_dev, r_test = data_preprocessing(dataset, "is_accepted", "y", 0.8)

    # Create reject datasets without and with id
    r_dev_mod, r_test_mod = select_columns_rejects_without_id(
        r_dev, r_test, "r_dev_mod", "r_test_mod"
    )
    r_dev_mod_id, r_test_mod_id = select_columns_rejects_with_id(
        r_dev, r_test, "r_dev_mod_id", "r_test_mod_id"
    )

    # Balance
    X_res, y_res = balance(accepted)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_res, y_res, test_size=0.2, random_state=7
    )

    # The target is handed to the model as a single-column frame named "target".
    y_train = pd.DataFrame(data=y_train, columns=["target"])

    # Restrict every modelling frame to the same feature subset. (Column
    # selection already returns a new DataFrame, so no re-wrapping of X_train
    # is needed beforehand.)
    X_train = X_train[selected]
    X_test_3 = X_test[selected]
    r_dev_mod = r_dev_mod[selected]
    r_test_mod = r_test_mod[selected]

    # Logistic regression (the second value returned by log_reg is not used)
    logreg, _ = log_reg(X_train, y_train, X_test_3)

    # Predictions; note r_dev_mod is already column-restricted at this point
    pred_test1 = pred(y_test, X_test, X_test_3, logreg, r_dev_mod)

    return (
        r_dev_mod,
        r_test_mod,
        r_dev_mod_id,
        r_test_mod_id,
        X_train,
        X_test,
        X_test_3,
        y_train,
        y_test,
        pred_test1,
        logreg,
    )

## Semi-Supervised Learning

In [175]:
def semi_supervised(logreg):
    """Fit the semi-supervised models and return their kickout scores.

    NOTE: ``logreg`` is accepted for call compatibility but is not used.
    The function reads the notebook-level globals ``X_train``, ``y_train``,
    ``r_dev_mod``, ``X_test_3``, ``y_test`` and ``r_test_mod``; the driver
    loop sets them from ``do_everything`` before calling this function.

    Returns
    -------
    tuple
        ``(kickout_st, kickout_lp, kickout_ls, max_ssl_kickout,
        kickout_al_20)`` where ``max_ssl_kickout`` is the largest of the
        self-training, label-propagation and label-spreading scores.
    """

    def _score(model, predict):
        # Predict on the test features, flag every row, then compute kickout.
        scored = predict(model, X_test_3)
        scored["Flag"] = scored.apply(flag_df, axis=1)
        return kickout(scored)

    # Train
    train_new_model = ssl_prep(X_train, y_train, r_dev_mod)
    # Test
    # NOTE(review): the result of this call is never used below; the call is
    # retained only in case ssl_prep has side effects -- confirm and remove.
    ssl_prep(X_test_3, y_test, r_test_mod)
    X_ssl, y_ssl = ssl_split(train_new_model, "unlabel")

    ############# SEMI-SUPERVISED: SELF-TRAINING & LABEL PROPAGATION ############

    # SELF-TRAINING
    self_training = ssl_model_selftraining(X_ssl, y_ssl, SelfTrainingClassifier)
    kickout_st = _score(self_training, ssl_predictions_oth)

    # LABEL PROPAGATION
    label_propagation = ssl_model_label(X_ssl, y_ssl, LabelPropagation)
    kickout_lp = _score(label_propagation, ssl_predictions_oth)

    # LABEL SPREADING
    label_spreading = ssl_model_label(X_ssl, y_ssl, LabelSpreading)
    kickout_ls = _score(label_spreading, ssl_predictions_oth)

    ################ SEMI-SUPERVISED: ACTIVE LEARNING #############

    # WITH 20% OF THE REJECTED DATA
    regressor_20 = active_learning2(X_train, y_train, r_dev_mod, 0.2)
    kickout_al_20 = _score(regressor_20, ssl_predictions_oth2)

    # WITH 50% OF THE REJECTED DATA (disabled)
    # NOTE: the disabled code previously passed 0.2 here, which would simply
    # repeat the 20% run; corrected to 0.5 for when it is re-enabled.
    # regressor_50 = active_learning2(X_train, y_train, r_dev_mod, 0.5)
    # kickout_al_50 = _score(regressor_50, ssl_predictions_oth2)

    ############# SELECT ESTIMATORS WITH THE BEST RESULTS ############
    # max() returns the first maximal entry, so a tie is resolved in the
    # order lp, st, ls. The former strict ">" chain selected nothing on a tie
    # and left max_ssl_kickout unbound, raising UnboundLocalError at return.
    candidates = [
        ("lp", label_propagation, kickout_lp),
        ("st", self_training, kickout_st),
        ("ls", label_spreading, kickout_ls),
    ]
    best_name, best_model, max_ssl_kickout = max(candidates, key=lambda entry: entry[2])
    # Only consumed by the ensembling step below, which is currently disabled.
    estimators = [(best_name, best_model)]

    # if kickout_al_20 > kickout_al_50:
    #     estimators.append(("al_20", regressor_20))
    #     max_al_kickout = kickout_al_20
    # else:
    #     estimators.append(("al_50", regressor_50))
    #     max_al_kickout = kickout_al_50

    ############# ENSEMBLING ############

    # x_ds = ssl_predictions_ds(X_test_3, estimators)
    # x_ds["Flag"] = x_ds.apply(flag_df, axis=1)
    # kickout_ds = kickout(x_ds)
    return kickout_st, kickout_lp, kickout_ls, max_ssl_kickout, kickout_al_20

## Run for all datasets

In [None]:
# NOTE(review): absolute, machine-specific path. Point this at the data folder
# of the current checkout before running on another machine.
a_directory = "C:/Users/Asus/Desktop/Repo/MasterThesis_RI/Data_28_04/"

# Column order matches the tuple returned by semi_supervised().
KICKOUT_COLUMNS = [
    "kickout_st",
    "kickout_lp",
    "kickout_ls",
    "max_ssl_kickout",
    "kickout_al_20",
]

my_dataframes = []
# os.listdir returns entries in arbitrary order; sort for reproducible row order.
for filename in sorted(os.listdir(a_directory)):
    filepath = os.path.join(a_directory, filename)
    if not os.path.isfile(filepath):
        # Skip sub-directories (e.g. notebook checkpoint folders).
        continue

    # NOTE(review): do_everything still receives the bare file name, as
    # before; presumably data_preprocessing resolves the location itself --
    # confirm, and pass `filepath` instead if it does not.
    #
    # The names unpacked here are read as globals by semi_supervised(), so
    # they must not be renamed.
    (
        r_dev_mod,
        r_test_mod,
        r_dev_mod_id,
        r_test_mod_id,
        X_train,
        X_test,
        X_test_3,
        y_train,
        y_test,
        pred_test1,
        logreg,
    ) = do_everything(filename)

    # Ri1_train
    ri1_train = predictions1(logreg, r_dev_mod, r_dev_mod_id)
    evaluation(ri1_train, X_test)

    # Calculate kickout measures
    kickout_list = list(semi_supervised(logreg))

    # Row label = file name without its extension (works for any extension
    # length, unlike slicing off the last four characters).
    new_filename = os.path.splitext(filename)[0]
    new_filename2 = pd.DataFrame(
        [kickout_list], columns=KICKOUT_COLUMNS, index=[new_filename]
    )
    my_dataframes.append(new_filename2)

# pd.concat raises on an empty list; fall back to an empty, correctly-shaped frame.
if my_dataframes:
    df_results = pd.concat(my_dataframes, axis=0)
else:
    df_results = pd.DataFrame(columns=KICKOUT_COLUMNS)

In [None]:
# Display the per-dataset kickout table assembled by the loop in the previous cell.
df_results

#### Threshold everywhere = median

In [141]:
# Saved output of an earlier run (cons_scen1_* datasets). Its columns include
# kickout_al_50, max_al_kickout and kickout_ds and lack kickout_ls, so it does
# not match what the current semi_supervised() returns.
df_results

Unnamed: 0,kickout_st,kickout_lp,max_ssl_kickout,kickout_al_20,kickout_al_50,max_al_kickout,kickout_ds
cons_scen1_1,0.002116,0.043117,0.043117,0.000794,0.000794,0.000794,-0.023865
cons_scen1_2,0.003983,0.054434,0.054434,0.002124,0.002124,0.002124,-0.021752
cons_scen1_3,0.005282,-0.004005,0.005282,0.001056,0.001056,0.001056,-0.022297


In [160]:
# Saved output of an earlier run (cons_scen2_* datasets); same stale column
# layout (kickout_al_50, max_al_kickout, kickout_ds; no kickout_ls).
df_results

Unnamed: 0,kickout_st,kickout_lp,max_ssl_kickout,kickout_al_20,kickout_al_50,max_al_kickout,kickout_ds
cons_scen2_1,0.00225,0.022848,0.022848,0.0005,0.0005,0.0005,-0.049954
cons_scen2_2,0.003013,0.009573,0.009573,0.000502,0.000502,0.000502,-0.04659
cons_scen2_3,0.002493,-0.055176,0.002493,-0.00134,-0.00134,-0.00134,-0.043438


In [166]:
# Saved output of an earlier run (cons_scen3_* datasets); same stale column
# layout (kickout_al_50, max_al_kickout, kickout_ds; no kickout_ls).
df_results

Unnamed: 0,kickout_st,kickout_lp,max_ssl_kickout,kickout_al_20,kickout_al_50,max_al_kickout,kickout_ds
cons_scen3_1,0.003859,0.049851,0.049851,0.002671,0.002671,0.002671,-0.025107
cons_scen3_2,0.005895,0.019386,0.019386,0.003537,0.003537,0.003537,-0.027741
cons_scen3_3,0.009788,-0.053009,0.009788,0.005932,0.005932,0.005932,-0.022824


In [148]:
# Saved output of an earlier run (paper_* datasets); same stale column
# layout (kickout_al_50, max_al_kickout, kickout_ds; no kickout_ls).
df_results

Unnamed: 0,kickout_st,kickout_lp,max_ssl_kickout,kickout_al_20,kickout_al_50,max_al_kickout,kickout_ds
paper_1,-0.000776,-0.023881,-0.000776,-0.000253,-0.000253,-0.000253,-0.207248
paper_2,0.000483,-0.023012,0.000483,0.000714,0.000714,0.000714,-0.201223
paper_3,0.001589,-0.045014,0.001589,0.000275,0.000275,0.000275,-0.120312


In [154]:
# Saved output of an earlier run (mfi_* datasets); same stale column
# layout (kickout_al_50, max_al_kickout, kickout_ds; no kickout_ls).
df_results

Unnamed: 0,kickout_st,kickout_lp,max_ssl_kickout,kickout_al_20,kickout_al_50,max_al_kickout,kickout_ds
mfi_1,0.008677,0.0159,0.0159,0.000404,0.000404,0.000404,-0.089392
mfi_2,0.008808,-0.036539,0.008808,0.000396,0.000396,0.000396,-0.090937
mfi_3,0.000549,-0.087811,0.000549,0.001,0.001,0.001,-0.066867
