# Loading Files

In [None]:
# Base
import os

# Data processing
import numpy as np
import pandas as pd

# Silence pandas' chained-assignment (SettingWithCopy) warnings for this session
pd.options.mode.chained_assignment = None

# Number of available cores: read from the NSLOTS environment variable, defaulting to 2
nslots = int(os.environ.get('NSLOTS', 2))

# Report the core count and the working directory; all data/result paths below are
# relative, so the working directory should be the project root.
print('cores:' ,nslots, ', dir:', os.getcwd())

# Force reload modules each execution
%load_ext autoreload
%autoreload 2

# Load custom modules
from helper.ml_class import LinkPredict
from helper.custom_cv import CustomGroupCV, CustomGroupSplit
from helper.base import load_data

In [None]:
# Load the subsample metadata and the initial modelling DataFrame.
# Two feature files are supplied (named `_py` and `_R`).
meta, df_initial = load_data(
    paths_features=[
        'data/processed/features/features_py.csv',
        'data/processed/features/features_R.csv',
    ],
    path_subsample='data/processed/networks/subsamples_edge_lists.csv',
    path_meta='data/processed/networks/subsamples_metadata.csv',
)

# Setting important variables

In [48]:
# Root directory for every result file written by this notebook
results_path = 'results/'

# Identifier / bookkeeping columns passed to the models as `columns_to_ignore`
columns_ignore = [
    'name',
    'link_ID',
    'subsample_ID',
    'fraction',
    'repetition',
    'higher_level',
    'lower_level',
    'weight',
]

# Modeling and predicting

### Use case


In [3]:
# Features excluded from modelling
to_drop = ['flow_infomap_HL', 'flow_infomap_LL', 'modular_centrality_infomap_HL', 'modular_centrality_infomap_LL']

# Work on a new frame (df_initial is left untouched): drop the excluded features
# and fix the dtypes of 'Fisher alpha' and the target.
# TODO: do the 'class' cast in the first (data-loading) step, as doing it here is confusing
df = (
    df_initial
    .drop(columns=to_drop)
    .astype({'Fisher alpha': 'float64', 'class': 'int64'})
)

# Put the target column 'class' last
df = df[[col for col in df.columns if col != 'class'] + ['class']]

In [4]:
# Single grouped train/test split (70% train), grouped by network 'name' and
# stratified by 'community'; trains on fraction 1 and tests on fraction 0.8.
cgs = CustomGroupSplit(
    group_by='name',
    stratify_by='community',
    fractions_col='fraction',
    train_size=0.7,
    random_state=42,
    undersample_ratio=None,
    groups_train=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    groups_test=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    fractions_train=[1],
    fractions_test=[0.8],
    # Original note: keep all fractions in train, although they are in
    # fractions_train (will be needed later)
    keep_fractions_train=True,
    keep_fractions_test=False,
    drop_isolates=False,
    drop_existing_links=True,
)

# Take the first split produced by the splitter, then fetch the link IDs of split 0
train_idx, test_idx = next(cgs.split(df))
train_link_id, test_link_id = cgs.get_link_ids(0)

# Build the train / test feature matrices and targets from the link IDs
# (the target is cast only for the test subset)
X_train, y_train = LinkPredict().subset_data(
    df, train_link_id, cast_target=False,
)
X_test, y_test = LinkPredict().subset_data(
    df, test_link_id, cast_target=True,
)

In [5]:
# Inner cross-validation splitter (3 splits), passed to `fit` as `cv_inner`
cgc_inner = CustomGroupCV(
    n_splits=3,
    group_by='name',
    stratify_by='community',
    fractions_col='fraction',
    groups_train=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    groups_test=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    fractions_train=[1],
    fractions_test=[0.8],
    drop_isolates=False,
    drop_existing_links=True,
)

In [6]:
# Outer cross-validation splitter (5 splits), passed to `fit` as `cv_outer`
cgc_outer = CustomGroupCV(
    n_splits=5,
    group_by='name',
    stratify_by='community',
    fractions_col='fraction',
    groups_train=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    groups_test=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    fractions_train=[1],
    fractions_test=[0.8],
    # Original note: keep all fractions in train, although they are in
    # fractions_train (will be needed later)
    keep_fractions_train=True,
    keep_fractions_test=False,
    drop_isolates=False,
    drop_existing_links=True,
)

### Train-Test split + Cross Validation

In [None]:
# ml = LinkPredict('RandomForestClassifier')

# ml.fit(X_train,
#        y_train,
#        cv=cgc_outer,
#        class_weight='balanced',
#        tuner_name='RandomizedSearchCV',
#        columns_to_ignore=columns_ignore)

# ml.multi_plot(X_test, y_test, threshold=0.5, 
#     plots=['confusion_matrix', 'grouped_evaluation', 'roc_curve', 'pr_curve', 'probs_distribution', 'roc_curve_split', 'pr_curve_split', 'grouped_evaluation_split', 'feature_importance'])


### Nested CV

In [None]:
# ml = LinkPredict('RandomForestClassifier')
# X, y = LinkPredict().subset_data(df, cast_target=False)

# ml.fit(X,
#        y,
#        cv_inner=cgc_inner,
#        cv_outer=cgc_outer, # Nested cross-validation
#        class_weight='balanced',
#        tuner_name='RandomizedSearchCV',
#        columns_to_ignore=columns_ignore)

In [314]:
# # Save results
# results_df = pd.DataFrame({'fold':[]})
# results_link_ids = cgc_outer.get_link_ids()[1]

# for fold in range(cgc_outer.get_n_splits()):
    
#     test_links = results_link_ids[fold]
#     X_test_subset = X[X['link_ID'].isin(test_links)]#.drop(['community'], axis=1)
#     ml.set_fold_model(fold)
    
#     y_proba = ml.predict_proba(X_test_subset)
#     df_len = len(y_proba)
#     results_fold_df = pd.DataFrame({
#         'fold':[fold]*df_len, 
#         'link_ID':X_test_subset['link_ID'],
#         'type_train':[cgc_outer.groups_train]*df_len,
#         'type_test':[cgc_outer.groups_test]*df_len,
#         'frac_train':[cgc_outer.fractions_train]*df_len,
#         'frac_test':[cgc_outer.fractions_test]*df_len,
#         'model':[ml.model_name]*df_len,
#         'y_proba':y_proba,
#     })
#     results_df = pd.concat([results_df, results_fold_df], axis=0)

In [315]:
# results_df.to_csv(results_path+'/raw/results_nested_cv.csv', index=False)

### Compare all community types 

In [16]:
from itertools import combinations

# Network types; `communities` is bound to the very same list object
network_types = communities = [
    'Host-Parasite',
    'Plant-Pollinator',
    'Plant-Seed Dispersers',
    'Plant-Herbivore',
]

# Type sets used for training and testing: every single network type ...
types_combinations = list(combinations(network_types, 1))

# ... plus two hand-picked multi-type sets
types_combinations.append(
    ('Plant-Seed Dispersers', 'Plant-Pollinator', 'Plant-Herbivore', 'Host-Parasite')
)  # Ecological
types_combinations.append(
    ('Plant-Seed Dispersers', 'Plant-Herbivore', 'Host-Parasite')
)  # Ecological-Non-Plant-Pollinator

In [None]:
# Nested cross-validation for every ordered (training types, testing types) pair,
# with hyperparameters tuned on F1. Fitted models and their outer splitters are
# stored in parallel lists.
trained_models_types = []
cgc_list = []

X, y = LinkPredict().subset_data(df, cast_target=False)

# Enumerate the pairs in the same order as two nested loops over types_combinations
type_pairs = [
    (train_types, test_types)
    for train_types in types_combinations
    for test_types in types_combinations
]

for types_train, types_test in type_pairs:

    print('Type train:', types_train, '\nType test:', types_test, '\n')

    # Settings shared by the inner and outer splitters
    cv_common = dict(
        group_by='name',
        stratify_by='community',
        fractions_col='fraction',
        drop_isolates=False,
        drop_existing_links=True,
    )

    # Inner splitter: both its train and test groups are the training types
    cgc_inner = CustomGroupCV(
        n_splits=3,
        fractions_train=[1],
        fractions_test=[0.8],
        groups_train=types_train,
        groups_test=types_train,
        **cv_common,
    )

    # Outer splitter: train on the training types, test on the testing types
    cgc_outer = CustomGroupCV(
        n_splits=5,
        fractions_train=[1],
        fractions_test=[0.8],
        groups_train=types_train,
        groups_test=types_test,
        # Original note: keep all fractions in train, although they are in
        # fractions_train (will be needed later)
        keep_fractions_train=True,
        keep_fractions_test=False,
        **cv_common,
    )

    ml = LinkPredict('RandomForestClassifier')

    # Train
    ml.fit(
        X,
        y,
        cv_inner=cgc_inner,
        cv_outer=cgc_outer,
        scorer_metric='f1',
        class_weight='balanced',
        tuner_name='RandomizedSearchCV',
        columns_to_ignore=columns_ignore,
    )

    # Tag the model with its type sets and store it with its outer splitter
    ml.types_train = types_train
    ml.types_test = types_test
    trained_models_types.append(ml)
    cgc_list.append(cgc_outer)
        

In [19]:
# Collect the outer-fold test predictions of every F1-tuned model.
# Per-fold frames are gathered in a list and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated frame on every fold.
fold_frames = []

for ml, cgc_outer in zip(trained_models_types, cgc_list):

    # Second element of get_link_ids() holds the test link IDs, indexed by fold
    results_link_ids = cgc_outer.get_link_ids()[1]

    for fold in range(cgc_outer.get_n_splits()):

        test_links = results_link_ids[fold]
        X_test_subset = X[X['link_ID'].isin(test_links)]
        ml.set_fold_model(fold)  # switch to the model fitted for this outer fold

        y_proba = ml.predict_proba(X_test_subset)
        df_len = len(y_proba)
        fold_frames.append(pd.DataFrame({
            'fold':[fold]*df_len,
            'link_ID':X_test_subset['link_ID'],
            'type_train':[list(cgc_outer.groups_train)]*df_len,
            'type_test':[list(cgc_outer.groups_test)]*df_len,
            # 'frac_train':[cgc_outer.fractions_train]*df_len,
            # 'frac_test':[cgc_outer.fractions_test]*df_len,
            # 'model':[ml.model_name]*df_len,
            'y_proba':y_proba,
        }))

# Same result as the incremental concat; an empty frame if nothing was collected
results_df = pd.concat(fold_frames, axis=0) if fold_frames else pd.DataFrame()

In [21]:
# Write the F1-tuned domain-comparison predictions (index column omitted)
results_df.to_csv(
    path_or_buf=results_path+'/raw/results_domains_F1.csv',
    index=False,
)

New: the same community-type comparison, tuned for balanced accuracy instead of F1

In [None]:
# Nested cross-validation for every ordered (training types, testing types) pair,
# with hyperparameters tuned on balanced accuracy, followed by collection and
# saving of the outer-fold test predictions.
trained_models_types_BA=[]
cgc_list_BA = []

X, y = LinkPredict().subset_data(df, cast_target=False)

for types_train in types_combinations:
    for types_test in types_combinations:

        print('Type train:', types_train, '\nType test:', types_test, '\n')

        # Inner splitter: both its train and test groups are the training types
        cgc_inner = CustomGroupCV(
            group_by='name',
            stratify_by='community',
            fractions_col = 'fraction',
            n_splits = 3,
            fractions_train = [1],
            fractions_test = [0.8],
            groups_train = types_train,
            groups_test = types_train,
            drop_isolates=False,
            drop_existing_links = True
            )

        # Outer splitter: train on the training types, test on the testing types
        cgc_outer = CustomGroupCV(
            group_by='name',
            stratify_by='community',
            fractions_col = 'fraction',
            n_splits = 5,
            fractions_train = [1],
            fractions_test = [0.8],
            groups_train = types_train,
            groups_test = types_test,
            drop_isolates=False,
            drop_existing_links = True,
            keep_fractions_train=True, # Keep all fractions in train, although they are in fractions_train (will be needed later)
            keep_fractions_test=False
            )

        ml = LinkPredict('RandomForestClassifier')

        # Train
        ml.fit(X,
            y,
            cv_inner=cgc_inner,
            cv_outer=cgc_outer,
            scorer_metric='balanced_accuracy',
            class_weight='balanced',
            tuner_name='RandomizedSearchCV',
            columns_to_ignore=columns_ignore)

        # Tag the model with its type sets and store it with its outer splitter
        ml.types_train = types_train
        ml.types_test = types_test
        trained_models_types_BA.append(ml)
        cgc_list_BA.append(cgc_outer)

# Save results.
# Per-fold frames are gathered in a list and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated frame on every fold.
fold_frames = []

for ml, cgc_outer in zip(trained_models_types_BA, cgc_list_BA):

    # Second element of get_link_ids() holds the test link IDs, indexed by fold
    results_link_ids = cgc_outer.get_link_ids()[1]

    for fold in range(cgc_outer.get_n_splits()):

        test_links = results_link_ids[fold]
        X_test_subset = X[X['link_ID'].isin(test_links)]
        ml.set_fold_model(fold)  # switch to the model fitted for this outer fold

        y_proba = ml.predict_proba(X_test_subset)
        df_len = len(y_proba)
        fold_frames.append(pd.DataFrame({
            'fold':[fold]*df_len,
            'link_ID':X_test_subset['link_ID'],
            'type_train':[list(cgc_outer.groups_train)]*df_len,
            'type_test':[list(cgc_outer.groups_test)]*df_len,
            # 'frac_train':[cgc_outer.fractions_train]*df_len,
            # 'frac_test':[cgc_outer.fractions_test]*df_len,
            # 'model':[ml.model_name]*df_len,
            'y_proba':y_proba,
        }))

# Same result as the incremental concat; an empty frame if nothing was collected
results_df = pd.concat(fold_frames, axis=0) if fold_frames else pd.DataFrame()

results_df.to_csv(results_path+'/raw/results_domains_BA.csv', index=False)

### Compare ML algorithms


In [None]:
# Compare classifiers with nested cross-validation over all four network types
communities = ['Host-Parasite', 'Plant-Pollinator', 'Plant-Seed Dispersers', 'Plant-Herbivore']
classifiers = ['RandomForestClassifier', 'LogisticRegression', 'XGBClassifier']

# Fitted models and their outer splitters, stored in parallel lists
trained_models_algos = []
cgc_list = []

for clf_name in classifiers:

    print('Classifier:', clf_name)

    # Settings shared by the inner and outer splitters
    cv_common = dict(
        group_by='name',
        stratify_by='community',
        fractions_col='fraction',
        drop_isolates=False,
        drop_existing_links=True,
    )

    cgc_inner = CustomGroupCV(
        n_splits=3,
        fractions_train=[1],
        fractions_test=[0.8],
        groups_train=communities,
        groups_test=communities,
        **cv_common,
    )

    cgc_outer = CustomGroupCV(
        n_splits=5,
        fractions_train=[1],
        fractions_test=[0.8],
        groups_train=communities,
        groups_test=communities,
        # Original note: keep all fractions in train, although they are in
        # fractions_train (will be needed later)
        keep_fractions_train=True,
        keep_fractions_test=False,
        **cv_common,
    )

    # Initialize the model for this classifier
    ml = LinkPredict(clf_name)

    X, y = LinkPredict().subset_data(df, cast_target=False)

    # Nested cross-validation: cv_inner and cv_outer are both supplied
    ml.fit(
        X,
        y,
        cv_inner=cgc_inner,
        cv_outer=cgc_outer,
        class_weight='balanced',
        tuner_name='RandomizedSearchCV',
        columns_to_ignore=columns_ignore,
    )

    # Store the fitted model together with its outer splitter
    trained_models_algos.append(ml)
    cgc_list.append(cgc_outer)

In [200]:
# Collect the outer-fold test predictions of every classifier.
# Per-fold frames are gathered in a list and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated frame on every fold.
fold_frames = []

for ml, cgc_outer in zip(trained_models_algos, cgc_list):

    # Second element of get_link_ids() holds the test link IDs, indexed by fold
    results_link_ids = cgc_outer.get_link_ids()[1]

    for fold in range(cgc_outer.get_n_splits()):

        test_links = results_link_ids[fold]
        # X_test_subset = X[X['link_ID'].isin(test_links)]
        X_test_subset, y_test_subset = ml.subset_data(df, test_links)
        ml.set_fold_model(fold)  # switch to the model fitted for this outer fold

        y_proba = ml.predict_proba(X_test_subset)
        df_len = len(y_proba)

        fold_frames.append(pd.DataFrame({
            'fold':[fold]*df_len,
            'link_ID':X_test_subset['link_ID'],
            'model':[ml.model_name]*df_len,
            'y_proba':y_proba,
        }))

# Same result as the incremental concat; an empty frame if nothing was collected
results_df = pd.concat(fold_frames, axis=0) if fold_frames else pd.DataFrame()

In [201]:
# Voting: average the predicted probability of the individual models for each
# (fold, link_ID) pair and append the averages as a pseudo-model named 'Voting'.
# Rows already labelled 'Voting' are excluded first so that re-running this cell
# does not append a second copy of them (on the first run this filter is a no-op).
base_results_df = results_df[results_df['model'] != 'Voting']

voting_df = (
    base_results_df
    .groupby(['fold', 'link_ID'])['y_proba']
    .mean()  # Compute the mean values
    .reset_index()
)
voting_df['model'] = 'Voting'

# Merge with the per-model predictions
results_df = pd.concat([base_results_df, voting_df], axis=0)

In [202]:
# Write the per-model (and voting) predictions (index column omitted)
results_df.to_csv(
    path_or_buf=results_path+'/raw/results_models.csv',
    index=False,
)

In [276]:
# One row per (model, outer fold): the best hyperparameters found and the
# searched distribution. Single-row frames are gathered in a list and
# concatenated once; concatenating inside the loop re-copies the accumulated
# frame on every iteration.
param_frames = []

for ml, cgc_outer in zip(trained_models_algos, cgc_list):
    for fold in range(cgc_outer.get_n_splits()):
        ml.set_fold_model(fold)  # switch to the model fitted for this outer fold

        best_params = ml.trained_model.best_params_
        param_frames.append(
            pd.DataFrame({'model':[ml.model_name],
                          'fold':[fold],
                          'params':[best_params],
                          'params_dist':[ml.params_dist]})
        )

# Same result as the incremental concat; an empty frame if nothing was collected
params_df = pd.concat(param_frames, axis=0) if param_frames else pd.DataFrame()

Distribution and best parameters for each model

In [287]:
# Build one row per (model, hyperparameter) holding the best value found in each
# outer fold plus the range that was searched.
data = {}

for ml, cgc_outer in zip(trained_models_algos, cgc_list):
    # Rows of the current model, keyed by simplified parameter name
    model_data = {}

    for fold in range(cgc_outer.get_n_splits()):
        ml.set_fold_model(fold)
        best_params = ml.trained_model.best_params_

        for full_name, best_value in best_params.items():
            # Strip the 'classifier__' pipeline prefix from the parameter name
            simple_param = full_name.replace('classifier__', '')

            # Create the row on first sight of the parameter, then add this fold's value
            row = model_data.setdefault(
                simple_param,
                {'model': ml.model_name, 'parameter': simple_param},
            )
            row[f'best value (fold {fold + 1})'] = best_value

    # Attach the searched range to each row and register it in the main dictionary
    for param, values in model_data.items():
        params_dist = ml.params_dist
        if isinstance(params_dist, list):
            # List of dicts: take the range from the first dict that has the parameter
            param_range = next(
                (d.get(param) for d in params_dist if param in d),
                'No range found',
            )
        else:
            # Single dict
            param_range = params_dist.get(param, 'No range found')

        values['range'] = param_range
        data[(ml.model_name, param)] = values

# Convert the collected rows into a DataFrame
params_df = pd.DataFrame.from_dict(data, orient='index').reset_index(drop=True)

# Set the range of 'feature_selector__k' explicitly to the hard-coded list below
# NOTE(review): presumably this range is not present in `ml.params_dist` - confirm
# that the hard-coded values match the ones actually searched.
params_df.loc[params_df['parameter'] == 'feature_selector__k', 'range'] = '[10, 20, 30, 40, 50, 70]'


In [289]:
# Write the hyperparameter summary table (index column omitted)
params_df.to_csv(
    path_or_buf=results_path+'/raw/params_models.csv',
    index=False,
)

### Feature importance


In [203]:
# Feature importance of every classifier in every outer fold, in long format
# (feature, model, importance, fold). Per-model frames are gathered in a list and
# concatenated once; concatenating inside the loop re-copies the accumulated
# frame on every iteration.
# NOTE(review): `cgc_outer` and `X_train` are not created in this cell - they are
# whatever earlier cells last bound to those names. Confirm they are the intended
# splitter and training data before relying on this output.
importance_frames = []

for fold in range(cgc_outer.get_n_splits()):
    for ml in trained_models_algos:
        ml.set_fold_model(fold)  # switch to the model fitted for this outer fold
        feat_importance_df = (
            ml.feature_importance(X_train=X_train)
            .to_frame(ml.model_name)
            .reset_index()
            .rename(columns={'index':'feature'})
            .melt(id_vars='feature', var_name='model', value_name='importance')
        )
        feat_importance_df['fold'] = fold
        importance_frames.append(feat_importance_df)

# Same result as the incremental concat; an empty frame if nothing was collected
feat_importance_all_df = pd.concat(importance_frames, axis=0) if importance_frames else pd.DataFrame()

feat_importance_all_df.reset_index(drop=True).to_csv(results_path+"/raw/feature_importance_nCV.csv", index=False)

### Processing biased sampling data

In [None]:
# Load the high-degree biased-sampling data
meta_h, df_initial_h = load_data(
    paths_features=[
        'data/processed/features/biased_sampling/features_highDegBiasSampling_py.csv',
        'data/processed/features/biased_sampling/features_highDegBiasSampling_R.csv',
    ],
    path_subsample='data/processed/networks/biased_sampling/highDegBiasSampling_edge_lists.csv',
    path_meta='data/processed/networks/biased_sampling/highDegBiasSampling_metadata.csv',
)

# Load the low-degree biased-sampling data
meta_l, df_initial_l = load_data(
    paths_features=[
        'data/processed/features/biased_sampling/features_lowDegBiasSampling_py.csv',
        'data/processed/features/biased_sampling/features_lowDegBiasSampling_R.csv',
    ],
    path_subsample='data/processed/networks/biased_sampling/lowDegBiasSampling_edge_lists.csv',
    path_meta='data/processed/networks/biased_sampling/lowDegBiasSampling_metadata.csv',
)

## Some preprocessing (later I will fix it in previous steps so no preprocessing will be needed)

# Work on copies so the freshly loaded frames stay untouched
df_h, df_l = df_initial_h.copy(), df_initial_l.copy()

# Features excluded from modelling (columns missing from a frame are skipped)
to_drop = ['density', 'common_neighbor_centrality', 'shortest_path_length', 'shortest_paths_count', 'flow_infomap_HL', 'flow_infomap_LL', 'modular_centrality_infomap_HL', 'modular_centrality_infomap_LL', 'discrepancy.HL', 'discrepancy.LL']

# Apply the same preparation to both biased-sampling frames
df_list = []
for biased_df in (df_h, df_l):
    prepared = (
        biased_df
        .drop(columns=to_drop, errors='ignore')  # drop features
        .astype({'Fisher alpha': 'float64', 'class': 'int64'})  # fix dtypes
    )
    # Put the target column 'class' last
    prepared = prepared[[col for col in prepared.columns if col != 'class'] + ['class']]
    df_list.append(prepared)

df_h, df_l = df_list

Setting and running the model

In [None]:
# Nested cross-validation of a random forest on the high-degree biased-sampling data

# Inner splitter (3 splits)
cgc_inner = CustomGroupCV(
    n_splits=3,
    group_by='name',
    stratify_by='community',
    fractions_col='fraction',
    groups_train=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    groups_test=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    fractions_train=[1],
    fractions_test=[0.8],
    drop_isolates=False,
    drop_existing_links=True,
)

# Outer splitter (5 splits)
cgc_outer = CustomGroupCV(
    n_splits=5,
    group_by='name',
    stratify_by='community',
    fractions_col='fraction',
    groups_train=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    groups_test=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    fractions_train=[1],
    fractions_test=[0.8],
    # Original note: keep all fractions in train, although they are in
    # fractions_train (will be needed later)
    keep_fractions_train=True,
    keep_fractions_test=False,
    drop_isolates=False,
    drop_existing_links=True,
)

ml = LinkPredict('RandomForestClassifier')
X, y = LinkPredict().subset_data(df_h, cast_target=False)

# Nested cross-validation: cv_inner and cv_outer are both supplied
ml.fit(
    X,
    y,
    cv_inner=cgc_inner,
    cv_outer=cgc_outer,
    class_weight='balanced',
    tuner_name='RandomizedSearchCV',
    columns_to_ignore=columns_ignore,
)


In [None]:
# Collect and save the outer-fold test predictions for the high-degree biased data.
# Per-fold frames are gathered in a list and concatenated once. The previous version
# seeded the accumulator with an empty frame holding only an (empty, float) 'fold'
# column, which can upcast 'fold' and triggers pandas' deprecation warning about
# concatenating empty entries, and it re-copied the accumulated frame on every fold.
fold_frames = []
results_link_ids = cgc_outer.get_link_ids()[1]  # test link IDs, indexed by fold

for fold in range(cgc_outer.get_n_splits()):

    test_links = results_link_ids[fold]
    X_test_subset = X[X['link_ID'].isin(test_links)]#.drop(['community'], axis=1)
    ml.set_fold_model(fold)  # switch to the model fitted for this outer fold

    y_proba = ml.predict_proba(X_test_subset)
    df_len = len(y_proba)
    fold_frames.append(pd.DataFrame({
        'fold':[fold]*df_len,
        'link_ID':X_test_subset['link_ID'],
        'type_train':[cgc_outer.groups_train]*df_len,
        'type_test':[cgc_outer.groups_test]*df_len,
        'frac_train':[cgc_outer.fractions_train]*df_len,
        'frac_test':[cgc_outer.fractions_test]*df_len,
        'model':[ml.model_name]*df_len,
        'y_proba':y_proba,
    }))

# Fall back to the original 'fold'-only empty frame if nothing was collected
results_df = pd.concat(fold_frames, axis=0) if fold_frames else pd.DataFrame({'fold':[]})

results_df.to_csv(results_path+'/raw/results_highDegBiasSampling.csv', index=False)

In [None]:
# Nested cross-validation of a random forest on the low-degree biased-sampling data

# Inner splitter (3 splits)
cgc_inner = CustomGroupCV(
    n_splits=3,
    group_by='name',
    stratify_by='community',
    fractions_col='fraction',
    groups_train=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    groups_test=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    fractions_train=[1],
    fractions_test=[0.8],
    drop_isolates=False,
    drop_existing_links=True,
)

# Outer splitter (5 splits)
cgc_outer = CustomGroupCV(
    n_splits=5,
    group_by='name',
    stratify_by='community',
    fractions_col='fraction',
    groups_train=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    groups_test=['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore'],
    fractions_train=[1],
    fractions_test=[0.8],
    # Original note: keep all fractions in train, although they are in
    # fractions_train (will be needed later)
    keep_fractions_train=True,
    keep_fractions_test=False,
    drop_isolates=False,
    drop_existing_links=True,
)

ml = LinkPredict('RandomForestClassifier')
X, y = LinkPredict().subset_data(df_l, cast_target=False)

# Nested cross-validation: cv_inner and cv_outer are both supplied
ml.fit(
    X,
    y,
    cv_inner=cgc_inner,
    cv_outer=cgc_outer,
    class_weight='balanced',
    tuner_name='RandomizedSearchCV',
    columns_to_ignore=columns_ignore,
)

In [None]:
# Collect and save the outer-fold test predictions for the low-degree biased data.
# Per-fold frames are gathered in a list and concatenated once. The previous version
# seeded the accumulator with an empty frame holding only an (empty, float) 'fold'
# column, which can upcast 'fold' and triggers pandas' deprecation warning about
# concatenating empty entries, and it re-copied the accumulated frame on every fold.
fold_frames = []
results_link_ids = cgc_outer.get_link_ids()[1]  # test link IDs, indexed by fold

for fold in range(cgc_outer.get_n_splits()):

    test_links = results_link_ids[fold]
    X_test_subset = X[X['link_ID'].isin(test_links)]#.drop(['community'], axis=1)
    ml.set_fold_model(fold)  # switch to the model fitted for this outer fold

    y_proba = ml.predict_proba(X_test_subset)
    df_len = len(y_proba)
    fold_frames.append(pd.DataFrame({
        'fold':[fold]*df_len,
        'link_ID':X_test_subset['link_ID'],
        'type_train':[cgc_outer.groups_train]*df_len,
        'type_test':[cgc_outer.groups_test]*df_len,
        'frac_train':[cgc_outer.fractions_train]*df_len,
        'frac_test':[cgc_outer.fractions_test]*df_len,
        'model':[ml.model_name]*df_len,
        'y_proba':y_proba,
    }))

# Fall back to the original 'fold'-only empty frame if nothing was collected
results_df = pd.concat(fold_frames, axis=0) if fold_frames else pd.DataFrame({'fold':[]})

results_df.to_csv(results_path+'/raw/results_lowDegBiasSampling.csv', index=False)

### Transductive

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif

def transductive_learning(df, columns_to_ignore=None):
    """Fit a tuned random forest on one frame and score its non-positive rows.

    The last column of `df` is the target; all other columns are the features.
    The model is trained on every row, with target value -1 relabelled to 0,
    and then predicts a probability for each row whose original target is not 1.
    NOTE(review): presumably -1 marks links that were removed by subsampling
    (true links that look like non-links) - confirm against the data preparation.

    Parameters
    ----------
    df : pandas.DataFrame
        Data of a single subsample; must contain a 'link_ID' column and have
        the target as its last column.
    columns_to_ignore : list of str, optional
        Columns passed to the preprocessor as `columns_to_ignore`. Defaults to
        the notebook-level `columns_ignore`.

    Returns
    -------
    pandas.DataFrame
        Columns 'link_ID' and 'ML_single' (predicted probability of the second
        class, column 1 of `predict_proba`), one row per scored link.
    """
    if columns_to_ignore is None:
        columns_to_ignore = columns_ignore

    # Split features / target
    X, y = df.iloc[:,:-1], df.iloc[:,-1]

    ## Training set: every row, with -1 relabelled as 0
    X_train = X.copy()
    y_train = y.copy()
    y_train[y_train == -1] = 0

    ## Test set: every row that is not labelled 1 in the original target
    X_test = X.loc[y != 1]

    # Set a transformer
    preprocessor = LinkPredict().set_transformer(X_train,
                                                columns_to_ignore=columns_to_ignore,
                                                save=False,
                                                return_preprocessor=True)

    # RandomForestClassifier search space shared by both bootstrap settings.
    # min_samples_split starts at 2: the integer value 1 is not accepted by
    # RandomForestClassifier, so candidates drawing it could never be fitted.
    shared_space = {
        'feature_selector__k': [10, 20, 30, 40, 50],  # Number of features to select
        'classifier__n_estimators': [int(x) for x in np.linspace(start = 5, stop = 100, num = 5)],
        'classifier__max_features': ['sqrt', 'log2'],
        'classifier__max_depth': [int(x) for x in np.linspace(10, 110, num = 11)] + [None],
        'classifier__min_samples_split': [2, 3, 4, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 3, 4, 5, 10],
        'classifier__max_leaf_nodes': [2, 4, 8, 16, 32, 64, 128],
        'classifier__criterion': ["gini", "entropy"],
    }
    # max_samples may only be set together with bootstrap=True (the forest raises
    # otherwise), so the two bootstrap settings get separate sub-spaces.
    param_dist = [
        {
            **shared_space,
            'classifier__bootstrap': [True],
            'classifier__max_samples': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        },
        {
            **shared_space,
            'classifier__bootstrap': [False],
        },
    ]
    rf = RandomForestClassifier(n_jobs=-1, random_state=42, class_weight='balanced')

    # Create pipeline: preprocessing -> univariate feature selection -> classifier
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("feature_selector", SelectKBest(mutual_info_classif)),
        ("classifier", rf)
    ])

    # RandomizedSearchCV, tuned on F1 with 3-fold CV.
    # The index is reset before fitting to avoid errors that happen with cv in some cases.
    clf = RandomizedSearchCV(pipe, param_distributions=param_dist, scoring='f1', cv=3, random_state=42, n_jobs=-1)
    clf.fit(X_train.reset_index(drop=True), y_train.reset_index(drop=True))

    y_proba = clf.predict_proba(X_test)[:,1]

    results_df = pd.DataFrame({
        'link_ID':X_test['link_ID'],
        'ML_single':y_proba,
    })
    return results_df
    

# Run the transductive model on every fraction-0.8 subsample of the four network
# types, for the unbiased, high-degree-biased and low-degree-biased data.
# Per-subsample frames are gathered in lists and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated frames on every subsample.
noBias_frames = []
highDegBias_frames = []
lowDegBias_frames = []

subsamples = df[
    (df['fraction'] == 0.8)
    & (df['community'].isin(['Plant-Pollinator', 'Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Herbivore']))
]['subsample_ID'].unique()

# Run random forest for each subsample
# NOTE(review): the subsample IDs come from `df` but are also used to select rows
# of `df_h` and `df_l` - confirm that the three frames share subsample IDs.
for subsample in subsamples:

    print('Subsample:', subsample)

    res_noBias = transductive_learning(df[df['subsample_ID'] == subsample])
    res_highDegBias = transductive_learning(df_h[df_h['subsample_ID'] == subsample])
    res_lowDegBias = transductive_learning(df_l[df_l['subsample_ID'] == subsample])

    noBias_frames.append(res_noBias)
    highDegBias_frames.append(res_highDegBias)
    lowDegBias_frames.append(res_lowDegBias)

# Same results as the incremental concat; empty frames if there were no subsamples
results_noBias = pd.concat(noBias_frames, axis=0) if noBias_frames else pd.DataFrame()
results_highDegBias = pd.concat(highDegBias_frames, axis=0) if highDegBias_frames else pd.DataFrame()
results_lowDegBias = pd.concat(lowDegBias_frames, axis=0) if lowDegBias_frames else pd.DataFrame()

In [None]:
# Write the three transductive result tables (index column omitted)
results_noBias.to_csv(
    path_or_buf=results_path+'/raw/results_transductiveML.csv',
    index=False,
)
results_highDegBias.to_csv(
    path_or_buf=results_path+'/raw/results_transductiveML_highDegBias.csv',
    index=False,
)
results_lowDegBias.to_csv(
    path_or_buf=results_path+'/raw/results_transductiveML_lowDegBias.csv',
    index=False,
)

### Processing network filtering data

In [None]:
# Load the filtered-network subsamples
meta_f, df_initial_f = load_data(
    paths_features=[
        'data/processed/features/filtering/features_filtered_py.csv',
        'data/processed/features/filtering/features_filtered_R.csv',
    ],
    path_subsample='data/processed/networks/filtering/subsamples_filtered_edge_lists.csv',
    path_meta='data/processed/networks/filtering/subsamples_filtered_metadata.csv',
)

# Features excluded from modelling ('discrepancy.HL' / 'discrepancy.LL' are kept here)
to_drop = ['flow_infomap_HL', 'flow_infomap_LL', 'modular_centrality_infomap_HL', 'modular_centrality_infomap_LL']

# Work on a new frame (df_initial_f is left untouched): drop the excluded
# features and fix the dtypes of 'Fisher alpha' and the target
df_f = (
    df_initial_f
    .drop(columns=to_drop)
    .astype({'Fisher alpha': 'float64', 'class': 'int64'})
)

# Put the target column 'class' last
df_f = df_f[[col for col in df_f.columns if col != 'class'] + ['class']]

In [None]:
# Score the filtered networks: rows of fraction 0.8 whose class is not 1
X_f, y_f = LinkPredict().subset_data(
    df_f.loc[df_f['fraction'].eq(0.8) & df_f['class'].ne(1.0)],
    cast_target=True,
)

# NOTE(review): `ml` is not created in this cell. In top-to-bottom order it is the
# model most recently assigned above (the one fitted on the low-degree biased data),
# using whichever fold model was selected last via set_fold_model. Confirm that
# this is the model intended for the filtered networks.
results_f_df = pd.DataFrame(
    data={
        'link_ID':X_f['link_ID'],
        'y_proba':ml.predict_proba(X_f),
    }
)

results_f_df.to_csv(
    path_or_buf=results_path+'/raw/results_filtered_networks.csv',
    index=False,
)

### Processing sensitivity data

In [None]:
# Load the sensitivity-analysis subsamples
meta_s, df_initial_s = load_data(
    paths_features=[
        'data/processed/features/sensitivity/features_sensitivity_py.csv',
        'data/processed/features/sensitivity/features_sensitivity_R.csv',
    ],
    path_subsample='data/processed/networks/sensitivity/subsamples_sensitivity_edge_lists.csv',
    path_meta='data/processed/networks/sensitivity/subsamples_sensitivity_metadata.csv',
)

# Features excluded from modelling ('discrepancy.HL' / 'discrepancy.LL' are kept here)
to_drop = ['flow_infomap_HL', 'flow_infomap_LL', 'modular_centrality_infomap_HL', 'modular_centrality_infomap_LL']

# Work on a new frame (df_initial_s is left untouched): drop the excluded
# features and fix the dtypes of 'Fisher alpha' and the target
df_s = (
    df_initial_s
    .drop(columns=to_drop)
    .astype({'Fisher alpha': 'float64', 'class': 'int64'})
)

# Put the target column 'class' last
df_s = df_s[[col for col in df_s.columns if col != 'class'] + ['class']]

In [None]:
# Sensitivity analysis: one nested-CV random forest per tested subsampling fraction
frac_list = [0.7,0.75,0.8,0.85,0.9,0.95]
groups = ['Plant-Seed Dispersers', 'Host-Parasite', 'Plant-Pollinator', 'Plant-Herbivore']

# Fitted models and their outer splitters, stored in parallel lists
trained_models_sensitivity = []
cgc_list_sensitivity = []

for frac in frac_list:

    print('Fraction:', frac)

    # Keep only the rows of the tested fraction and of the full networks (fraction 1.0)
    in_scope = df_s['fraction'].eq(frac) | df_s['fraction'].eq(1.0)
    X, y = LinkPredict().subset_data(df_s[in_scope], cast_target=False)

    # Settings shared by the inner and outer splitters
    cv_common = dict(
        group_by='name',
        stratify_by='community',
        fractions_col='fraction',
        drop_isolates=False,
        drop_existing_links=True,
    )

    cgc_inner = CustomGroupCV(
        n_splits=3,
        fractions_train=[1],
        fractions_test=[frac],
        groups_train=groups,
        groups_test=groups,
        **cv_common,
    )

    cgc_outer = CustomGroupCV(
        n_splits=5,
        fractions_train=[1],
        fractions_test=[frac],
        groups_train=groups,
        groups_test=groups,
        # Original note: keep all fractions in train, although they are in
        # fractions_train (will be needed later)
        keep_fractions_train=True,
        keep_fractions_test=False,
        **cv_common,
    )

    ml = LinkPredict('RandomForestClassifier')

    # Train
    ml.fit(
        X,
        y,
        cv_inner=cgc_inner,
        cv_outer=cgc_outer,
        scorer_metric='f1',
        class_weight='balanced',
        tuner_name='RandomizedSearchCV',
        columns_to_ignore=columns_ignore,
    )

    # Store the fitted model together with its outer splitter
    trained_models_sensitivity.append(ml)
    cgc_list_sensitivity.append(cgc_outer)

In [None]:
# Collect the outer-fold test predictions of every sensitivity model.
# Per-fold frames are gathered in a list and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated frame on every fold.
sensitivity_frames = []

for ml, cgc_outer in zip(trained_models_sensitivity, cgc_list_sensitivity):

    # Second element of get_link_ids() holds the test link IDs, indexed by fold
    results_link_ids = cgc_outer.get_link_ids()[1]

    for fold in range(cgc_outer.get_n_splits()):

        test_links = results_link_ids[fold]
        # NOTE(review): unlike the other result cells, the rows are taken straight
        # from df_s (which still contains the 'class' column) rather than from a
        # features-only frame - confirm predict_proba ignores the extra column.
        X_test_subset = df_s[df_s['link_ID'].isin(test_links)]
        ml.set_fold_model(fold)  # switch to the model fitted for this outer fold

        y_proba = ml.predict_proba(X_test_subset)
        df_len = len(y_proba)
        sensitivity_frames.append(pd.DataFrame({
            'fold':[fold]*df_len,
            'link_ID':X_test_subset['link_ID'],
            'frac_test':[cgc_outer.fractions_test[0]]*df_len,
            'y_proba':y_proba,
        }))

# Same result as the incremental concat; an empty frame if nothing was collected
results_sensitivity = pd.concat(sensitivity_frames, axis=0) if sensitivity_frames else pd.DataFrame()

In [None]:
# Write the sensitivity-analysis predictions (index column omitted)
results_sensitivity.to_csv(
    path_or_buf=results_path+'/raw/results_sensitivity.csv',
    index=False,
)