### feature_extractor

In [1]:
import math

import networkx as nx
import numpy as np
import pandas as pd

from collections import defaultdict
from datetime import date
from tqdm import tqdm
from itertools import product

# distant_df = pd.read_csv('../data/us_air_distance.csv')
# distant_map = distant_df.set_index(['source_origin','target_origin']).to_dict()['distance']
# distant_map.update(distant_df.set_index(['target_origin','source_origin']).to_dict()['distance'])
# population_map = pd.read_csv('../data/us_air_population_all.csv').set_index('Unnamed: 0').fillna(0).to_dict()['0']

def get_gravitation(edges):
    def my_divid(a,b):
        if b==0 or a==0:
            return None
        else:
            return a/b
    Gra = []
    for e in edges:
        u, v = e
        d = distant_map.get(e , 0)
        ni = population_map.get(u, 0)
        nj = population_map.get(v, 0)
        Gra.append(my_divid(ni*nj, d**2))
    meanv = np.mean([i for i in Gra if i])
    return [i if i else meanv for i in Gra]

def features_extractor(graphs, dates):
    def local_path(G, nodeList, epsilon = 0.01):
        A = nx.adjacency_matrix(G, nodelist=nodeList, weight = None).todense()
        return (A**2+epsilon*A**3)

    def l3_path(G, nodeList):
        A = nx.adjacency_matrix(G, nodelist=nodeList, weight = None).todense()
        return (A**3)

    def weighted_local_path(G, nodeList, epsilon = 0.01):
        A = nx.adjacency_matrix(G, nodelist=nodeList, weight='weight').todense()
        return (A**2+epsilon*A**3)

    X = defaultdict(list)
    for i in tqdm(range(len(graphs)-1)):
        G, H = graphs[i], graphs[i+1]
        G.add_nodes_from([n for n in H if n not in G])
        H.add_nodes_from([n for n in G if n not in H])
        Hedges = set(H.edges())
        Gedges = list(G.edges())
        nodeList = list(G.nodes())
        nodeIndex = {node: idx for idx,node in enumerate(nodeList)}
        year = dates[i]

        Ki = dict(G.degree())
        Wi = dict(G.degree(weight='weight'))
        LPI = local_path(G, nodeList)
        L3 = l3_path(G, nodeList)
        WLPI = weighted_local_path(G, nodeList)
        Gra = get_gravitation(Gedges)

        added_edges = list(nx.difference(H,G).edges())

        for j, e in enumerate(Gedges):
            u, v = e
            common_ns = list(nx.common_neighbors(G,u,v))
            w_common_ns = sum([min(G[u][z]['weight'], G[v][z]['weight']) for z in common_ns])
            union_ns = set(G.neighbors(u))|set(G.neighbors(v))
            w_union_ns = Wi[u] + Wi[v]- w_common_ns
            if(w_union_ns==0): print(Wi[u] , Wi[v], [min(G[u][z]['weight'], G[v][z]['weight']) for z in common_ns])
            X['Edge'].append(e)
            X['Year'].append(year)

            X['Common Neighbor'].append(len(common_ns))
            X['Weighted Common Neighbor'].append(w_common_ns)

            X['Salton'].append(len(common_ns)/math.sqrt(Ki[u]*Ki[v]))
            X['Weighted Salton'].append(w_common_ns/math.sqrt(Wi[u]*Wi[v]))

            X['Sorensen'].append(2*len(common_ns)/(Ki[u]+Ki[v]))
            X['Weighted Sorensen'].append(2*w_common_ns/(Wi[u]+Wi[v]))

            X['Hub Promoted'].append(len(common_ns)/min(Ki[u],Ki[v]))
            X['Weighted Hub Promoted'].append(w_common_ns/min(Wi[u],Wi[v]))

            X['Hub Depressed'].append(len(common_ns)/max(Ki[u],Ki[v]))
            X['Weighted Hub Depressed'].append(w_common_ns/max(Wi[u],Wi[v]))

            X['Leicht Holme Newman'].append(len(common_ns)/(Ki[u]*Ki[v]))
            X['Weighted Leicht Holme Newman'].append(w_common_ns/(Wi[u]*Wi[v]))

            X['Preferential Attachment'].append(Ki[u]*Ki[v])
            X['Weighted Preferential Attachment'].append(Wi[u]*Wi[v])

            X['Local Path'].append(LPI[nodeIndex[u],nodeIndex[v]])
            X['L3 Path'].append(L3[nodeIndex[u],nodeIndex[v]])
            X['Weighted Local Path'].append(WLPI[nodeIndex[u],nodeIndex[v]])
            if len(common_ns)>0:
                X['Resource Allocation'].append(sum([1/Ki[z] for z in common_ns]))
                X['Weighted Resource Allocation'].append(w_common_ns*sum([1/Wi[z] for z in common_ns]))

                X['Adamic Adar'].append(sum([1/math.log(Ki[z]) for z in common_ns]))
                X['Weighted Adamic Adar'].append(w_common_ns*sum([1/math.log(Wi[z]+1) for z in common_ns]))

                X['Jaccard'].append(len(common_ns)/len(union_ns))
                X['Weighted Jaccard'].append(w_common_ns/w_union_ns)
            else:
                X['Resource Allocation'].append(0)
                X['Weighted Resource Allocation'].append(0)
                X['Adamic Adar'].append(0)
                X['Weighted Adamic Adar'].append(0)
                X['Jaccard'].append(0)
                X['Weighted Jaccard'].append(0)

            X['Removed'].append(e not in Hedges)
            X['Gravity'].append(Gra[j])
            X['Curr Weight'].append(G[u][v]['weight'])
            X['Next Weight'].append(H[u][v]['weight'] if e in Hedges else 0)

            X['Curr FWeight'].append(G[u][v]['weight']/G.size(weight='weight'))
            X['Next FWeight'].append(H[u][v]['weight']/H.size(weight='weight') if e in Hedges else 0)

    df = pd.DataFrame(X)
    return(df)

In [2]:
from glob import glob
from_file = 'data/raw_usair_data/*.csv'
glob(from_file)

['data/raw_usair_data/2004_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2005_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2006_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2007_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2008_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2009_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2010_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2011_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2012_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2013_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2014_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2015_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2016_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2017_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2018_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/2019_T_T100D_SEGMENT_ALL_CARRIER.csv',
 'data/raw_usair_data/20

### raw2features_usair

In [50]:
import re
import sys
import unicodedata

import pandas as pd
import networkx as nx
import numpy as np

from tqdm import tqdm
from datetime import date
from glob import glob
from collections import defaultdict

sys.path.append('src')
# from features_extractor import features_extractor

def strip_accents(text):
    """
    Strip accents from input String.

    :param text: The input string.
    :type text: String.

    :returns: The processed String.
    :rtype: String.
    """
    try:
        text = unicode(text, 'utf-8')
    except (TypeError, NameError): # unicode is a default on python 3
        pass
    text = unicodedata.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    return str(text)

def text_to_id(text):
    """
    Convert input text to id.

    :param text: The input string.
    :type text: String.

    :returns: The processed String.
    :rtype: String.
    """
    text = strip_accents(text.lower())
    text = re.sub(r"\d", "", text)
    text=re.sub(r"^\s+", "", text)
    text=re.sub(r"\s+$", "", text)
    text = re.sub(r"\s+","_", text, flags = re.I)
    #text = re.sub('[ ]+', '_', text)
    text = re.sub('[^a-zA-Z_-]', '', text)
    return text

if __name__ == "__main__":
    from_file = 'data/raw_usair_data/*.csv'
    feature_file = 'data/features/usair_2004_2022.csv'

#     try:
#         data = pd.read_csv(to_file, sep=';')
#         data.set_index(['YEAR', 'MONTH'], inplace=True)
#     except:
#         print(f'{to_file} not found! Generating graphs from raw')
    dfs=[]
    for f in tqdm(sorted(glob(from_file))):
        df=pd.read_csv(f,engine="python",error_bad_lines=False)
        df=df[['YEAR','MONTH','ORIGIN_CITY_NAME','DEST_CITY_NAME','PASSENGERS','DEPARTURES_PERFORMED']]
        df=df.rename(index=str, columns={"ORIGIN_CITY_NAME": "source",
                                         "DEST_CITY_NAME": "target",
                                         'PASSENGERS':'passengers',
                                         'DEPARTURES_PERFORMED':'weight'})
        df['source']=df.apply(lambda row: text_to_id(str(row.source)), axis=1)
        df['target']=df.apply(lambda row: text_to_id(str(row.target)), axis=1)
        df=df.groupby(['YEAR','MONTH','source','target']).sum()
        df=df.reset_index()
        dfs.append(df[df.weight !=0 ])
    data=pd.concat(dfs, ignore_index=True)
    data=data.reset_index().drop(columns='index')
    data.set_index(['YEAR', 'MONTH'],inplace=True)
    data.sort_index(inplace=True)
    # data.to_csv(to_file,sep=';')

    data = data[data.source != data.target]
    data = data[data.weight!=0]
    
    print(data)
#     year = list(data.index.get_level_values(0).unique())
#     month = list(data.index.get_level_values(1).unique())
#     graphs_air = []
#     date_air = []
#     for y in year:
#         for m in month:
#             if y==2022 and m==12:
#                 break
#             df = data.loc[y,m]
#             date_air.append(date(y,m,1))
#             G = nx.from_pandas_edgelist(df, edge_attr=True)
#             graphs_air.append(G)
#     features = features_extractor(graphs_air, date_air)
#     features.to_csv(feature_file)



  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",error_bad_lines=False)


  df=pd.read_csv(f,engine="python",err

                   source            target  passengers  weight
YEAR MONTH                                                     
2004 1        aberdeen_sd      jamestown_nd        45.0    25.0
     1        aberdeen_sd    minneapolis_mn      2782.0   141.0
     1        aberdeen_sd         pierre_sd       505.0    52.0
     1        aberdeen_sd    sioux_falls_sd         0.0    35.0
     1        aberdeen_sd      watertown_sd       139.0    29.0
...                   ...               ...         ...     ...
2022 10           yuma_az         laredo_tx       590.0     4.0
     10           yuma_az        phoenix_az      6536.0   222.0
     10           yuma_az   port_hueneme_ca         0.0     1.0
     10     zachar_bay_ak      amook_bay_ak         0.0     1.0
     10     zachar_bay_ak  kodiak_island_ak         2.0     3.0

[2256649 rows x 4 columns]


#### Note *** Missing data -- 11/2022***

In [76]:
year = list(data.index.get_level_values(0).unique())
month = list(data.index.get_level_values(1).unique())
graphs_air = []
date_air = []
for y in year:
    for m in month:
        if y==2022 and m==10:
            break
        df = data.loc[y,m]
        date_air.append(date(y,m,1))
        G = nx.from_pandas_edgelist(df, edge_attr=True)
        graphs_air.append(G)
features = features_extractor(graphs_air, date_air)
features.to_csv(feature_file)

  A = nx.adjacency_matrix(G, nodelist=nodeList, weight = None).todense()
  A = nx.adjacency_matrix(G, nodelist=nodeList, weight = None).todense()
  A = nx.adjacency_matrix(G, nodelist=nodeList, weight='weight').todense()
100%|███████████████████████████████████████████████████████████████████████████████████| 224/224 [5:52:16<00:00, 94.36s/it]


### plot_style

In [5]:
pd.read_csv("data/features/usair_2004_2022.csv")

Unnamed: 0.1,Unnamed: 0,Edge,Year,Common Neighbor,Weighted Common Neighbor,Salton,Weighted Salton,Sorensen,Weighted Sorensen,Hub Promoted,...,Adamic Adar,Weighted Adamic Adar,Jaccard,Weighted Jaccard,Removed,Gravity,Curr Weight,Next Weight,Curr FWeight,Next FWeight
0,0,"('aberdeen_sd', 'jamestown_nd')",2004-01-01,1,26.0,0.258199,0.172219,0.250000,0.139410,0.333333,...,0.198025,2.646591,0.142857,0.074928,False,3.794004e+04,29.0,26.0,0.000068,0.000063
1,1,"('aberdeen_sd', 'minneapolis_mn')",2004-01-01,4,99.0,0.143223,0.042340,0.049689,0.010551,0.800000,...,3.190348,79.667360,0.025478,0.005303,False,1.511278e+05,137.0,135.0,0.000322,0.000328
2,2,"('aberdeen_sd', 'pierre_sd')",2004-01-01,2,31.0,0.338062,0.151742,0.333333,0.141876,0.400000,...,0.521541,7.715756,0.200000,0.076355,False,2.600904e+04,59.0,54.0,0.000139,0.000131
3,3,"('aberdeen_sd', 'sioux_falls_sd')",2004-01-01,2,166.0,0.190693,0.322516,0.148148,0.278757,0.400000,...,0.711924,50.393390,0.080000,0.161951,False,1.715192e+05,42.0,40.0,0.000099,0.000097
4,4,"('aberdeen_sd', 'watertown_sd')",2004-01-01,1,53.0,0.316228,0.340191,0.285714,0.280423,0.500000,...,0.198025,5.394975,0.166667,0.163077,False,1.081186e+05,29.0,29.0,0.000068,0.000070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1308971,1308971,"('west_point_ak', 'uganik_ak')",2022-08-01,0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,True,1.010896e+07,3.0,0.0,0.000008,0.000000
1308972,1308972,"('santa_maria_ca', 'santa_ynez_ca')",2022-08-01,1,1.0,0.288675,0.075810,0.250000,0.022472,0.500000,...,0.236177,0.121455,0.142857,0.011364,True,9.942292e+08,1.0,0.0,0.000003,0.000000
1308973,1308973,"('monroe_nc', 'selinsgrove_pa')",2022-08-01,0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,True,1.010896e+07,1.0,0.0,0.000003,0.000000
1308974,1308974,"('st_michael_ak', 'stebbins_ak')",2022-08-01,3,64.0,0.408248,0.476288,0.400000,0.474074,0.500000,...,0.921257,30.367745,0.250000,0.310680,True,7.013384e+05,25.0,0.0,0.000068,0.000000


In [6]:
import seaborn as sns
import matplotlib.mlab as ml
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import ticker
import matplotlib.ticker as plticker

def stdfigsize(scale=1, nx=1, ny=1, ratio=1.3):
    """
    Returns a tuple to be used as figure size.
    -------
    returns (7*ratio*scale*nx, 7.*scale*ny)
    By default: ratio=1.3
    If ratio<0 them ratio = golden ratio
    """
    if ratio < 0:
        ratio = 1.61803398875
    return((7*ratio*scale*nx, 7*scale*ny))

def stdrcparams(usetex=False):
    """
    Set several mpl.rcParams and sns.set_style for my taste.
    ----
    usetex = True
    ----
    """
    sns.set_style("white")
    sns.set_style({"xtick.direction": "in",
                 "ytick.direction": "in"})
    rcparams = {'text.usetex': usetex,
              'font.family': 'sans-serif',
              'font.sans-serif': ['Helvetica'],
             # 'text.latex.unicode': True,
              'text.latex.preamble': [r"\usepackage[T1]{fontenc}",
                                      r"\usepackage{lmodern}",
                                      r"\usepackage{amsmath}",
                                      r"\usepackage{mathptmx}"
                                      ],
              'axes.labelsize': 30,
              'axes.titlesize': 30,
              'ytick.right': 'on',
              'xtick.top': 'on',
              'xtick.labelsize': '25',
              'ytick.labelsize': '25',
              'axes.linewidth': 1.8,
              'xtick.major.width': 1.8,
              'xtick.minor.width': 1.8,
              'xtick.major.size': 14,
              'xtick.minor.size': 7,
              'xtick.major.pad': 10,
              'xtick.minor.pad': 10,
              'ytick.major.width': 1.8,
              'ytick.minor.width': 1.8,
              'ytick.major.size': 14,
              'ytick.minor.size': 7,
              'ytick.major.pad': 10,
              'ytick.minor.pad': 10,
              'axes.labelpad': 15,
              'axes.titlepad': 15,
              "xtick.direction": "in",
              "ytick.direction": "in",
              'legend.fontsize': 20}
    mpl.rcParams.update(rcparams)

mpl.rcParams['lines.linewidth'] = 5
mpl.rcParams['lines.color'] = '#3690c0'

stdrcparams(usetex=True)
figsize=stdfigsize(ratio=-1)
xs,ys=figsize

def custom_frame(ax):
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.tick_params(axis='x',length=10,direction='out')
    ax.tick_params(axis='x',which='minor',direction='out')
    ax.tick_params(axis='y',length=10,direction='out')
    ax.tick_params(axis='y',which='minor',direction='out')

### classification

In [98]:
import sys
import random
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import scipy.stats as ss

from datetime import date
from tqdm import tqdm
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix,balanced_accuracy_score, mean_squared_error,r2_score,mean_absolute_error

sys.path.append('src')
# from plot_style import *

def get_edge_slice(data, f_train_e=0.7, seed=30):
    df = data
    edges = set(df.Edge.unique())
    random.seed(seed)
    edge_train = set(random.sample(edges, int(f_train_e*len(edges))))
    edge_test = set([e for e in edges if e not in edge_train])
    df_se = df.loc[df['Edge'].isin(edge_train)].drop(columns = ['Edge'])
    df_de = df.loc[df['Edge'].isin(edge_test)].drop(columns = ['Edge'])
    return(df_se, df_de)

def df_to_XY(df, features, target='Removed'):
    if 'Year' in df.columns:
        df = df.drop(columns = ['Year'])
    if "Edge" in df.columns:
        df = df.drop(columns = ['Edge'])
    X = df.loc[:, features].to_numpy()
    y = df.loc[:, df.columns == target].to_numpy()
    return(X, y)

def simultaneous_test(df_se, df_de, features, best_params, save = True, name = None):
    if name is None:
        name = ''.join([w[0] for w in features]) + '_simultaneous'
    else:
        name =  name + '_simultaneous'
    year_list = list(df_se.Year.unique())
    res_df_de = df_de.copy()
    res_df_de['simultaneous_pred']= np.nan
    res_df_de['simultaneous_null']= np.nan
    for year in tqdm(year_list):
        X_train,y_train = df_to_XY(df_se[df_se.Year==year],features)
        ros = RandomUnderSampler()
        X_train,y_train = ros.fit_resample(X_train,y_train)
        X_test,y_test = df_to_XY(df_de[df_de.Year==year],features)
        y_train_null = y_train.copy()
        np.random.shuffle(y_train_null)
        model = XGBClassifier(**best_params)
        model.fit(X_train, y_train)
        model_null = XGBClassifier(**best_params)
        model_null.fit(X_train, y_train_null)
        y_pred = model.predict(X_test)
        y_pred_null = model_null.predict(X_test)
        res_df_de.loc[res_df_de.Year==year, 'simultaneous_pred'] = y_pred
        res_df_de.loc[res_df_de.Year==year, 'simultaneous_null'] = y_pred_null
    if save:
        res_df_de.to_csv('data\\results\\'+name+'.csv')
    return res_df_de

def nonsimultaneous_test(df_train, df_test, features, best_params, save=True, name = None):
    if name is None:
        name =  ''.join([w[0] for w in features]) + '_nonsimultaneous'
    else:
        name = name + '_nonsimultaneous'
    year_list = list(df_test.Year.unique())
    preds = []
    for year_train in tqdm(year_list):
        X_train,y_train = df_to_XY(df_train[df_train.Year==year_train],features)
        ros = RandomUnderSampler()
        X_train,y_train = ros.fit_resample(X_train,y_train)
        y_train_null = y_train.copy()
        np.random.shuffle(y_train_null)
        model = XGBClassifier(**best_params)
        model.fit(X_train, y_train)
        model_null = XGBClassifier(**best_params)
        model_null.fit(X_train, y_train_null)
        i = year_list.index(year_train)
        for year_test in year_list[i:]:
            X_test, y_test = df_to_XY(df_test[df_test.Year==year_test],features)
            y_pred = model.predict(X_test)
            y_null = model_null.predict(X_test)
            preds.append([year_train ,year_test, y_test,y_pred,y_null])
    if save:
        import pickle
        with open('data\\results\\'+name+'.pkl', 'wb') as f:
            pickle.dump(preds, f)
    return preds

def all_shap_values(df1, df2, features, best_params, save=True, name = None):
    import shap
    if name is None:
        name =  ''.join([w[0] for w in features]) + '_SHAP'
    else:
        name =  name + '_SHAP'
    def get_temporal_order(shap_list):
        importance_array = []
        for shap_values in shap_list:
            array = -np.abs(shap_values).mean(axis=0)
            ranks = ss.rankdata(array)
            importance_array.append(ranks)
        return(np.array(importance_array))

    shap_values_list = []
    test_list = []
    year_list = []
    for i in tqdm(df2.Year.unique()):
        X_train,y_train = df_to_XY(df1[ df1.Year == i ].drop(columns = ['Year']),features)
        ros = RandomUnderSampler()
        X_train,y_train = ros.fit_resample(X_train,y_train)
        X_test,y_test = df_to_XY(df2[ df2.Year == i ].drop(columns = ['Year']), features)
        model = XGBClassifier(**best_params)
        model.fit(X_train, y_train)
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)
        year_list.append(i)
        test_list.append(pd.DataFrame(X_test,columns=features))
        shap_values_list.append(shap_values)
    if save:
        import pickle
        with open('data\\results\\'+name+'.pkl', 'wb') as f:
            pickle.dump((test_list, year_list, shap_values_list), f)
    return (test_list, year_list, shap_values_list)

def BTF(train, test):
    name = 'Air_Classification_BTF'
    features = ['Common Neighbor', 'Salton', 'Jaccard', 'Sorensen', 'Hub Promoted',
               'Hub Depressed', 'Leicht Holme Newman', 'Preferential Attachment',
               'Adamic Adar', 'Resource Allocation', 'Local Path']
    simultaneous_test(train, test, features, best_params, name = name)
    nonsimultaneous_test(train, test, features, best_params, name = name)
    all_shap_values(train, test, features, best_params, name = name)

def WTF(train, test):
    name = 'Air_Classification_WTF'
    features = []
    for c in data.columns:
        if  'Weighted' in c:
            features.append(c)
    simultaneous_test(train, test, features, best_params,name = name)
    nonsimultaneous_test(train, test, features, best_params, name = name)
    all_shap_values(train, test, features, best_params, name = name)

def WWW(train, test):
    name = 'Air_Classification_WWW'
    features = ['Curr FWeight']
    simultaneous_test(train, test, features, best_params, name = name)
    nonsimultaneous_test(train, test, features, best_params, name = name)
    all_shap_values(train, test, features, best_params, name = name)

def BTFW(train, test):
    name = 'Air_Classification_BTFW'
    features = ['Common Neighbor', 'Salton', 'Jaccard', 'Sorensen', 'Hub Promoted',
               'Hub Depressed', 'Leicht Holme Newman', 'Preferential Attachment',
               'Adamic Adar', 'Resource Allocation', 'Local Path','Curr FWeight']

    simultaneous_test(train, test, features, best_params, name = name)
    nonsimultaneous_test(train, test, features, best_params, name = name)
    all_shap_values(train, test, features, best_params, name = name)

if __name__ == "__main__":
    # to run classification on bus change the data
    # data =

    global best_params
    best_params = None
    for data_path in ["data/features/usair_2004_2022.csv"]:
        data = pd.read_csv(data_path)
        train, test = get_edge_slice(data)
        if best_params is None:
            best_params = {}
        BTF(train, test)
        WTF(train, test)
        WWW(train, test)
        BTFW(train, test)

since Python 3.9 and will be removed in a subsequent version.
  edge_train = set(random.sample(edges, int(f_train_e*len(edges))))
100%|█████████████████████████████████████████████████████████████████████████████████████| 224/224 [04:12<00:00,  1.13s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████| 224/224 [26:08<00:00,  7.00s/it]


ModuleNotFoundError: No module named 'shap'

### prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

import random
import math
import datetime

from datetime import date
from collections import defaultdict
from tqdm import tqdm
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import confusion_matrix,balanced_accuracy_score, \
mean_squared_error,r2_score,mean_absolute_error

import sys
sys.path.append("src/")
from plot_style import *

def get_feature_vector(graph):
    def local_path(G, nodeList, epsilon = 0.01):
        A = nx.adjacency_matrix(G, nodelist=nodeList, weight = None).todense()
        return (A**2+epsilon*A**3)

    def weighted_local_path(G, nodeList, epsilon = 0.01):
        A = nx.adjacency_matrix(G, nodelist=nodeList, weight='weight').todense()
        return (A**2+epsilon*A**3)
    X = defaultdict(list)
    G = graph
    Gedges = list(G.edges())
    nodeList = list(G.nodes())
    nodeIndex = {node: idx for idx,node in enumerate(nodeList)}

    Ki = dict(G.degree())
    Wi = dict(G.degree(weight='weight'))
    LPI = local_path(G, nodeList)
    WLPI = weighted_local_path(G, nodeList)
    for j, e in enumerate(Gedges):
        u, v = e
        common_ns = list(nx.common_neighbors(G,u,v))
        w_common_ns = sum([min(G[u][z]['weight'], G[v][z]['weight']) for z in common_ns])
        union_ns = set(G.neighbors(u))|set(G.neighbors(v))
        w_union_ns = Wi[u] + Wi[v]- w_common_ns
        if(w_union_ns==0): print(Wi[u] , Wi[v], [min(G[u][z]['weight'], G[v][z]['weight']) for z in common_ns])
        X['Edge'].append(e)

        X['Common Neighbor'].append(len(common_ns))
        X['Weighted Common Neighbor'].append(w_common_ns)

        X['Salton'].append(len(common_ns)/math.sqrt(Ki[u]*Ki[v]))
        X['Weighted Salton'].append(w_common_ns/math.sqrt(Wi[u]*Wi[v]))

        X['Sorensen'].append(2*len(common_ns)/(Ki[u]+Ki[v]))
        X['Weighted Sorensen'].append(2*w_common_ns/(Wi[u]+Wi[v]))

        X['Hub Promoted'].append(len(common_ns)/min(Ki[u],Ki[v]))
        X['Weighted Hub Promoted'].append(w_common_ns/min(Wi[u],Wi[v]))

        X['Hub Depressed'].append(len(common_ns)/max(Ki[u],Ki[v]))
        X['Weighted Hub Depressed'].append(w_common_ns/max(Wi[u],Wi[v]))

        X['Leicht Holme Newman'].append(len(common_ns)/(Ki[u]*Ki[v]))
        X['Weighted Leicht Holme Newman'].append(w_common_ns/(Wi[u]*Wi[v]))

        X['Preferential Attachment'].append(Ki[u]*Ki[v])
        X['Weighted Preferential Attachment'].append(Wi[u]*Wi[v])

        X['Local Path'].append(LPI[nodeIndex[u],nodeIndex[v]])
        X['Weighted Local Path'].append(WLPI[nodeIndex[u],nodeIndex[v]])
        if len(common_ns)>0:
            X['Resource Allocation'].append(sum([1/Ki[z] for z in common_ns]))
            X['Weighted Resource Allocation'].append(w_common_ns*sum([1/Wi[z] for z in common_ns]))

            X['Adamic Adar'].append(sum([1/math.log(Ki[z]) for z in common_ns]))
            X['Weighted Adamic Adar'].append(w_common_ns*sum([1/math.log(Wi[z]+1) for z in common_ns]))

            X['Jaccard'].append(len(common_ns)/len(union_ns))
            X['Weighted Jaccard'].append(w_common_ns/w_union_ns)
        else:
            X['Resource Allocation'].append(0)
            X['Weighted Resource Allocation'].append(0)
            X['Adamic Adar'].append(0)
            X['Weighted Adamic Adar'].append(0)
            X['Jaccard'].append(0)
            X['Weighted Jaccard'].append(0)

        X['Curr Weight'].append(G[u][v]['weight'])
        X['Curr FWeight'].append(G[u][v]['weight']/G.size(weight='weight'))
    df = pd.DataFrame(X)
    return(df)

def get_edge_slice(data, f_train_e=0.7, seed=30):
    df = data
    edges = set(df.Edge.unique())
    random.seed(seed)
    edge_train = set(random.sample(edges, int(f_train_e*len(edges))))
    edge_test = set([e for e in edges if e not in edge_train])
    df_se = df.loc[df['Edge'].isin(edge_train)].drop(columns = ['Edge'])
    df_de = df.loc[df['Edge'].isin(edge_test)].drop(columns = ['Edge'])
    return(df_se, df_de)

def df_to_XY(df, features, target='Removed'):
    if 'Year' in df.columns:
        df = df.drop(columns = ['Year'])
    if "Edge" in df.columns:
        df = df.drop(columns = ['Edge'])
    X = df.loc[:, features].to_numpy()
    y = df.loc[:, df.columns == target].to_numpy()
    return(X, y, df.loc[:, df.columns == 'Next Weight'].to_numpy())

def add_edges(graphs, inp_graph, time_idx):
    GI0,GI1 = graphs[time_idx+1],graphs[time_idx]
    GI0.add_nodes_from([n for n in GI1 if n not in GI0])
    GI1.add_nodes_from([n for n in GI0 if n not in GI1])
    added_edges = list(nx.difference(GI0,GI1).edges())
    return added_edges

def main(year_start):
    if year_start.month==1: return
    data = pd.read_csv('../data/networks/US_Air_2004_2022.csv', sep=';')
    data.set_index(['YEAR', 'MONTH'], inplace=True)
    data = data[data.source != data.target]
    nodes = set(data.source) & set(data.target)
    data = data[data.weight!=0]
    year = list(data.index.get_level_values(0).unique())
    month = list(data.index.get_level_values(1).unique())
    graphs_air = []
    air_dates = []
    for y in year:
        for m in month:
            if y==2022 and m==10:
                break
            df = data.loc[y,m]
            air_dates.append(date(y,m,1))
            G = nx.from_pandas_edgelist(df, edge_attr=True)
            G.add_nodes_from(nodes)
            graphs_air.append(G)

    data = pd.read_csv('data/features/US_Air_2004_2022.csv')
    out = {}
    idx = air_dates.index(year_start)
    train, test = get_edge_slice(data)
    features = ['Common Neighbor', 'Salton', 'Jaccard', 'Sorensen', 'Hub Promoted',
           'Hub Depressed', 'Leicht Holme Newman', 'Preferential Attachment',
           'Adamic Adar', 'Resource Allocation', 'Local Path']
    X_train, y_train, _ = df_to_XY(train[train.Year==str(year_start)],features)
    ros = RandomUnderSampler()
    X_train,y_train = ros.fit_resample(X_train,y_train)
    model_btf = XGBClassifier()
    model_btf.fit(X_train, y_train)
    model = model_btf
    diff_btf = []
    graphs_btf = [graphs_air[idx]]
    for i in tqdm(range(0,36)):
        G = graphs_btf[i].copy()
        df = get_feature_vector(G)
        edges, X = df['Edge'].to_numpy(),df[features].to_numpy()

        GI0,GI1 = graphs_air[idx+i],graphs_air[idx+i+1]
        GI0.add_nodes_from([n for n in GI1 if n not in GI0])
        GI1.add_nodes_from([n for n in GI0 if n not in GI1])
        real_removal = set(nx.difference(GI0,GI1).edges())

        pred_prob = model.predict_proba(X).T[0]
        added_edges = add_edges(graphs_air, G, idx+i)
        N_add = len(added_edges)
        for u,v in added_edges:
            G.add_edge(u, v, weight=graphs_air[idx+i+1][u][v]['weight'])
        N_remove = G.number_of_edges() - graphs_air[idx+i+1].number_of_edges()

        removal = zip(edges,pred_prob)
        removal = sorted(removal, key = lambda x: x[1])[0:N_remove]
        remove_edges = [i for i,_ in removal]
        diff_btf.append(len(set(remove_edges)&real_removal)/N_remove)
        G.remove_edges_from(remove_edges)
        graphs_btf.append(G.copy())
    best_params = {'lambda': 0.5650701862593042, 'alpha': 0.0016650896783581535,
           'colsample_bytree': 1.0, 'subsample': 0.5, 'learning_rate': 0.009,
           'n_estimators': 625, 'objective':'reg:squarederror','max_depth': 5, 'min_child_weight': 6}
    features = ['Curr Weight']
    X_train, y_train, y_reg = df_to_XY(train[train.Year==str(year_start)],features)
    ros = RandomUnderSampler()
    X_resample, y_resample = ros.fit_resample(X_train,y_train)
    model_www = XGBClassifier()
    model_www.fit(X_resample, y_resample)
    model_reg = XGBRegressor(**best_params)
    model_reg.fit(X_train, y_reg)
    model = model_www
    diff_www = []
    graphs_www = [graphs_air[idx]]
    for i in tqdm(range(0,36)):
        G = graphs_www[i].copy()
        for u,v in G.edges():
            G[u][v]['weight'] = model_reg.predict([G[u][v]['weight']])[0]
        df = get_feature_vector(G)
        edges, X = df['Edge'].to_numpy(),df[features].to_numpy()
        pred_prob = model.predict_proba(X).T[0]
        GI0,GI1 = graphs_air[idx+i],graphs_air[idx+i+1]
        GI0.add_nodes_from([n for n in GI1 if n not in GI0])
        GI1.add_nodes_from([n for n in GI0 if n not in GI1])
        real_removal = set(nx.difference(GI0,GI1).edges())
        added_edges = add_edges(graphs_air, G, idx+i)
        N_add = len(added_edges)
        for u,v in added_edges:
            G.add_edge(u, v, weight=graphs_air[idx+i+1][u][v]['weight'])
        N_remove = G.number_of_edges() - graphs_air[idx+i+1].number_of_edges()
        removal = zip(edges,pred_prob)
        removal = sorted(removal, key = lambda x: x[1])[0:N_remove]
        remove_edges = [i for i,_ in removal]
        diff_www.append(len(set(remove_edges)&real_removal)/N_remove)
        G.remove_edges_from(remove_edges)
        graphs_www.append(G.copy())
    btf_diff = []
    www_diff = []
    for i in range(0,36):
        G = set(graphs_air[idx+i].edges())
        H = set(graphs_btf[i].edges())
        btf_diff.append(len(G & H)/len(G))
        H = set(graphs_www[i].edges())
        www_diff.append(len(G & H)/len(G))
    out[year_start]=(diff_btf,diff_www,btf_diff,www_diff)
    import pickle
    with open(f'data\\results\\'+ f'{str(year_start)}pred_36' +'.pkl', 'wb') as f:
        pickle.dump(out, f)


    out = {}
    idx = air_dates.index(year_start)
    diff_null = []
    graphs_null = [graphs_air[idx]]
    for i in tqdm(range(0,36)):
        G = graphs_null[i].copy()
        edges = list(G.edges())
        GI0,GI1 = graphs_air[idx+i],graphs_air[idx+i+1]
        GI0.add_nodes_from([n for n in GI1 if n not in GI0])
        GI1.add_nodes_from([n for n in GI0 if n not in GI1])
        real_removal = set(nx.difference(GI0,GI1).edges())
        added_edges = add_edges(graphs_air, G, idx+i)
        N_add = len(added_edges)
        for u,v in added_edges:
            G.add_edge(u, v, weight=graphs_air[idx+i+1][u][v]['weight'])
        N_remove = G.number_of_edges() - graphs_air[idx+i+1].number_of_edges()
        remove_edges = random.sample(edges, N_remove)
        diff_null.append(len(set(remove_edges)&real_removal)/N_remove)
        G.remove_edges_from(remove_edges)
        graphs_null.append(G.copy())
    null_diff = []
    for i in range(0,36):
        G = set(graphs_air[idx+i].edges())
        H = set(graphs_null[i].edges())
        btf_diff.append(len(G & H)/len(G))
    out[year_start]=(diff_null,null_diff)
    import pickle
    with open(f'data\\results\\'+ f'{str(year_start)}pred_36_null' +'.pkl', 'wb') as f:
        pickle.dump(out, f)

if __name__ == '__main__':
    data = pd.read_csv('data/networks/US_Air_2004_202.csv', sep=';')
    data.set_index(['YEAR', 'MONTH'], inplace=True)
    data = data[data.source != data.target]
    nodes = set(data.source) & set(data.target)
    data = data[data.weight!=0]
    year = list(data.index.get_level_values(0).unique())
    month = list(data.index.get_level_values(1).unique())
    graphs_air = []
    air_dates = []
    for y in year:
        for m in month:
            if y==2022 and m==10:
                break
            df = data.loc[y,m]
            air_dates.append(date(y,m,1))
            G = nx.from_pandas_edgelist(df, edge_attr=True)
            G.add_nodes_from(nodes)
            graphs_air.append(G)

    from joblib import Parallel, delayed
    import multiprocessing
    num_cores = multiprocessing.cpu_count()
    results = Parallel(n_jobs=num_cores)(delayed(main)(year_start) for year_start in air_dates[:-36])