Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.stattools import grangercausalitytests
import networkx as nx
import matplotlib.pyplot as plt

# !pip install PyIF
# !pip install nose 
# !pip install numba
from PyIF import te_compute as te
from tabulate import tabulate

Base functions

In [None]:
def format_COVID_DataBase(_db, _group):
    """Keep the rows of one age group and drop the descriptive columns.

    Parameters
    ----------
    _db : pandas.DataFrame
        Table containing at least the columns ``demographic_category``,
        ``demographic_value``, ``percent_of_ca_population`` and ``report_date``.
    _group : str
        Value of ``demographic_value`` to keep (for example ``'0-17'``).

    Returns
    -------
    pandas.DataFrame
        New frame with the rows whose ``demographic_category`` is
        ``'Age Group'`` and whose ``demographic_value`` equals ``_group``,
        without the four columns listed above. Original index labels are kept.
    """
    # Row selection: both conditions must hold.
    is_age_row = _db['demographic_category'] == 'Age Group'
    is_requested_group = _db['demographic_value'] == _group
    selected_rows = _db[is_age_row & is_requested_group]

    # These columns only describe the slice and are not part of the series.
    descriptive_columns = [
        'demographic_category',
        'demographic_value',
        'percent_of_ca_population',
        'report_date',
    ]
    return selected_rows.drop(descriptive_columns, axis=1)

Create Visual Graph

In [None]:
def generate_graph_from_dataframe(data):
  """Build a directed graph from a table of truthy/falsy adjacency flags.

  Parameters
  ----------
  data : pandas.DataFrame
      Two-dimensional table; a truthy cell at position (row, column) produces
      the edge ``row -> column``. Positions, not labels, are used as node ids.

  Returns
  -------
  networkx.DiGraph
      Graph holding one edge per truthy cell. Nodes that take part in no edge
      are not added.
  """
  graph = nx.DiGraph()

  # Iterate over the actual contents of every row so that the column bound is
  # the real column count (the row count was previously used for both axes,
  # which broke on non-square tables).
  for row, row_values in enumerate(data.values):
    for column, flag in enumerate(row_values):
      if flag:
        graph.add_edge(row, column)
  return graph

def generate_graph_from_matrix(matrix, threshold=0.01):
    """Threshold a matrix of p-values and draw the resulting directed graph.

    Parameters
    ----------
    matrix : array-like
        Two-dimensional collection of p-values; anything accepted by
        ``pandas.DataFrame``.
    threshold : float, optional
        A cell strictly below this value becomes an edge. Defaults to 0.01,
        the value that used to be hard-coded.

    Returns
    -------
    None
        The graph is drawn on the current matplotlib axes with a circular
        layout; nothing is returned.
    """
    gc_pvalues = pd.DataFrame(matrix)

    # Vectorised comparison (1 where p < threshold, else 0); replaces the
    # element-wise DataFrame.applymap, which is deprecated in recent pandas.
    gc_boolean = (gc_pvalues < threshold).astype(int)

    gen_graph = generate_graph_from_dataframe(gc_boolean)
    nx.draw_circular(gen_graph, with_labels=True)

In [None]:
def generate_and_plot_graph_from_matrix(_matrix, _threshold, _title, _save=False, _filename='graph.png'):
  """Draw a weighted directed graph from a square-like score matrix.

  An edge ``column -> row`` is created for every off-diagonal cell
  ``(row, column)`` whose value is strictly greater than ``_threshold``.

  Parameters
  ----------
  _matrix : array-like
      Two-dimensional scores (NumPy array, DataFrame or nested lists).
  _threshold : float
      Cells must exceed this value to become an edge.
  _title : str
      Title of the figure.
  _save : bool, optional
      When true the figure is written to ``_filename`` as a PNG.
  _filename : str, optional
      Output path used when ``_save`` is true.

  Returns
  -------
  networkx.DiGraph
      The graph that was drawn; each edge carries a ``weight`` attribute
      holding the cell value rounded to three decimals.
  """
  # Normalise the input once. Indexing a DataFrame as _matrix[row][column]
  # selects *column* `row` first and therefore reads the transposed cell,
  # which reversed every edge; an ndarray is unambiguous and also lets plain
  # nested lists be passed.
  values = np.asarray(_matrix)

  plt.figure(figsize=(10,8))
  _graph = nx.DiGraph()

  n_rows, n_columns = values.shape[0], values.shape[1]
  for row in range(n_rows):
    for column in range(n_columns):
      if row != column and values[row, column] > _threshold:
        _graph.add_edge(column, row, weight=round(float(values[row, column]), 3))

  # The layout is computed after the edges exist so every node gets a position.
  pos = nx.circular_layout(_graph)
  nx.draw(_graph, pos, with_labels=True, font_weight='bold')
  edge_weight = nx.get_edge_attributes(_graph,'weight')
  nx.draw_networkx_edge_labels(_graph, pos, label_pos=0.2, edge_labels=edge_weight)
  plt.title(_title)
  if _save:
    plt.savefig(_filename, format="png", dpi=300, bbox_inches='tight')
  plt.show()

  return _graph

Granger Causality

In [None]:
#find the minimum pvalue in Granger Causality test result dictionary
def find_min_pvalue(gc_result_dict):
  """Return the smallest p-value found in a Granger-causality result dictionary.

  Parameters
  ----------
  gc_result_dict : dict
      Maps each lag to a sequence whose first item is a dict
      ``{test_name: result_tuple}``; the p-value is read from position 1 of
      each ``result_tuple``. Entries named ``"params_ftest"`` are ignored.

  Returns
  -------
  float
      The minimum p-value over all lags and considered tests, capped at 1.0
      (1.0 is returned when nothing smaller is found).
  """
  considered_pvalues = (
    test_result[1]
    for lag_result in gc_result_dict.values()
    for test_key, test_result in lag_result[0].items()
    if test_key != "params_ftest"
  )
  # 1.0 goes first so that min() behaves like "start at 1.0 and replace only
  # on a strictly smaller value" (this also keeps NaN entries from winning).
  return min([1.0, *considered_pvalues])


def create_granger_matrix(dataset, max_lags=3):
  """Compute a matrix of Granger-causality p-values between all column pairs.

  Parameters
  ----------
  dataset : pandas.DataFrame
      One time series per column; columns are addressed by position.
  max_lags : int, optional
      Passed to ``grangercausalitytests`` as its ``maxlag`` argument.

  Returns
  -------
  list[list[float]]
      ``matrix[i][j]`` is the smallest p-value (see ``find_min_pvalue``) of
      the test "column j Granger-causes column i". Diagonal entries are
      computed too, by testing a column against itself.
  """
  noColumns = len(dataset.columns)
  matrix = []
  for i in range(noColumns):
    row = []
    for j in range(noColumns):
      # statsmodels tests whether the SECOND column Granger-causes the FIRST
      # one, so the effect (i) must come first and the candidate cause (j)
      # second. The previous [j, i] order tested the opposite direction.
      # verbose=False keeps the function from printing every test result.
      gc_ji_result = grangercausalitytests(dataset.iloc[:, [i, j]].values, max_lags, verbose=False)
      row.append(find_min_pvalue(gc_ji_result))
    matrix.append(row)
  return matrix

Transfer Entropy

In [None]:
def create_transfer_entropy_matrix(dataset):
  """Compute the pairwise transfer-entropy estimates between all columns.

  Parameters
  ----------
  dataset : pandas.DataFrame
      One time series per column; columns are addressed by position.

  Returns
  -------
  list[list]
      ``matrix[i][j]`` holds ``te.te_compute(column j, column i)`` with the
      library's default settings. Diagonal entries are computed as well.
      NOTE(review): which argument te_compute treats as the source of the
      information flow is not visible here -- confirm against PyIF.
  """
  positions = range(len(dataset.columns))
  return [
    [
      te.te_compute(dataset.iloc[:, col_pos].to_numpy(), dataset.iloc[:, row_pos].to_numpy())
      for col_pos in positions
    ]
    for row_pos in positions
  ]

Machine Learning Method

In [None]:
def format_dataBase(_base, _causal, _lag):
    """Turn two aligned series into a lagged regression design matrix.

    Parameters
    ----------
    _base : pandas.Series
        Series to be predicted.
    _causal : pandas.Series
        Candidate explanatory series, read at the same positions as ``_base``.
    _lag : int
        Number of past observations used from each series.

    Returns
    -------
    tuple[numpy.ndarray, numpy.ndarray]
        ``(X, y)`` where ``X`` has shape ``(len(_base) - _lag, 2 * _lag)``:
        row ``t`` holds ``_base[t:t+_lag]`` followed by ``_causal[t:t+_lag]``.
        ``y[t]`` is ``_base[t + _lag]``, the value right after that window.
    """
    sample_count = _base.shape[0] - _lag
    features = np.zeros((sample_count, 2 * _lag))
    targets = np.zeros(sample_count)

    for start in range(sample_count):
        stop = start + _lag
        # Left half: own history; right half: history of the other series.
        features[start, :_lag] = _base.iloc[start:stop]
        features[start, _lag:] = _causal.iloc[start:stop]
        # Target is the observation immediately following the window.
        targets[start] = _base.iloc[stop]

    return features, targets


# Code source: Jaques Grobler
# License: BSD 3 clause
def run_linear_regression_model(_X, _Y):
    """Fit a linear regression on ``(_X, _Y)`` and return its residuals.

    The model is fitted and evaluated on the same data, so the result is the
    in-sample error; no separate test set is involved.

    Parameters
    ----------
    _X : array-like
        Feature matrix, one row per sample.
    _Y : array-like
        Target values, one per row of ``_X``.

    Returns
    -------
    Same type as ``_Y - prediction``
        Element-wise difference between ``_Y`` and the fitted predictions.
    """
    fitted_model = linear_model.LinearRegression().fit(_X, _Y)
    return _Y - fitted_model.predict(_X)



def causality_regression_model(dataset, _lag = 3):
    """Score pairwise influence by comparing two lagged linear regressions.

    For every target column a baseline model predicts the series from its own
    ``_lag`` past values (the slots of the second series are filled with
    zeros). For every other column a second model additionally receives that
    column's past values. The score is ``log(var(baseline residuals) /
    var(augmented residuals))``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One time series per column. Columns are addressed by position, so any
        column labels are accepted.
    _lag : int, optional
        Number of past observations given to the regressions.

    Returns
    -------
    numpy.ndarray
        Square matrix where ``matrix[b][c]`` is the score of column ``c``
        helping to predict column ``b``. The diagonal is left at 0.
    """
    n_series = dataset.shape[1]
    matrix = np.zeros((n_series, n_series))

    for bSeries in range(n_series):
        # Positional access: dataset[bSeries] would look up a column *label*
        # and only works when the labels happen to be 0..n-1.
        target_series = dataset.iloc[:, bSeries]

        # Baseline: same feature layout, but the second half carries no information.
        new_db, _y = format_dataBase(target_series, target_series * 0, _lag)
        base_error = run_linear_regression_model(_X = new_db, _Y = _y)
        base_variance = np.var(base_error)

        for cSeries in range(n_series):
            # A series is not tested against itself.
            if bSeries == cSeries:
                continue

            # Augmented model: own history plus the history of the other series.
            new_db, _y = format_dataBase(target_series, dataset.iloc[:, cSeries], _lag)
            causal_error = run_linear_regression_model(_X = new_db, _Y = _y)

            # Log-ratio of residual variances: larger means the extra series helped more.
            matrix[bSeries][cSeries] = float(np.log(base_variance / np.var(causal_error)))

    return matrix

Creating Table for comparison between causal results

In [None]:
def create_comparison_database(_matrix_gran, _matrix_te, _matrix_ml):
    """Render the three result matrices side by side as a text table.

    Parameters
    ----------
    _matrix_gran, _matrix_te, _matrix_ml : indexable as ``m[i][j]``
        Granger, transfer-entropy and regression results. The shape of
        ``_matrix_gran`` drives the iteration; the other two are read at the
        same positions.

    Returns
    -------
    str
        A ``fancy_grid`` table produced by ``tabulate`` with one line per
        ``(i, j)`` pair, labelled ``"i - j"``, and every value rounded to five
        decimals.
    """
    # With headers='firstrow' the first dict supplies the column titles.
    header_row = {
        'index': 'index',
        'Granger': 'Granger',
        'Transfer Entropy': 'Transfer Entropy',
        'ML Regression': 'ML Regression',
    }
    value_rows = [
        {
            'index': f'{i} - {j}',
            'Granger': round(_matrix_gran[i][j], 5),
            'Transfer Entropy': round(_matrix_te[i][j], 5),
            'ML Regression': round(_matrix_ml[i][j], 5),
        }
        for i in range(len(_matrix_gran))
        for j in range(len(_matrix_gran[i]))
    ]
    return tabulate([header_row, *value_rows], headers='firstrow', tablefmt='fancy_grid')
            

Main Program

In [None]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
in_db = pd.read_csv('demographics.csv')     # input table
age_group = ['0-17', '18-49', '50-64', '65+']
AG_index = 0                                # position in age_group (0-3)
save_figs = False                           # set to True to save the figures

# ---------------------------------------------------------------------------
# Data preparation
# ---------------------------------------------------------------------------
# Keep only the selected age group and the numeric series.
formated_db = format_COVID_DataBase(in_db, age_group[AG_index])

# Rescale every column to [-1, 1]; from here on formated_db is a NumPy array.
scaler = MinMaxScaler(feature_range=(-1, 1))
formated_db = scaler.fit_transform(formated_db)

# All three methods read the same scaled table, wrapped once as a DataFrame.
scaled_frame = pd.DataFrame(formated_db)

# ---------------------------------------------------------------------------
# Causality matrices
# ---------------------------------------------------------------------------
# Optional plots (uncomment to draw each matrix as a graph):
# gen_graph_causality = generate_and_plot_graph_from_matrix(np.asarray(theMatrix_gran), 0, f'Granger Causality, age group ({age_group[AG_index]})',
#                                                           save_figs, _filename=f'graphs/causality_graphs/{age_group[AG_index]}_granger.png')
# gen_graph_te = generate_and_plot_graph_from_matrix(np.asarray(theMatrix_te), 0, f'Transfer Entropy, age group ({age_group[AG_index]})',
#                                                    save_figs, _filename=f'graphs/causality_graphs/{age_group[AG_index]}_te.png')
# gen_graph_ml = generate_and_plot_graph_from_matrix(np.asarray(theMatrix_ml), 0, f'ML Model, age group ({age_group[AG_index]})',
#                                                    save_figs, _filename=f'graphs/causality_graphs/{age_group[AG_index]}_ml.png')

# Granger causality
theMatrix_gran = create_granger_matrix(scaled_frame, max_lags=3)

# Transfer entropy
theMatrix_te = create_transfer_entropy_matrix(scaled_frame)

# Regression-based score
theMatrix_ml = causality_regression_model(scaled_frame)

# ---------------------------------------------------------------------------
# Comparison table: print it and store it next to the notebook
# ---------------------------------------------------------------------------
comparison_db = create_comparison_database(theMatrix_gran, theMatrix_te, theMatrix_ml)
print(comparison_db)
with open(f'{age_group[AG_index]}_comparison.txt', 'w', encoding="utf-8") as f:
  f.write(comparison_db)