# Installers



In [None]:
!pip install pandas numpy matplotlib statsmodels scipy pmdarima

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pmdarima
Successfully installed pmdarima-2.0.4


# Imports

In [None]:
# Visualization
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.patches import Circle, Patch
from matplotlib.lines import Line2D
from matplotlib.collections import PatchCollection
from matplotlib.colors import LinearSegmentedColormap
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns
from IPython.display import display

# Data handling
import pandas as pd
import pmdarima as pm
from pandas.io.json import json_normalize
import numpy as np
import time
from google.colab import drive
import re
from pathlib import Path

# Models
import statsmodels.api as sm
from scipy import stats
from scipy.stats import pearsonr, spearmanr
import statsmodels.tsa.stattools as smt
from pmdarima.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive; the CSV paths read and written below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


# Functions

Determines the significance level of a correlation value.

In [None]:
def significance_level(value, se):
    """
    Map a correlation value to a significance marker.

    The absolute value is compared against two-sided standard-normal
    critical values scaled by the standard error, strictest level first.

    Args:
    value (float): The correlation value.
    se (float): The standard error.

    Returns:
    str: Significance level notation: "***" (99% confidence),
    "**" (95% confidence), "*" (90% confidence) or "" when the value
    does not exceed any threshold.
    """
    magnitude = abs(value)
    # Two-sided z critical values. The 99% cut-off is 2.576; a multiplier of 2
    # would label values barely beyond the 95% bound (1.96) as "***".
    thresholds = (
        (2.576, "***"),  # 99% confidence
        (1.96, "**"),    # 95% confidence
        (1.645, "*"),    # 90% confidence
    )
    for z_critical, marker in thresholds:
        if magnitude > z_critical * se:
            return marker
    return ""

Computes the cross-correlation between two time series for lags 0 to `max_lag` and marks the significance of each coefficient.

In [None]:
def compute_correlations(series1, series2, max_lag=12):
    """
    Cross-correlate two time series for lags 0..max_lag and flag significance.

    The series are inner-joined on their index, so only index values present
    in both are used. Infinite and missing values are replaced with 0 before
    the cross-correlation is computed.

    Args:
    series1 (pd.Series): The first time series.
    series2 (pd.Series): The second time series.
    max_lag (int): The maximum lag for cross-correlation analysis.

    Returns:
    pd.DataFrame: One row per lag (index "Lag 0", "Lag 1", ...) and a single
    string column 'Cross-Correlation' holding the coefficient rounded to
    5 decimals, followed by a space and the significance marker returned by
    significance_level (empty when not significant). If the aligned series
    yields fewer than max_lag + 1 coefficients, only the available lags are
    returned.

    Raises:
    ValueError: If the two series have no index values in common.
    """
    # Merge and align the series on their shared index values
    aligned_df = pd.merge(series1, series2, left_index=True, right_index=True, how='inner')
    if aligned_df.empty:
        raise ValueError("series1 and series2 have no index values in common; nothing to correlate.")
    # Non-finite and missing observations are treated as 0 (returns a new frame, no in-place mutation)
    aligned_df = aligned_df.replace([np.inf, -np.inf, np.nan], 0)

    # Cross-Correlation: element k is the coefficient at lag k
    # (per the statsmodels ccf docs, series1 shifted k steps ahead of series2)
    ccf_values = sm.tsa.stattools.ccf(aligned_df.iloc[:, 0], aligned_df.iloc[:, 1], adjusted=True)[:max_lag + 1]
    # Approximate standard error 1/sqrt(n), with n the number of aligned observations
    se = 1 / np.sqrt(len(aligned_df))
    ccf_significance = [significance_level(corr, se) for corr in ccf_values]

    # Label only the lags that were actually produced, so a short overlap
    # cannot cause a length mismatch between the values and the index.
    lag_labels = [f"Lag {i}" for i in range(len(ccf_values))]

    # Create DataFrame
    correlation_df = pd.DataFrame(
        {
            'Cross-Correlation': ccf_values,
            'Significance': ccf_significance,
        },
        index=lag_labels,
    )
    # Fold the marker into the value column; the separating space is kept even
    # when the marker is empty so the string format stays the same for callers.
    correlation_df['Cross-Correlation'] = (
        correlation_df['Cross-Correlation'].round(5).astype(str) + ' ' + correlation_df['Significance']
    )
    return correlation_df.drop(['Significance'], axis=1)

## Load Data

Mortality Residuals

In [None]:
# Mortality residuals: pipe-delimited CSV; the first column becomes the index and is parsed as dates.
mortality_residuals = pd.read_csv(
    '/content/drive/MyDrive/Diversa/PUCE/Mortality_residuals.csv',
    sep='|',
    index_col=0,
    parse_dates=True,
)
mortality_residuals.head()

Unnamed: 0_level_0,Mortality,Mortality_female,Mortality_male,Mortality_teens,Mortality_young_adults,Mortality_adults,Mortality_seniors,Mortality_amazon,Mortality_coastal,Mortality_highlands
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2011-01-31,94.0,9.213094,66.0,8.43096,47.0,20.0,3.0,-0.767161,34.0,55.0
2011-02-28,-21.989196,-2.206936,-11.99551,-2.609099,-12.997137,1.000548,2.3e-05,2.348104,-7.998619,-16.997262
2011-03-31,-3.202423,6.902525,-8.541581,-0.17959,2.602775,-8.42117,3.000011,4.02198,3.074348,-9.834973
2011-04-30,-41.893321,-7.871678,-30.016394,-5.895224,-28.863339,-4.044618,-1.061441,-6.450727,-18.012415,-15.756482
2011-05-31,10.217268,0.715238,8.971134,3.993628,-1.224284,-1.166328,4.23747,2.387704,-5.948689,12.053678


Google Trends Residuals

In [None]:
# Google Trends residuals: pipe-delimited CSV; the first column becomes the index and is parsed as dates.
trends_residuals = pd.read_csv(
    '/content/drive/MyDrive/Diversa/PUCE/Google Trends Data/GoogleTrends_residuals.csv',
    sep='|',
    index_col=0,
    parse_dates=True,
)
trends_residuals.head()

Unnamed: 0_level_0,Digital,Stress,Trauma,General,Mood,Minorities,Disorders,Prevention,Seeking
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2011-01-31,2.038835,30.373748,9.227926,43.0,23.064474,14.359522,26.015596,27.093407,17.99726
2011-02-28,-2.038828,-4.302977,-1.439408,20.003253,7.931973,-7.443862,-4.160258,3.517626,-5.49573
2011-03-31,3.25306,-6.535828,-1.846751,-0.186903,-1.352257,-4.048088,0.88116,-12.471474,3.787847
2011-04-30,-3.366732,1.715804,-0.154434,-4.120744,-3.113637,6.903609,-3.012997,-0.008627,-7.998035
2011-05-31,-0.149571,-3.229735,1.989975,6.040702,1.29401,6.376044,3.406526,-4.396217,-0.781216


# Correlations

Loop over all 90 pairs of residual series (10 mortality series × 9 Google Trends series) and compute their cross-correlations. Each result is stored as a DataFrame in a dictionary keyed by `<mortality column>X<trend column>`.

In [None]:
# Every (mortality column, trend column) pair gets its own cross-correlation table.
mortality_columns = mortality_residuals.columns
trends_columns = trends_residuals.columns
correlation_results = {}  # maps '<mortality>X<trend>' -> cross-correlation DataFrame

for mortality_col in mortality_columns:
    # Look the mortality series up once per outer iteration
    mortality_series = mortality_residuals[mortality_col]
    for trend_col in trends_columns:
        # Key under which this pair's result is stored
        result_name = f'{mortality_col}X{trend_col}'

        # Compute the correlations and blank out the value column's name
        result_df = compute_correlations(
            mortality_series, trends_residuals[trend_col]
        ).rename(columns={"Cross-Correlation": ''})
        correlation_results[result_name] = result_df

        # Print progress
        print(f"Completed correlation: {result_name}")

Completed correlation: MortalityXDigital
Completed correlation: MortalityXStress
Completed correlation: MortalityXTrauma
Completed correlation: MortalityXGeneral
Completed correlation: MortalityXMood
Completed correlation: MortalityXMinorities
Completed correlation: MortalityXDisorders
Completed correlation: MortalityXPrevention
Completed correlation: MortalityXSeeking
Completed correlation: Mortality_femaleXDigital
Completed correlation: Mortality_femaleXStress
Completed correlation: Mortality_femaleXTrauma
Completed correlation: Mortality_femaleXGeneral
Completed correlation: Mortality_femaleXMood
Completed correlation: Mortality_femaleXMinorities
Completed correlation: Mortality_femaleXDisorders
Completed correlation: Mortality_femaleXPrevention
Completed correlation: Mortality_femaleXSeeking
Completed correlation: Mortality_maleXDigital
Completed correlation: Mortality_maleXStress
Completed correlation: Mortality_maleXTrauma
Completed correlation: Mortality_maleXGeneral
Completed c

Results

In [None]:
# Concatenate all DataFrames in the dictionary to form a wide DataFrame.
# Passing a dict makes pd.concat build two-level columns: (dictionary key, original column name).
results = pd.concat(correlation_results, axis=1)
# Column names: take the outer level (the '<mortality>X<trend>' keys) straight from the
# concatenated frame instead of a hand-typed list, so every label is guaranteed to match
# its column even if the input columns change or are reordered.
names = results.columns.get_level_values(0).tolist()
results.columns = names
# Display the wide DataFrame (13 rows covers lags 0-12, the default max_lag)
results.head(13)

Unnamed: 0,MortalityXDigital,MortalityXStress,MortalityXTrauma,MortalityXGeneral,MortalityXMood,MortalityXMinorities,MortalityXDisorders,MortalityXPrevention,MortalityXSeeking,Mortality_femaleXDigital,...,Mortality_coastalXSeeking,Mortality_highlandsXDigital,Mortality_highlandsXStress,Mortality_highlandsXTrauma,Mortality_highlandsXGeneral,Mortality_highlandsXMood,Mortality_highlandsXMinorities,Mortality_highlandsXDisorders,Mortality_highlandsXPrevention,Mortality_highlandsXSeeking
Lag 0,-0.01018,0.54878 ***,0.3034 ***,0.27392 ***,0.34234 ***,0.19133 ***,0.43526 ***,0.43866 ***,0.3287 ***,0.00293,...,0.17815 ***,-0.01346,0.52964 ***,0.25018 ***,0.189 ***,0.24314 ***,0.21374 ***,0.34868 ***,0.39012 ***,0.34511 ***
Lag 1,-0.08068,-0.09573,-0.12138,0.01415,-0.09349,0.02226,-0.09228,-0.01089,0.04209,-0.09328,...,0.10201,-0.0165,-0.08343,-0.0934,0.02159,-0.18926 ***,0.02678,-0.11728,-0.03394,-0.00063
Lag 2,0.05243,0.10607,0.00053,-0.0436,-0.09471,0.06291,-0.01686,-0.05248,-0.04387,-0.02725,...,0.03173,-0.01404,0.00358,-0.03207,-0.03111,-0.165 *,0.03825,-0.0765,-0.10012,-0.08605
Lag 3,-0.01175,-0.13642,-0.11002,-0.19196 ***,-0.08823,-0.06472,-0.19082 ***,-0.20334 ***,-0.15031 *,-0.0317,...,-0.05416,0.00897,-0.06377,-0.02943,-0.13354,0.04028,-0.02631,-0.07943,-0.10679,-0.16105 *
Lag 4,-0.03095,0.09146,0.04497,-0.00084,0.09537,-0.13185,0.08509,0.00157,-0.03842,-0.08614,...,-0.03718,-0.02775,0.09415,0.04422,0.08215,0.13202,-0.09323,0.10406,0.09841,-0.06276
Lag 5,0.06033,0.10709,-0.07916,0.06397,-0.02858,-0.07331,0.03347,0.01801,0.04889,0.16155 *,...,-0.02811,0.04796,0.15417 *,-0.09959,0.00998,-0.03939,-0.03605,0.03555,0.02558,0.05061
Lag 6,0.01375,-0.04191,-0.03271,-0.1136,-0.05554,0.00429,-0.00664,0.01321,-0.0585,0.0738,...,-0.05554,-0.05666,-0.09067,-0.06258,-0.17512 ***,-0.08573,0.05085,0.00423,-0.03915,-0.03306
Lag 7,-0.15711 *,-0.037,0.01905,-0.07108,-0.10348,-0.00611,-0.12587,0.00349,0.02516,-0.16893 *,...,0.07191,-0.11137,-0.05069,0.08619,-0.12256,-0.07464,0.03185,-0.12678,-0.0225,-0.00707
Lag 8,-0.00154,0.02603,-0.01226,0.0527,0.01458,0.12319,0.0644,0.06028,0.01519,0.03308,...,0.05854,0.01387,0.05362,-0.06409,0.01792,-0.01345,0.09574,0.06106,0.02428,0.02351
Lag 9,-0.00295,0.04793,-0.0596,0.05487,-0.00153,0.00488,0.09536,0.04308,-0.13127,-0.03686,...,-0.07645,-0.00774,-0.00415,-0.06396,0.06072,-0.02764,0.03029,0.1337,0.00156,-0.0955


In [None]:
# Export the DataFrame to a CSV file delimited by pipes '|'
results.to_csv('/content/drive/MyDrive/Diversa/PUCE/Correlation_results.csv', sep='|', index=True)