# Appendix: DATA70302 Assignment 2

## Topological Data Analysis

In this assignment, we will conduct a Topological Data Analysis (TDA) of Google Trends data, using *pytrends* and *gudhi* libraries in Python. By selecting 5 keywords related to political issues and elections in the United Kingdom, weekly time series data from the last 5 years will be collected. 

In [1]:
# Libraries
import pandas as pd
from functools import reduce
import math
import numpy as np
from pytrends.request import TrendReq
import gudhi as gd
import gudhi.wasserstein
import gudhi.hera
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from scipy.stats import skew, kurtosis
from scipy.stats import spearmanr

In [2]:
def is_finite_tuple(tpl):
    return all(math.isfinite(item) for item in tpl)

In [3]:
# Extract data
# pytrends = TrendReq(hl='en-GB', tz=0, retries=10)
# timeframe = 'today 5-y'   # Time frame 

### Fuel prices
# keyword = ['fuel prices'] # Search term

# pytrends.build_payload(kw_list=keyword,
#                       cat=0, # Category 
#                       timeframe=timeframe,
#                       geo='GB-ENG', # Geographic location
#                       gprop='') # Google Search Property

# interest_over_time_df = pytrends.interest_over_time() # Store the interest rate

# fuelp = interest_over_time_df.reset_index() 
# fuelp.drop('isPartial', axis=1, inplace= True)


### Petrol prices
# keyword = ['petrol prices'] # Search term

# pytrends.build_payload(kw_list=keyword,
#                       cat=0, # Category 
#                       timeframe=timeframe,
#                       geo='GB-ENG', # Geographic location
#                       gprop='') # Google Search Property

# interest_over_time_df = pytrends.interest_over_time() # Store the interest rate

# petrolp = interest_over_time_df.reset_index() 
# petrolp.drop('isPartial', axis=1, inplace= True)


### Diesel prices
# keyword = ['diesel prices'] # Search term

# pytrends.build_payload(kw_list=keyword,
#                       cat=0, # Category 
#                       timeframe=timeframe,
#                       geo='GB-ENG', # Geographic location
#                       gprop='') # Google Search Property

# interest_over_time_df = pytrends.interest_over_time() # Store the interest rate

# dieselp = interest_over_time_df.reset_index() 
# dieselp.drop('isPartial', axis=1, inplace= True)


### Job seekers
# keyword = ['job seekers'] # Search term

# pytrends.build_payload(kw_list=keyword,
#                       cat=0, # Category 
#                       timeframe=timeframe,
#                       geo='GB-ENG', # Geographic location
#                       gprop='') # Google Search Property

# interest_over_time_df = pytrends.interest_over_time() # Store the interest rate

# jobseek = interest_over_time_df.reset_index() 
# jobseek.drop('isPartial', axis=1, inplace= True)

### Job seekers allowance
# keyword = ['job seekers allowance'] # Search term

# pytrends.build_payload(kw_list=keyword,
#                        cat=0, # Category 
#                       timeframe=timeframe,
#                       geo='GB-ENG', # Geographic location
#                       gprop='') # Google Search Property

# interest_over_time_df = pytrends.interest_over_time() # Store the interest rate

# jobal = interest_over_time_df.reset_index() 
# jobal.drop('isPartial', axis=1, inplace= True)


### Pensions
# keyword = ['pensions'] # Search term

# pytrends.build_payload(kw_list=keyword,
#                       cat=0, # Category 
#                       timeframe=timeframe,
#                       geo='GB-ENG', # Geographic location
#                       gprop='') # Google Search Property

# interest_over_time_df = pytrends.interest_over_time() # Store the interest rate

# pensions = interest_over_time_df.reset_index() 
# pensions.drop('isPartial', axis=1, inplace= True)


### Export data:
# fuelp.to_csv("fuelp.csv",index=False)
# petrolp.to_csv("petrolp.csv",index=False)
# dieselp.to_csv("dieselp.csv",index=False)
# jobseek.to_csv("jobseekers.csv",index=False)
# jobal.to_csv("jobal.csv",index=False)
# pensions.to_csv("pensions.csv",index=False)

## 1. Import data

In [4]:
# Load datasets
fuelp = pd.read_csv("/Users/alexander/Documents/MSc Data Science/S2/Topological Data Analysis/A2/data/fuelp.csv")
petrolp = pd.read_csv("/Users/alexander/Documents/MSc Data Science/S2/Topological Data Analysis/A2/data/petrolp.csv")
dieselp = pd.read_csv("/Users/alexander/Documents/MSc Data Science/S2/Topological Data Analysis/A2/data/dieselp.csv")
jobseek = pd.read_csv("/Users/alexander/Documents/MSc Data Science/S2/Topological Data Analysis/A2/data/jobseekers.csv")
allowance = pd.read_csv("/Users/alexander/Documents/MSc Data Science/S2/Topological Data Analysis/A2/data/jobal.csv")
pension = pd.read_csv("/Users/alexander/Documents/MSc Data Science/S2/Topological Data Analysis/A2/data/pensions.csv")

# List of all DataFrames to merge
dfs = [fuelp, petrolp, dieselp, jobseek, allowance, pension]

# Date
for i in range(len(dfs)):
    dfs[i]['date'] = pd.to_datetime(dfs[i]['date'], format="%d/%m/%y").dt.strftime("%Y-%m-%d")

# Merge all on "date"
merged_df = reduce(lambda left, right: pd.merge(left, right, on="date", how="outer"), dfs)

# Date format
merged_df["date"] = pd.to_datetime(merged_df["date"])

# Preview result
print(merged_df.head())

        date  fuel prices  petrol prices  diesel prices  job seekers  \
0 2020-04-12            8              8              9           99   
1 2020-04-19           19             25             19          100   
2 2020-04-26            9              7              9           82   
3 2020-05-03            7              5              5           74   
4 2020-05-10           15             16             14           79   

   job seekers allowance  pensions  
0                     95        38  
1                     97        39  
2                     84        45  
3                     70        35  
4                     74        43  


## 2. Time Series

In [5]:
# Size:
plt.figure(figsize=(12, 6))

# Set consistent colours manually
colors = {
    "Prices": "#1f77b4",
    "Economy": "#ff7f0e",
    "Pensions": "#2ca02c"
}

# Plot each trend line with matching color and different line styles
sb.lineplot(x="date", y="fuel prices", data=merged_df, label="Fuel Prices", color=colors["Prices"], linestyle="-")
sb.lineplot(x="date", y="petrol prices", data=merged_df, label="Petrol Prices", color=colors["Prices"], linestyle="--")
sb.lineplot(x="date", y="diesel prices", data=merged_df, label="Diesel Prices", color=colors["Prices"], linestyle=":")
sb.lineplot(x="date", y="job seekers", data=merged_df, label="Job Seekers", color=colors["Economy"], linestyle="-")
sb.lineplot(x="date", y="job seekers allowance", data=merged_df, label="Job Seekers Allowance", color=colors["Economy"], linestyle="--")
sb.lineplot(x="date", y="pensions", data=merged_df, label="Pensions", color=colors["Pensions"], linestyle="-")

# Labels and formatting
plt.ylabel("Google Search Interest")
plt.xlabel("Date")

# Format x-axis: every 4 months
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=4))
plt.xticks(rotation=45)

plt.legend(title="Search Term")
plt.tight_layout()

# Save the figure to file
plt.savefig("search_trends.png", dpi=300)

# Close the figure so it's not displayed or stored in notebook memory
plt.close()

In [6]:
# Remove 'date' column for statistics
df_numeric = merged_df.drop(columns=["date"])

# Calculate descriptive statistics
desc_stats = df_numeric.describe(percentiles=[.25, .5, .75]).T

# Add skewness and kurtosis
desc_stats["skew"] = df_numeric.skew()
desc_stats["kurtosis"] = df_numeric.kurtosis()

# Rename columns
desc_stats = desc_stats.rename(columns={
    "mean": "Mean",
    "std": "SD",
    "min": "Min",
    "25%": "Q25",
    "50%": "Q50",
    "75%": "Q75",
    "max": "Max",
    "skew": "Skew",
    "kurtosis": "Kurt"
})

desc_stats.style.set_caption("Descriptive Statistics")

plt.close()

## 3. Lagged Cloud

In [7]:
# Extract and prepare fuel prices series
fuel_series = merged_df[['date', 'fuel prices']].copy().dropna().reset_index(drop=True)

# Create lagged features
for lag in range(1, 5):
    fuel_series[f'lag{lag}'] = fuel_series['fuel prices'].shift(lag)

# Drop rows with NaNs introduced by lagging
fuelP_lags = fuel_series.dropna().reset_index(drop=True)

# Lags
lags = [f"lag{l}" for l in range(1, 5)]

# Fig
fig, axes = plt.subplots(1, 4, figsize=(12, 5), sharey=True)

for i, lag in enumerate(lags):
    sb.scatterplot(
        x=lag,
        y="fuel prices",
        data=fuelP_lags,
        ax=axes[i],
        s=20
    )
    axes[i].set_xlabel(f"{lag.capitalize()}")
    axes[i].set_ylabel("Fuel Prices" if i == 0 else "")  # label only first plot for clarity
    axes[i].set_title("")  # no title

    # Compute and display correlation
    corr = np.corrcoef(fuelP_lags[lag], fuelP_lags["fuel prices"])[0, 1]
    axes[i].annotate(f"r = {corr:.2f}", xy=(0.5, -0.2), xycoords="axes fraction",
                     ha='center', fontsize=10)

plt.tight_layout()

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/fuel.png", dpi=300) 

plt.close()

In [8]:
# Extract and prepare petrol prices series
petrol_series = merged_df[['date', 'petrol prices']].copy().dropna().reset_index(drop=True)

# Create lagged features
for lag in range(1, 5):
    petrol_series[f'lag{lag}'] = petrol_series['petrol prices'].shift(lag)

# Drop rows with NaNs introduced by lagging
petrolP_lags = petrol_series.dropna().reset_index(drop=True)

# Fig
fig, axes = plt.subplots(1, 4, figsize=(12, 5), sharey=True)

for i, lag in enumerate(lags):
    sb.scatterplot(
        x=lag,
        y="petrol prices",
        data=petrolP_lags,
        ax=axes[i],
        s=20
    )
    axes[i].set_xlabel(f"{lag.capitalize()}")
    axes[i].set_ylabel("Petrol Prices" if i == 0 else "")
    axes[i].set_title("")

    # Compute and display correlation
    corr = np.corrcoef(petrolP_lags[lag], petrolP_lags["petrol prices"])[0, 1]
    axes[i].annotate(f"r = {corr:.2f}", xy=(0.5, -0.2), xycoords="axes fraction",
                     ha='center', fontsize=10)

plt.tight_layout()

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/petrol.png", dpi=300) 

plt.close()

In [9]:
# Extract and prepare diesel prices series
diesel_series = merged_df[['date', 'diesel prices']].copy().dropna().reset_index(drop=True)

# Create lagged features
for lag in range(1, 5):
    diesel_series[f'lag{lag}'] = diesel_series['diesel prices'].shift(lag)

# Drop rows with NaNs introduced by lagging
dieselP_lags = diesel_series.dropna().reset_index(drop=True)

# Define lag columns
lags = [f'lag{l}' for l in range(1, 5)]

# Create the plot
fig, axes = plt.subplots(1, 4, figsize=(12, 5), sharey=True)

for i, lag in enumerate(lags):
    sb.scatterplot(
        x=lag,
        y="diesel prices",
        data=dieselP_lags,
        ax=axes[i],
        s=20
    )
    axes[i].set_xlabel(f"{lag.capitalize()}")
    axes[i].set_ylabel("Diesel Prices" if i == 0 else "")
    axes[i].set_title("")

    # Compute and display correlation
    corr = np.corrcoef(dieselP_lags[lag], dieselP_lags["diesel prices"])[0, 1]
    axes[i].annotate(f"r = {corr:.2f}", xy=(0.5, -0.2), xycoords="axes fraction",
                     ha='center', fontsize=10)

plt.tight_layout()

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/diesel.png", dpi=300) 

plt.close()

In [10]:
# Extract and prepare job seekers series
jobseek_series = merged_df[['date', 'job seekers']].copy().dropna().reset_index(drop=True)

# Create lagged features
for lag in range(1, 5):
    jobseek_series[f'lag{lag}'] = jobseek_series['job seekers'].shift(lag)

# Drop rows with NaNs introduced by lagging
jobseek_lags = jobseek_series.dropna().reset_index(drop=True)

# Define lag columns
lags = [f'lag{l}' for l in range(1, 5)]

# Create the plot
fig, axes = plt.subplots(1, 4, figsize=(12, 5), sharey=True)

for i, lag in enumerate(lags):
    sb.scatterplot(
        x=lag,
        y="job seekers",
        data=jobseek_lags,
        ax=axes[i],
        s=20
    )
    axes[i].set_xlabel(f"{lag.capitalize()}")
    axes[i].set_ylabel("Job Seekers" if i == 0 else "")
    axes[i].set_title("")

    # Compute and display correlation
    corr = np.corrcoef(jobseek_lags[lag], jobseek_lags["job seekers"])[0, 1]
    axes[i].annotate(f"r = {corr:.2f}", xy=(0.5, -0.2), xycoords="axes fraction",
                     ha='center', fontsize=10)

plt.tight_layout()

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/job.png", dpi=300) 

plt.close()

In [11]:
# Extract and prepare job seekers allowance series
allowance_series = merged_df[['date', 'job seekers allowance']].copy().dropna().reset_index(drop=True)

# Create lagged features
for lag in range(1, 5):
    allowance_series[f'lag{lag}'] = allowance_series['job seekers allowance'].shift(lag)

# Drop rows with NaNs
allowance_lags = allowance_series.dropna().reset_index(drop=True)

# Define lag columns
lags = [f'lag{l}' for l in range(1, 5)]

# Create plot
fig, axes = plt.subplots(1, 4, figsize=(12, 5), sharey=True)

for i, lag in enumerate(lags):
    sb.scatterplot(
        x=lag,
        y="job seekers allowance",
        data=allowance_lags,
        ax=axes[i],
        s=20
    )
    axes[i].set_xlabel(f"{lag.capitalize()}")
    axes[i].set_ylabel("Job Seekers Allowance" if i == 0 else "")
    axes[i].set_title("")

    # Correlation
    corr = np.corrcoef(allowance_lags[lag], allowance_lags["job seekers allowance"])[0, 1]
    axes[i].annotate(f"r = {corr:.2f}", xy=(0.5, -0.2), xycoords="axes fraction",
                     ha='center', fontsize=10)

plt.tight_layout()

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/allowance.png", dpi=300) 

plt.close()

In [12]:
# Extract and prepare pensions series
pension_series = merged_df[['date', 'pensions']].copy().dropna().reset_index(drop=True)

# Create lagged features
for lag in range(1, 5):
    pension_series[f'lag{lag}'] = pension_series['pensions'].shift(lag)

# Drop rows with NaNs
pension_lags = pension_series.dropna().reset_index(drop=True)

# Define lag columns
lags = [f'lag{l}' for l in range(1, 5)]

# Create plot
fig, axes = plt.subplots(1, 4, figsize=(12, 5), sharey=True)

for i, lag in enumerate(lags):
    sb.scatterplot(
        x=lag,
        y="pensions",
        data=pension_lags,
        ax=axes[i],
        s=20
    )
    axes[i].set_xlabel(f"{lag.capitalize()}")
    axes[i].set_ylabel("Pensions" if i == 0 else "")
    axes[i].set_title("")

    # Correlation
    corr = np.corrcoef(pension_lags[lag], pension_lags["pensions"])[0, 1]
    axes[i].annotate(f"r = {corr:.2f}", xy=(0.5, -0.2), xycoords="axes fraction",
                     ha='center', fontsize=10)

plt.tight_layout()

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/pensions.png", dpi=300) 

plt.close()

## 4. Word clouds

In [13]:
# Select keyword columns and drop NaNs
keyword_cols = [
    "fuel prices",
    "petrol prices",
    "diesel prices",
    "job seekers",
    "job seekers allowance",
    "pensions"
]

keywords_df = merged_df[keyword_cols].dropna()

# Create a PairGrid
g = sb.PairGrid(keywords_df, diag_sharey=False)

# Lower triangle: scatterplots
g.map_lower(sb.scatterplot, s=15)

# Upper triangle: KDE distributions
g.map_upper(sb.kdeplot, fill=True)

# Diagonal: KDE plots
g.map_diag(sb.kdeplot, fill=True)

plt.tight_layout()

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/pc.png", dpi=300) 

plt.close()

In [14]:
# Select the six relevant columns
search_terms = [
    "fuel prices", "petrol prices", "diesel prices",
    "job seekers", "job seekers allowance", "pensions"
]

# Compute Pearson and Spearman correlation matrices
pearson_corr = merged_df[search_terms].corr(method="pearson")
spearman_corr = merged_df[search_terms].corr(method="spearman")

# Create a combined matrix with Pearson below and Spearman above the diagonal
combined_corr = pearson_corr.copy()

for i in range(len(search_terms)):
    for j in range(len(search_terms)):
        if i < j:
            combined_corr.iloc[i, j] = spearman_corr.iloc[i, j]
        elif i > j:
            combined_corr.iloc[i, j] = pearson_corr.iloc[i, j]
        else:
            combined_corr.iloc[i, j] = 1.0  # Diagonal remains 1.0

# Round for display
combined_corr = combined_corr.round(3)

# combined_corr.to_csv("cm.csv")

## 4. Persistence Homology

In [15]:
# Search terms
pts = merged_df[["fuel prices",
    "petrol prices",
    "diesel prices",
    "job seekers",
    "job seekers allowance",
    "pensions"]].to_numpy()

# Parameters
maximal_radius = 50
max_dim = 2

# Construct Vietoris-Rips complex
rips_cmplx = gd.RipsComplex( points= pts,max_edge_length = maximal_radius ) 

# Extract persistence
simplex_tree = rips_cmplx.create_simplex_tree( max_dimension = max_dim )
pers = simplex_tree.persistence()

In [16]:
# Dimensions 1:
pers1 = pd.DataFrame(simplex_tree.persistence_intervals_in_dimension(1))
pers1.columns = ['Birth','Death']
pers1.head()

Unnamed: 0,Birth,Death
0,3.316625,3.464102
1,3.162278,3.605551
2,3.316625,3.741657
3,3.464102,3.741657
4,3.162278,3.741657


In [17]:
# Dimension 0
pers0 = pd.DataFrame(simplex_tree.persistence_intervals_in_dimension(0))
pers0.columns = ['Birth','Death']
pers0.tail()

Unnamed: 0,Birth,Death
256,0.0,21.447611
257,0.0,21.517435
258,0.0,22.715633
259,0.0,inf
260,0.0,inf


In [18]:
# Plot persistence diagram and export with Garamond font
plt.figure(figsize=(13, 8))

gd.plot_persistence_diagram(pers)
plt.xlabel("Birth", fontname="serif", fontsize=10)
plt.ylabel("Death", fontname="serif", fontsize=10)
plt.title("Persistence Diagram", fontname="serif", fontsize=10)
plt.tight_layout()

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/persistence_diagram.jpeg", format="jpeg", dpi=300)

plt.close()



<Figure size 1300x800 with 0 Axes>

## 5. Time series of persistence norms

### A: Keyword plus 1 lag

In [19]:
# Extract only values of importance for 'fuel prices'
s = merged_df['fuel prices']

# Creating lags0 and lag1
embed2 = pd.concat([s, s.shift()], axis=1).dropna()
embed2.columns = ['Lag0','Lag1']

In [20]:
# Set radius and max dim:
maximal_radius = 50
max_dim = 2

# Window lenght
ww = 52

# Shape with single lag created before
r,c = embed2.shape
mydata2 = embed2.iloc[:,0:c]

# Define upper limit
r2 = r-ww

In [21]:
# Blank storage arrays
BMED = []
BMEDd = []
l1_norms_dim_0 = []
l2_norms_dim_0 = []
l1_norms_dim_1 = []
l2_norms_dim_1 = []
diag_dim_1 = []
diag_dim_0 = []

In [22]:
# Run the collection loop of persistence norms
for i in range(0,r2):
 a001 = i
 a002 = i+ww-1
 mydata3 = mydata2.iloc[a001:a002,:]
 mydata4 = mydata3.to_numpy()
 rips_cmplx = gd.RipsComplex(points = mydata4,max_edge_length = maximal_radius) 
 simplex_tree = rips_cmplx.create_simplex_tree( max_dimension = max_dim )
 pers = simplex_tree.persistence()
 pers = [tpl for tpl in pers if is_finite_tuple(tpl[1])]
 #Let us now compute the norms in dimension 1
 l1normDim1_local = 0
 l2normDim1_local = 0
 for ii in range(0,len(pers)):
  if (pers[ii][0] == 1):
   l1normDim1_local = l1normDim1_local+( pers[ii][1][1]-pers[ii][1][0] )
   l2normDim1_local = l2normDim1_local+( pers[ii][1][1]-pers[ii][1][0] )*( pers[ii][1][1]-pers[ii][1][0] )  
   diag_dim_1.append([a002,pers[ii][1][0],pers[ii][1][1]])
 l1_norms_dim_1.append(l1normDim1_local)
 l2_norms_dim_1.append(math.sqrt(l2normDim1_local))
 #Let us now compute the norms in dimension 0
 l1normDim0_local = 0
 l2normDim0_local = 0
 for ii in range(0,len(pers)):
  if (pers[ii][0] == 0):
   l1normDim0_local = l1normDim0_local+( pers[ii][1][1]-pers[ii][1][0] )
   l2normDim0_local = l2normDim0_local+( pers[ii][1][1]-pers[ii][1][0] )*( pers[ii][1][1]-pers[ii][1][0] )
   diag_dim_0.append([a002,pers[ii][1][0],pers[ii][1][1]])
 l1_norms_dim_0.append(l1normDim0_local)
 l2_norms_dim_0.append(math.sqrt(l2normDim0_local)) 
 BMED.append([a002,l1normDim0_local,l1normDim1_local,l2normDim0_local,l2normDim1_local ] )

In [23]:
# Into a dataframe
BMEDDFL2 = pd.DataFrame(BMED)
BMEDDFL2.columns = ['Obs','L10','L11','L20','L21']

# Diagrams for use in the Wasserstein distance
Adiag_dim_0L2 = diag_dim_0
Adiag_dim_1L2 = diag_dim_1

In [24]:
# Merge with original:
dff = merged_df.iloc[:, :2]

lag=1
dffaL2 = dff
dffaL2['Obs'] = dffaL2.index
dffaL2.loc[:, "Obs"] = dffaL2["Obs"].apply(lambda x: x - lag)

# Final merge
dffbL2 = dffaL2.merge(BMEDDFL2,on='Obs')

In [25]:
# Melt the data
df_long = dffbL2.melt(id_vars=['date'], value_vars=['L11', 'L21'], 
                      var_name='Type', value_name='Persistence Norm')

# Rename for legend
df_long['Type'] = df_long['Type'].replace({'L11': 'L1', 'L21': 'L2'})

# Plot
plt.figure(figsize=(10, 6))
sb.lineplot(x='date', y='Persistence Norm', hue='Type', data=df_long)
plt.ylabel("Persistence Norms")
plt.xlabel("Date")
plt.title("L1 and L2 Persistence Norms Over Time")
plt.xticks(rotation=45)
plt.tight_layout()

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/A.jpeg", format="jpeg", dpi=300)

plt.close()


### B: Keyword plus 4 lags

In [26]:
# Creating from lag0 to lag4
embed5 = pd.concat([s, s.shift(), s.shift(2), s.shift(3), s.shift(4)], axis=1).dropna()
embed5.columns = ['Lag0','Lag1', 'Lag2', 'Lag3', 'Lag4']

In [27]:
# Shape with single lag created before
r,c = embed5.shape
mydata2 = embed5.iloc[:,0:5]

# Define upper limit
r2 = r-ww

In [28]:
# Empty arrays
BMED = []
BMEDd = []
l1_norms_dim_0 = []
l2_norms_dim_0 = []
l1_norms_dim_1 = []
l2_norms_dim_1 = []
diag_dim_1 = []
diag_dim_0 = []

In [29]:
# Run the collection loop of persistence norms
for i in range(0,r2):
 a001 = i
 a002 = i+ww-1
 mydata3 = mydata2.iloc[a001:a002,:]
 mydata4 = mydata3.to_numpy()
 rips_cmplx = gd.RipsComplex(points = mydata4,max_edge_length = maximal_radius) 
 simplex_tree = rips_cmplx.create_simplex_tree( max_dimension = max_dim )
 pers = simplex_tree.persistence()
 pers = [tpl for tpl in pers if is_finite_tuple(tpl[1])]
 #Let us now compute the norms in dimension 1
 l1normDim1_local = 0
 l2normDim1_local = 0
 for ii in range(0,len(pers)):
  if (pers[ii][0] == 1):
   l1normDim1_local = l1normDim1_local+( pers[ii][1][1]-pers[ii][1][0] )
   l2normDim1_local = l2normDim1_local+( pers[ii][1][1]-pers[ii][1][0] )*( pers[ii][1][1]-pers[ii][1][0] )  
   diag_dim_1.append([a002,pers[ii][1][0],pers[ii][1][1]])
 l1_norms_dim_1.append(l1normDim1_local)
 l2_norms_dim_1.append(math.sqrt(l2normDim1_local))
 #Let us now compute the norms in dimension 0
 l1normDim0_local = 0
 l2normDim0_local = 0
 for ii in range(0,len(pers)):
  if (pers[ii][0] == 0):
   l1normDim0_local = l1normDim0_local+( pers[ii][1][1]-pers[ii][1][0] )
   l2normDim0_local = l2normDim0_local+( pers[ii][1][1]-pers[ii][1][0] )*( pers[ii][1][1]-pers[ii][1][0] )
   diag_dim_0.append([a002,pers[ii][1][0],pers[ii][1][1]])
 l1_norms_dim_0.append(l1normDim0_local)
 l2_norms_dim_0.append(math.sqrt(l2normDim0_local)) 
 BMED.append([a002,l1normDim0_local,l1normDim1_local,l2normDim0_local,l2normDim1_local ] )

In [30]:
# Into a dataframe
BMEDDFL2 = pd.DataFrame(BMED)
BMEDDFL2.columns = ['Obs','L10','L11','L20','L21']

# Diagrams for use in the Wasserstein distance
Bdiag_dim_0L2 = diag_dim_0
Bdiag_dim_1L2 = diag_dim_1

In [31]:
# Merge with original:
dff = merged_df.iloc[:, :7]

lag=1
dffaL2 = dff
dffaL2['Obs'] = dffaL2.index
dffaL2.loc[:, "Obs"] = dffaL2["Obs"].apply(lambda x: x - lag)

# Final merge
dffbL2 = dffaL2.merge(BMEDDFL2,on='Obs')

In [32]:
# Melt the data
df_long = dffbL2.melt(id_vars=['date'], value_vars=['L11', 'L21'], 
                      var_name='Type', value_name='Persistence Norm')

# Rename for legend
df_long['Type'] = df_long['Type'].replace({'L11': 'L1', 'L21': 'L2'})

# Plot
plt.figure(figsize=(10, 6))
sb.lineplot(x='date', y='Persistence Norm', hue='Type', data=df_long)
plt.ylabel("Persistence Norms")
plt.xlabel("Date")
plt.title("L1 and L2 Persistence Norms Over Time")
plt.xticks(rotation=45)
plt.tight_layout()

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/B.jpeg", format="jpeg", dpi=300)

plt.close()


### C: Keyword plus 1 related search

In [33]:
# First and second keyword
newdf2W2 = merged_df.iloc[:, :3]

In [34]:
# Parameters:
maximal_radius = 50
max_dim = 2

ww = 52

r,c = newdf2W2.shape
r2 = r-ww

In [35]:
# Without date
mydata2 = newdf2W2.iloc[:,1:c]

In [36]:
# Empty arrays
BMED = []
BMEDd = []
l1_norms_dim_0 = []
l2_norms_dim_0 = []
l1_norms_dim_1 = []
l2_norms_dim_1 = []
diag_dim_1 = []
diag_dim_0 = []

In [37]:
for i in range(0,r2):
 a001 = i
 a002 = i+ww-1
 mydata3 = mydata2.iloc[a001:a002,:]
 mydata4 = mydata3.to_numpy()
 rips_cmplx = gd.RipsComplex(points = mydata4,max_edge_length = maximal_radius) 
 simplex_tree = rips_cmplx.create_simplex_tree( max_dimension = max_dim )
 pers = simplex_tree.persistence()
 pers = [tpl for tpl in pers if is_finite_tuple(tpl[1])]
 #Let us now compute the norms in dimension 1
 l1normDim1_local = 0
 l2normDim1_local = 0
 for ii in range(0,len(pers)):
  if (pers[ii][0] == 1):
   l1normDim1_local = l1normDim1_local+( pers[ii][1][1]-pers[ii][1][0] )
   l2normDim1_local = l2normDim1_local+( pers[ii][1][1]-pers[ii][1][0] )*( pers[ii][1][1]-pers[ii][1][0] )  
   diag_dim_1.append([a002,pers[ii][1][0],pers[ii][1][1]])
 l1_norms_dim_1.append(l1normDim1_local)
 l2_norms_dim_1.append(math.sqrt(l2normDim1_local))
 #Let us now compute the norms in dimension 0
 l1normDim0_local = 0
 l2normDim0_local = 0
 for ii in range(0,len(pers)):
  if (pers[ii][0] == 0):
   l1normDim0_local = l1normDim0_local+( pers[ii][1][1]-pers[ii][1][0] )
   l2normDim0_local = l2normDim0_local+( pers[ii][1][1]-pers[ii][1][0] )*( pers[ii][1][1]-pers[ii][1][0] )
   diag_dim_0.append([a002,pers[ii][1][0],pers[ii][1][1]])
 l1_norms_dim_0.append(l1normDim0_local)
 l2_norms_dim_0.append(math.sqrt(l2normDim0_local)) 
 BMED.append([a002,l1normDim0_local,l1normDim1_local,l2normDim0_local,l2normDim1_local ] )

In [38]:
BMEDDF2W2 = pd.DataFrame(BMED)
BMEDDF2W2.columns = ['Obs','L10','L11','L20','L21']

In [39]:
Cdiag_dim_0W2 = diag_dim_0
Cdiag_dim_1W2 = diag_dim_1

In [40]:
newdf2aW2 = newdf2W2
newdf2aW2['Obs'] = newdf2aW2.index

In [41]:
newdf2bW2 = newdf2aW2.merge(BMEDDF2W2,on='Obs')

In [42]:
# Melt the data
df_long = newdf2bW2.melt(id_vars=['date'], value_vars=['L11', 'L21'], 
                      var_name='Type', value_name='Persistence Norm')

# Rename for legend
df_long['Type'] = df_long['Type'].replace({'L11': 'L1', 'L21': 'L2'})

# Plot
plt.figure(figsize=(10, 6))
sb.lineplot(x='date', y='Persistence Norm', hue='Type', data=df_long)
plt.ylabel("Persistence Norms")
plt.xlabel("Date")
plt.title("L1 and L2 Persistence Norms Over Time")
plt.xticks(rotation=45)
plt.tight_layout()

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/C.jpeg", format="jpeg", dpi=300)

plt.close()


In [43]:
# Filter to ensure we're summarising only rows with persistence norms
norms_df = newdf2bW2.dropna(subset=['L10', 'L11', 'L20', 'L21'])

# Function to compute summary stats for a column or list
def summarise_column(col):
    return {
        'Mean': np.mean(col),
        'SD': np.std(col),
        'Min': np.min(col),
        'Q25': np.percentile(col, 25),
        'Median': np.median(col),
        'Q75': np.percentile(col, 75),
        'Max': np.max(col),
        'Skew': skew(col),
        'Kurtosis': kurtosis(col)
    }

# Create summary table for each of the persistence norm columns
summary_data = {}
for col in ['L11', 'L21', 'L10', 'L20']:
    summary_data[col] = summarise_column(norms_df[col])

# === Now compute average STD and correlation across same 52-week windows ===
keywords = ['fuel prices', 'petrol prices']
X = newdf2bW2[keywords]
window_size = 52
avg_stds = []
avg_corrs = []

for i in range(len(X) - window_size):
    window = X.iloc[i:i + window_size]
    stds = window.std()
    corr_matrix = window.corr()
    upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    avg_stds.append(stds.mean())
    avg_corrs.append(upper_triangle.stack().mean())

# Add their summaries
summary_data['Avg_STD_terms'] = summarise_column(avg_stds)
summary_data['Avg_Correlation'] = summarise_column(avg_corrs)

# Convert to DataFrame
summary_df = pd.DataFrame(summary_data).T
summary_df.index.name = 'Metric'

# (Optional) Display in notebook or export
summary_df.style.set_caption("Summary Table")
# summary_df.to_csv("norms2.csv")

Unnamed: 0_level_0,Mean,SD,Min,Q25,Median,Q75,Max,Skew,Kurtosis
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
L11,2.410789,0.63398,1.242641,2.0,2.414214,2.656854,4.249213,0.66621,0.114358
L21,1.411712,0.997606,0.454602,0.857864,1.029437,1.230447,4.261165,1.722548,1.511572
L10,59.98882,47.929889,13.414214,22.990705,35.505046,105.464934,147.704915,0.750469,-1.126821
L20,329.595238,395.165418,14.0,31.25,60.0,853.5,1600.0,0.83954,-0.919261
Avg_STD_terms,7.814671,7.008766,1.071892,1.995032,2.547577,16.359632,18.556411,0.464994,-1.593036
Avg_Correlation,0.855378,0.120446,0.588196,0.774141,0.869024,0.973844,0.983724,-0.51088,-1.047241


### D: Keyword plus 5 related searches

In [44]:
# All data:
newdf2W2 = merged_df.iloc[:, :7]

In [45]:
# Parameters:
maximal_radius = 50
max_dim = 2

ww = 52

r,c = newdf2W2.shape
r2 = r-ww

In [46]:
# Without date
mydata2 = newdf2W2.iloc[:,1:c]

In [47]:
# Empty arrays
BMED = []
BMEDd = []
l1_norms_dim_0 = []
l2_norms_dim_0 = []
l1_norms_dim_1 = []
l2_norms_dim_1 = []
diag_dim_1 = []
diag_dim_0 = []

In [48]:
for i in range(0,r2):
 a001 = i
 a002 = i+ww-1
 mydata3 = mydata2.iloc[a001:a002,:]
 mydata4 = mydata3.to_numpy()
 rips_cmplx = gd.RipsComplex(points = mydata4,max_edge_length = maximal_radius) 
 simplex_tree = rips_cmplx.create_simplex_tree( max_dimension = max_dim )
 pers = simplex_tree.persistence()
 pers = [tpl for tpl in pers if is_finite_tuple(tpl[1])]
 #Let us now compute the norms in dimension 1
 l1normDim1_local = 0
 l2normDim1_local = 0
 for ii in range(0,len(pers)):
  if (pers[ii][0] == 1):
   l1normDim1_local = l1normDim1_local+( pers[ii][1][1]-pers[ii][1][0] )
   l2normDim1_local = l2normDim1_local+( pers[ii][1][1]-pers[ii][1][0] )*( pers[ii][1][1]-pers[ii][1][0] )  
   diag_dim_1.append([a002,pers[ii][1][0],pers[ii][1][1]])
 l1_norms_dim_1.append(l1normDim1_local)
 l2_norms_dim_1.append(math.sqrt(l2normDim1_local))
 #Let us now compute the norms in dimension 0
 l1normDim0_local = 0
 l2normDim0_local = 0
 for ii in range(0,len(pers)):
  if (pers[ii][0] == 0):
   l1normDim0_local = l1normDim0_local+( pers[ii][1][1]-pers[ii][1][0] )
   l2normDim0_local = l2normDim0_local+( pers[ii][1][1]-pers[ii][1][0] )*( pers[ii][1][1]-pers[ii][1][0] )
   diag_dim_0.append([a002,pers[ii][1][0],pers[ii][1][1]])
 l1_norms_dim_0.append(l1normDim0_local)
 l2_norms_dim_0.append(math.sqrt(l2normDim0_local)) 
 BMED.append([a002,l1normDim0_local,l1normDim1_local,l2normDim0_local,l2normDim1_local ] )

In [49]:
BMEDDF2W2 = pd.DataFrame(BMED)
BMEDDF2W2.columns = ['Obs','L10','L11','L20','L21']

In [50]:
Ddiag_dim_0W2 = diag_dim_0
Ddiag_dim_1W2 = diag_dim_1

In [51]:
newdf2aW2 = newdf2W2
newdf2aW2['Obs'] = newdf2aW2.index

In [52]:
newdf2bW2 = newdf2aW2.merge(BMEDDF2W2,on='Obs')

In [53]:
# Melt the data
df_long = newdf2bW2.melt(id_vars=['date'], value_vars=['L11', 'L21'], 
                      var_name='Type', value_name='Persistence Norm')

# Rename for legend
df_long['Type'] = df_long['Type'].replace({'L11': 'L1', 'L21': 'L2'})

# Plot
plt.figure(figsize=(10, 6))
sb.lineplot(x='date', y='Persistence Norm', hue='Type', data=df_long)
plt.ylabel("Persistence Norms")
plt.xlabel("Date")
plt.title("L1 and L2 Persistence Norms Over Time")
plt.xticks(rotation=45)
plt.tight_layout()

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/D.jpeg", format="jpeg", dpi=300)

plt.close()


In [54]:
# Filter to ensure we're summarising only rows with persistence norms
norms_df = newdf2bW2.dropna(subset=['L10', 'L11', 'L20', 'L21'])

# Function to compute summary stats for a column or list
def summarise_column(col):
    return {
        'Mean': np.mean(col),
        'SD': np.std(col),
        'Min': np.min(col),
        'Q25': np.percentile(col, 25),
        'Median': np.median(col),
        'Q75': np.percentile(col, 75),
        'Max': np.max(col),
        'Skew': skew(col),
        'Kurtosis': kurtosis(col)
    }

# Create summary table for each of the persistence norm columns
summary_data = {}
for col in ['L11', 'L21', 'L10', 'L20']:
    summary_data[col] = summarise_column(norms_df[col])

# === Now compute average STD and correlation across same 52-week windows ===
keywords = ['fuel prices', 'petrol prices', 'diesel prices',
            'job seekers', 'job seekers allowance', 'pensions']
X = newdf2bW2[keywords]
window_size = 52
avg_stds = []
avg_corrs = []

for i in range(len(X) - window_size):
    window = X.iloc[i:i + window_size]
    stds = window.std()
    corr_matrix = window.corr()
    upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    avg_stds.append(stds.mean())
    avg_corrs.append(upper_triangle.stack().mean())

# Add their summaries
summary_data['Avg_STD_terms'] = summarise_column(avg_stds)
summary_data['Avg_Correlation'] = summarise_column(avg_corrs)

# Convert to DataFrame
summary_df = pd.DataFrame(summary_data).T
summary_df.index.name = 'Metric'

# (Optional) Display in notebook or export
summary_df.style.set_caption("Summary Table")
# summary_df.to_csv("norms.csv")

Unnamed: 0_level_0,Mean,SD,Min,Q25,Median,Q75,Max,Skew,Kurtosis
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
L11,5.977416,3.461017,0.930889,3.687316,5.150029,7.229589,16.790225,1.029524,0.467835
L21,7.276438,6.797778,0.515092,2.59092,4.94399,9.152585,34.348299,1.612433,2.272168
L10,282.939072,80.138487,178.580006,220.596021,245.990413,351.071883,447.530105,0.740302,-0.895939
L20,2497.671429,1277.40733,947.0,1612.25,1933.0,3699.0,5414.0,0.705189,-0.985368
Avg_STD_terms,6.8509,3.339188,3.18864,3.81908,5.004469,10.938815,11.700111,0.390993,-1.642316
Avg_Correlation,0.278083,0.118533,0.066711,0.181179,0.27706,0.367681,0.498578,-0.060607,-1.033078


## Wassertein Distances

### A: Keyword and 1 lag

In [55]:
# Stored diagrams
diagd1 = pd.DataFrame(Adiag_dim_1L2,columns=['Obs','b','d'])
diagd0 = pd.DataFrame(Adiag_dim_0L2,columns=['Obs','b','d'])

In [56]:
# Collecting
diagd150 = diagd1[(diagd1['Obs'] == 52)]
diagd150 = diagd150[['b','d']]
diagd151 = diagd1[(diagd1['Obs'] == 53)]
diagd151 = diagd151[['b','d']]

In [57]:
# Creating distances
diagd150a = diagd150.to_numpy()
diagd151a = diagd151.to_numpy()

dist = gd.hera.wasserstein_distance(diagd150a,diagd151a, order=1., internal_p=2.)

In [58]:
# Blank arrays
wass0 = []
wass1 = []

In [59]:
# Shape parameters
r,c = embed2.shape

In [60]:
# Collection of Wasserstein distances
r3 = r-1

for i in range(ww,r3):
 a005 = i+1
 diagd150 = diagd1[(diagd1['Obs'] == i)]
 diagd150 = diagd150[['b','d']]
 diagd151 = diagd1[(diagd1['Obs'] == a005)]
 diagd151 = diagd151[['b','d']]
 diagd150a = diagd150.to_numpy()
 diagd151a = diagd151.to_numpy()
 dist = gudhi.hera.wasserstein_distance(diagd150a,diagd151a, order=1., internal_p=2.)
 wass1.append([a005,dist])
 diagd150 = diagd0[(diagd0['Obs'] == i)]
 diagd150 = diagd150[['b','d']]
 diagd151 = diagd0[(diagd0['Obs'] == a005)]
 diagd151 = diagd151[['b','d']]
 diagd150a = diagd150.to_numpy()
 diagd151a = diagd151.to_numpy()
 dist = gudhi.hera.wasserstein_distance(diagd150a,diagd151a, order=1., internal_p=2.)
 wass0.append([a005,dist])


In [61]:
# Into dataframes:
wass0dfL2 = pd.DataFrame(wass0)
wass0dfL2.columns = ['Obs','wass0']

wass1dfL2 = pd.DataFrame(wass1)
wass1dfL2.columns = ['Obs','wass1']

In [62]:
# Merge:
wass0dfbL2 = wass0dfL2.merge(dffbL2,on='Obs')
wassdfbL2 = wass0dfbL2.merge(wass1dfL2,on='Obs')

In [63]:
# Make sure date is datetime
wassdfbL2["date"] = pd.to_datetime(wassdfbL2["date"])

# Set up the figure and primary axis
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot Dimension 0 (left y-axis)
color0 = "tab:blue"
ax1.set_xlabel("Date")
ax1.set_ylabel("Wasserstein Distance (Dim 0)", color=color0)
ax1.plot(wassdfbL2["date"], wassdfbL2["wass0"], color=color0, label="Dimension 0")
ax1.tick_params(axis='y', labelcolor=color0)

# Add secondary y-axis for Dimension 1
ax2 = ax1.twinx()
color1 = "tab:orange"
ax2.set_ylabel("Wasserstein Distance (Dim 1)", color=color1)
ax2.plot(wassdfbL2["date"], wassdfbL2["wass1"], color=color1, linestyle="--", label="Dimension 1")
ax2.tick_params(axis='y', labelcolor=color1)

# Title and formatting
fig.suptitle("Wasserstein Distances Over Time")
fig.tight_layout()
plt.xticks(rotation=45)

# Add legends
lines_1, labels_1 = ax1.get_legend_handles_labels()
lines_2, labels_2 = ax2.get_legend_handles_labels()
ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper right")

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/A_Wass.jpeg", format="jpeg", dpi=300)

plt.close()

## B: Keyword and four lags

In [64]:
# Stored diagrams
diagd1 = pd.DataFrame(Bdiag_dim_1L2,columns=['Obs','b','d'])
diagd0 = pd.DataFrame(Bdiag_dim_0L2,columns=['Obs','b','d'])

In [65]:
# Collecting
diagd150 = diagd1[(diagd1['Obs'] == 52)]
diagd150 = diagd150[['b','d']]
diagd151 = diagd1[(diagd1['Obs'] == 53)]
diagd151 = diagd151[['b','d']]

In [66]:
# Creating distances
diagd150a = diagd150.to_numpy()
diagd151a = diagd151.to_numpy()

dist = gd.hera.wasserstein_distance(diagd150a,diagd151a, order=1., internal_p=2.)

In [67]:
# Blank arrays
wass0 = []
wass1 = []

In [68]:
# Shape
r,c = embed5.shape

In [69]:
# Collection of Wasserstein distances
r3 = r-1

for i in range(ww,r3):
 a005 = i+1
 diagd150 = diagd1[(diagd1['Obs'] == i)]
 diagd150 = diagd150[['b','d']]
 diagd151 = diagd1[(diagd1['Obs'] == a005)]
 diagd151 = diagd151[['b','d']]
 diagd150a = diagd150.to_numpy()
 diagd151a = diagd151.to_numpy()
 dist = gudhi.hera.wasserstein_distance(diagd150a,diagd151a, order=1., internal_p=2.)
 wass1.append([a005,dist])
 diagd150 = diagd0[(diagd0['Obs'] == i)]
 diagd150 = diagd150[['b','d']]
 diagd151 = diagd0[(diagd0['Obs'] == a005)]
 diagd151 = diagd151[['b','d']]
 diagd150a = diagd150.to_numpy()
 diagd151a = diagd151.to_numpy()
 dist = gudhi.hera.wasserstein_distance(diagd150a,diagd151a, order=1., internal_p=2.)
 wass0.append([a005,dist])


In [70]:
# Into dataframes
wass0dfL2 = pd.DataFrame(wass0)
wass0dfL2.columns = ['Obs','wass0']

wass1dfL2 = pd.DataFrame(wass1)
wass1dfL2.columns = ['Obs','wass1']

In [71]:
# Merge
wass0dfbL2 = wass0dfL2.merge(dffbL2,on='Obs')
wassdfbL2 = wass0dfbL2.merge(wass1dfL2,on='Obs')

In [72]:
# Make sure date is datetime
wassdfbL2["date"] = pd.to_datetime(wassdfbL2["date"])

# Set up the figure and primary axis
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot Dimension 0 (left y-axis)
color0 = "tab:blue"
ax1.set_xlabel("Date")
ax1.set_ylabel("Wasserstein Distance (Dim 0)", color=color0)
ax1.plot(wassdfbL2["date"], wassdfbL2["wass0"], color=color0, label="Dimension 0")
ax1.tick_params(axis='y', labelcolor=color0)

# Add secondary y-axis for Dimension 1
ax2 = ax1.twinx()
color1 = "tab:orange"
ax2.set_ylabel("Wasserstein Distance (Dim 1)", color=color1)
ax2.plot(wassdfbL2["date"], wassdfbL2["wass1"], color=color1, linestyle="--", label="Dimension 1")
ax2.tick_params(axis='y', labelcolor=color1)

# Title and formatting
fig.suptitle("Wasserstein Distances Over Time")
fig.tight_layout()
plt.xticks(rotation=45)

# Add legends
lines_1, labels_1 = ax1.get_legend_handles_labels()
lines_2, labels_2 = ax2.get_legend_handles_labels()
ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper right")

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/B_Wass.jpeg", format="jpeg", dpi=300)

plt.close()

## C: Keyword and 1 related term

In [73]:
# Stored diagrams
diagd1 = pd.DataFrame(Cdiag_dim_1W2,columns=['Obs','b','d'])
diagd0 = pd.DataFrame(Cdiag_dim_0W2,columns=['Obs','b','d'])

In [74]:
# Collecting
diagd150 = diagd1[(diagd1['Obs'] == 52)]
diagd150 = diagd150[['b','d']]
diagd151 = diagd1[(diagd1['Obs'] == 53)]
diagd151 = diagd151[['b','d']]

In [75]:
# Creating distances
diagd150a = diagd150.to_numpy()
diagd151a = diagd151.to_numpy()

dist = gd.hera.wasserstein_distance(diagd150a,diagd151a, order=1., internal_p=2.)

dist

0.0

In [76]:
# Blank arrays
wass0 = []
wass1 = []

In [77]:
# Shape
r,c = newdf2W2.shape

# Wasserstein distances
r3 = r-1

for i in range(ww,r3):
 a005 = i+1
 diagd150 = diagd1[(diagd1['Obs'] == i)]
 diagd150 = diagd150[['b','d']]
 diagd151 = diagd1[(diagd1['Obs'] == a005)]
 diagd151 = diagd151[['b','d']]
 diagd150a = diagd150.to_numpy()
 diagd151a = diagd151.to_numpy()
 dist = gudhi.hera.wasserstein_distance(diagd150a,diagd151a, order=1., internal_p=2.)
 wass1.append([a005,dist])
 diagd150 = diagd0[(diagd0['Obs'] == i)]
 diagd150 = diagd150[['b','d']]
 diagd151 = diagd0[(diagd0['Obs'] == a005)]
 diagd151 = diagd151[['b','d']]
 diagd150a = diagd150.to_numpy()
 diagd151a = diagd151.to_numpy()
 dist = gudhi.hera.wasserstein_distance(diagd150a,diagd151a, order=1., internal_p=2.)
 wass0.append([a005,dist])


In [78]:
# Into dataframes
wass0dfW2 = pd.DataFrame(wass0)
wass0dfW2.columns = ['Obs','wass0']

wass1dfW2 = pd.DataFrame(wass1)
wass1dfW2.columns = ['Obs','wass1']

In [79]:
wass0dfaW2 = wass0dfW2.iloc[1:,]
wass1dfaW2 = wass1dfW2.iloc[1:,]

In [80]:
# Merge
wass0dfbW2 = wass0dfaW2.merge(newdf2bW2,on='Obs')
wassdfbW2 = wass0dfbW2.merge(wass1dfaW2,on='Obs')

In [81]:
# Make sure date is datetime
wassdfbW2["date"] = pd.to_datetime(wassdfbW2["date"])

# Set up the figure and primary axis
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot Dimension 0 (left y-axis)
color0 = "tab:blue"
ax1.set_xlabel("Date")
ax1.set_ylabel("Wasserstein Distance (Dim 0)", color=color0)
ax1.plot(wassdfbW2["date"], wassdfbW2["wass0"], color=color0, label="Dimension 0")
ax1.tick_params(axis='y', labelcolor=color0)

# Add secondary y-axis for Dimension 1
ax2 = ax1.twinx()
color1 = "tab:orange"
ax2.set_ylabel("Wasserstein Distance (Dim 1)", color=color1)
ax2.plot(wassdfbW2["date"], wassdfbW2["wass1"], color=color1, linestyle="--", label="Dimension 1")
ax2.tick_params(axis='y', labelcolor=color1)

# Title and formatting
fig.suptitle("Wasserstein Distances Over Time")
fig.tight_layout()
plt.xticks(rotation=45)

# Add legends
lines_1, labels_1 = ax1.get_legend_handles_labels()
lines_2, labels_2 = ax2.get_legend_handles_labels()
ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper right")

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/C_wass.jpeg", format="jpeg", dpi=300)

plt.close()

## D: Full word cloud

In [82]:
# Stored diagrams
diagd1 = pd.DataFrame(Ddiag_dim_1W2,columns=['Obs','b','d'])
diagd0 = pd.DataFrame(Ddiag_dim_0W2,columns=['Obs','b','d'])

In [83]:
# Collecting diagrams
diagd150 = diagd1[(diagd1['Obs'] == 52)]
diagd150 = diagd150[['b','d']]
diagd151 = diagd1[(diagd1['Obs'] == 53)]
diagd151 = diagd151[['b','d']]

In [84]:
# Creating distances
diagd150a = diagd150.to_numpy()
diagd151a = diagd151.to_numpy()

dist = gd.hera.wasserstein_distance(diagd150a,diagd151a, order=1., internal_p=2.)

dist

0.0

In [85]:
# Blank arrays
wass0 = []
wass1 = []

In [86]:
# Shape
r,c = newdf2W2.shape

r3 = r-1

# Wasserstein distances
for i in range(ww,r3):
 a005 = i+1
 diagd150 = diagd1[(diagd1['Obs'] == i)]
 diagd150 = diagd150[['b','d']]
 diagd151 = diagd1[(diagd1['Obs'] == a005)]
 diagd151 = diagd151[['b','d']]
 diagd150a = diagd150.to_numpy()
 diagd151a = diagd151.to_numpy()
 dist = gudhi.hera.wasserstein_distance(diagd150a,diagd151a, order=1., internal_p=2.)
 wass1.append([a005,dist])
 diagd150 = diagd0[(diagd0['Obs'] == i)]
 diagd150 = diagd150[['b','d']]
 diagd151 = diagd0[(diagd0['Obs'] == a005)]
 diagd151 = diagd151[['b','d']]
 diagd150a = diagd150.to_numpy()
 diagd151a = diagd151.to_numpy()
 dist = gudhi.hera.wasserstein_distance(diagd150a,diagd151a, order=1., internal_p=2.)
 wass0.append([a005,dist])


In [87]:
# Into dataframes
wass0dfW2 = pd.DataFrame(wass0)
wass0dfW2.columns = ['Obs','wass0']

wass1dfW2 = pd.DataFrame(wass1)
wass1dfW2.columns = ['Obs','wass1']

In [88]:
wass0dfaW2 = wass0dfW2.iloc[1:,]
wass1dfaW2 = wass1dfW2.iloc[1:,]

In [89]:
# Merge
wass0dfbW2 = wass0dfaW2.merge(newdf2bW2,on='Obs')
wassdfbW2 = wass0dfbW2.merge(wass1dfaW2,on='Obs')

In [90]:
# Make sure date is datetime
wassdfbW2["date"] = pd.to_datetime(wassdfbW2["date"])

# Set up the figure and primary axis
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot Dimension 0 (left y-axis)
color0 = "tab:blue"
ax1.set_xlabel("Date")
ax1.set_ylabel("Wasserstein Distance (Dim 0)", color=color0)
ax1.plot(wassdfbW2["date"], wassdfbW2["wass0"], color=color0, label="Dimension 0")
ax1.tick_params(axis='y', labelcolor=color0)

# Add secondary y-axis for Dimension 1
ax2 = ax1.twinx()
color1 = "tab:orange"
ax2.set_ylabel("Wasserstein Distance (Dim 1)", color=color1)
ax2.plot(wassdfbW2["date"], wassdfbW2["wass1"], color=color1, linestyle="--", label="Dimension 1")
ax2.tick_params(axis='y', labelcolor=color1)

# Title and formatting
fig.suptitle("Wasserstein Distances Over Time")
fig.tight_layout()
plt.xticks(rotation=45)

# Add legends
lines_1, labels_1 = ax1.get_legend_handles_labels()
lines_2, labels_2 = ax2.get_legend_handles_labels()
ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper right")

plt.savefig("/Users/alexander/Documents/GitHub/TDA-and-GoogleTrends/figs/D_wass.jpeg", format="jpeg", dpi=300)

plt.close()