In [None]:
import numpy as np
import pandas as pd

In [None]:
data_0 = pd.read_csv('../Data/imputed_data_new_3.csv')

In [None]:
data_0.isna().sum().sum()

In [None]:
# Parse `datadate` as datetimes, order the rows chronologically and
# renumber the index from 0.
data_0 = (
    data_0
    .assign(datadate=pd.to_datetime(data_0['datadate']))
    .sort_values(by='datadate')
    .reset_index(drop=True)
)

In [None]:
# Encode every distinct `pastrating` value as its position in sorted order.
ratings = sorted(data_0["pastrating"].unique())
ratings_to_idx = dict(zip(ratings, range(len(ratings))))
data_0["pastrating"] = data_0["pastrating"].apply(lambda rating: ratings_to_idx[rating])

In [None]:
data_0.head()

In [None]:
# Tickers that have no row with a year in 2010-2017.
in_2010_2017 = data_0["year"].isin(range(2010, 2018))
synth_tics = set(data_0["tic"].unique()) - set(data_0.loc[in_2010_2017, "tic"].unique())
len(synth_tics)

In [None]:
# Split at the 2017/2018 boundary, then show the post-2017 rows of the
# tickers collected in `synth_tics`.
data_before_2018 = data_0.loc[data_0['year'] < 2018]
data_after_2017 = data_0.loc[data_0['year'] >= 2018]
data_after_2017.loc[
    data_after_2017['tic'].isin(synth_tics),
    ['tic', 'year', 'quarter', 'pastrating'],
]

In [None]:
# Duplicates?

data_0.duplicated().sum()

In [None]:
# Get binary columns

binary_cols = data_0.columns[data_0.nunique() == 2]
binary_cols

In [None]:
# Get object columns

object_cols = data_0.select_dtypes(include='object').columns
object_cols

In [None]:
# Print all the dtypes

data_0.dtypes

In [None]:
# Cast every int64 column to float64; other columns keep their dtype.
int_cols = data_0.select_dtypes(include='int64').columns
data_0 = data_0.astype({col: 'float64' for col in int_cols})

In [None]:
data_0.dtypes

In [None]:
# Check for extreme values

data_0.describe()

In [None]:
# Standardize all columns except binary columns, object columns, datadate, year, quarter, pastrating

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Columns that bypass scaling. dict.fromkeys de-duplicates while preserving
# order, so a column that is both two-valued and object-typed is neither
# dropped twice (KeyError) nor concatenated twice. When the lists do not
# overlap the resulting column order is exactly: binary, object, then the
# four bookkeeping columns.
passthrough_cols = list(dict.fromkeys(
    [*binary_cols, *object_cols, 'datadate', 'year', 'quarter', 'pastrating']
))

features_to_scale = data_0.drop(columns=passthrough_cols)

# NOTE(review): the scaler is fitted on every row of data_0, i.e. on all
# years at once -- confirm this is intended if a time-based split follows.
# The source index is carried over so the concat below aligns row-for-row
# even when data_0 does not have a default 0..n-1 index.
scaled_features = pd.DataFrame(
    scaler.fit_transform(features_to_scale),
    columns=features_to_scale.columns,
    index=features_to_scale.index,
)

# Untouched columns first, standardized features after them.
data_1 = pd.concat([data_0[passthrough_cols], scaled_features], axis=1)

In [None]:
data_1.head()

In [None]:
# Check for duplicates

data_1.duplicated().sum()

In [None]:
# Remove duplicates

data_1 = data_1.drop_duplicates()

In [None]:
# save as csv

# data_1.to_csv("../Data/standardized_data_new.csv", index=False)

In [None]:
# Only the `date`, `TICKER` and `PRC` columns are read from the stock file;
# `date` is parsed into datetimes while loading.
dtypes = {"date": "str", "TICKER": "str", "PRC": "float"}
cols = [*dtypes]

stock_data_original = pd.read_csv(
    "../Data/stock_data.csv",
    usecols=cols,
    dtype=dtypes,
    parse_dates=["date"],
    engine="c",
)

In [None]:
stock_data_original.head()

In [None]:
stock_data_original["TICKER"].value_counts()

In [None]:
len(set(data_1["tic"].unique()) - set(stock_data_original["TICKER"].unique()))

In [None]:
data_1["tic"].value_counts()

In [None]:
stock_data_1 = stock_data_original.copy()

In [None]:
stock_data_1 = stock_data_1.dropna()

In [None]:
# Remove tics that are not in data_1

stock_data_1 = stock_data_1[stock_data_1["TICKER"].isin(data_1["tic"].unique())]

In [None]:
# to datetime

stock_data_1["date"] = pd.to_datetime(stock_data_1["date"])

In [None]:
stock_data_1 = stock_data_1.rename(columns={"TICKER": "tic", "date": "dater"})

In [None]:
# Create year and quarter columns

stock_data_1["year"] = stock_data_1["dater"].dt.year
stock_data_1["quarter"] = stock_data_1["dater"].dt.quarter

In [None]:
# Sort by date

stock_data_1 = stock_data_1.sort_values(by="dater").reset_index(drop=True)

In [None]:
stock_data_1.head()

In [None]:
# Check that all four quarters are present for each year and each tic

# Report every (year, tic) combination in data_1 whose set of distinct
# quarters does not have exactly four members.
for year in range(2010, 2021):
    rows_for_year = data_1[data_1['year'] == year]
    for tic in data_1["tic"].unique():
        quarters = rows_for_year[rows_for_year['tic'] == tic]['quarter'].unique()
        if len(quarters) == 4:
            continue
        print(f"Year: {year}, Tic: {tic}, quarters: {quarters}")

In [None]:
# Check that all four quarters are present for each year and each tic

# Report every (year, tic) combination in stock_data_1 whose set of distinct
# quarters does not have exactly four members.
for year in range(2010, 2021):
    rows_for_year = stock_data_1[stock_data_1['year'] == year]
    for tic in stock_data_1["tic"].unique():
        quarters = rows_for_year[rows_for_year['tic'] == tic]['quarter'].unique()
        if len(quarters) == 4:
            continue
        print(f"Year: {year}, Tic: {tic}, quarters: {quarters}")

In [None]:
stock_data = stock_data_1.copy()

In [None]:
import pandas as pd
import numpy as np

# Work on a copy of stock_data_1 (defined in an earlier cell) with `dater`
# guaranteed to be datetime.
stock_data = stock_data_1.copy()
stock_data['dater'] = pd.to_datetime(stock_data['dater'])

# Order by date then ticker, and label each row "<year>Q<quarter>" so rows
# can be grouped per ticker and quarter. 'PRC' is the column the rolling
# standard deviation is computed on.
stock_data = stock_data.sort_values(['dater', 'tic']).assign(
    year_quarter=lambda frame: frame['year'].astype(str) + 'Q' + frame['quarter'].astype(str)
)

# Filled by the loop further down in this cell.
sps_dict = {}

# Rolling standard deviation of 'PRC' over a 30-row window on a daily grid
def calculate_rolling_std(df):
    """Return the 30-row rolling standard deviation of ``PRC`` on a daily grid.

    Rows sharing a ``dater`` are reduced to the first one, the frame is
    re-indexed to one row per calendar day between its first and last date,
    and ``PRC`` gaps created by that re-indexing are forward-filled.

    The result is a Series named ``PRC_std_30`` with one value per calendar
    day. ``min_periods=1`` lets the window start immediately, so the first
    value is NaN (a single observation has no sample standard deviation).
    The input frame is not modified.
    """
    daily = (
        df.drop_duplicates(subset="dater", keep="first")
        .set_index('dater')
        .asfreq('D')
        .reset_index()
    )
    daily['PRC'] = daily['PRC'].ffill()
    daily['PRC_std_30'] = daily['PRC'].rolling(window=30, min_periods=1).std()
    return daily['PRC_std_30']

# Apply the function to each (tic, year_quarter) group and store the result in sps_dict
for (tic, year_quarter), group in stock_data.groupby(['tic', 'year_quarter']):
    sps_dict.setdefault(year_quarter, {})[tic] = calculate_rolling_std(group).to_numpy()

# Now, sps_dict has the structure: { year_quarter: { tic: array_of_rolling_30_day_std, ... }, ... }
# (the outer key is the quarter label, the inner key is the ticker; each array has one
# entry per calendar day between that ticker's first and last row in the quarter)

# Example on how to access the data:
# print(sps_dict['2021Q1']['AAPL'])


In [None]:
# import pandas as pd
# import numpy as np
# 
# stock_data = stock_data_1.copy()
# 
# # Ensure 'dater' is a datetime column
# stock_data['dater'] = pd.to_datetime(stock_data['dater'])
# 
# # Sort the DataFrame by date and ticker
# stock_data.sort_values(['dater', 'tic'], inplace=True)
# 
# # Assuming 'PRC' is the column for which you want to calculate the standard deviation
# # Create a 'year_quarter' column to help in grouping
# stock_data['year_quarter'] = stock_data['year'].astype(str) + 'Q' + stock_data['quarter'].astype(str)
# 
# # Initialize a dictionary to store the results
# sps_dict = {}
# 
# # Function to calculate rolling standard deviation and keep only the last 30 values
# def calculate_rolling_std(df):
#     # drop duplicates
#     df = df.drop_duplicates(subset="dater", keep="first")
#     # sort by dater
#     df = df.sort_values(by="dater")
#     df = df.reset_index(drop=True)
#     df = df.set_index('dater').asfreq("D", method="ffill").reset_index()
#     df['PRC_std_30'] = df['PRC'].rolling(window=30, min_periods=1).std()
#     return df.iloc[-30:]['PRC_std_30']  # Assuming we want to keep the last 30 days of std dev
# 
# # Apply the function to each group and store the result in sps_dict
# for (tic, year_quarter), group in stock_data.groupby(['tic', 'year_quarter']):
#     #sps_dict[year_quarter][tic] = calculate_rolling_std(group).to_numpy()
#     sps_dict.setdefault(year_quarter, {})[tic] = calculate_rolling_std(group).to_numpy()
# 
# # Now, sps_dict has the structure: { (tic, year_quarter): Series_of_30_day_std_dev, ... }
# 
# # Example on how to access the data:
# # print(sps_dict[('AAPL', '2021Q1')])


In [None]:
# for key in sps_dict.keys():
#     for tic in sps_dict[key].keys():
#         if np.isnan(sps_dict[key][tic]).any():
#             print(f"Key: {key}, Tic: {tic}")
#             print(sps_dict[key][tic])

In [None]:
# Back-fill NaNs in place: every NaN takes the value of the element after it.
# The NaN positions are visited from the end of the array towards the start,
# so a run of consecutive NaNs is filled from the first valid value that
# follows the run (a front-to-back pass would copy a neighbour that is still
# NaN). A NaN in the last position has no successor and is left as it is.
for key in sps_dict.keys():
    for tic in sps_dict[key].keys():
        array = sps_dict[key][tic]
        nan_indices = np.where(np.isnan(array))[0]  # positions of NaN values
        for idx in nan_indices[::-1]:
            if idx < len(array) - 1:  # there is a next value to copy from
                array[idx] = array[idx + 1]

In [None]:
for key in sps_dict.keys():
    for tic in sps_dict[key].keys():
        if np.isnan(sps_dict[key][tic]).any():
            print(f"Key: {key}, Tic: {tic}")
            print(sps_dict[key][tic])

In [None]:
# Any NaN still left in an array becomes 0 (np.nan_to_num returns a new array,
# which replaces the stored one).
for per_tic in sps_dict.values():
    for tic, values in per_tic.items():
        per_tic[tic] = np.nan_to_num(values)

In [None]:
# # Check if all tics are present in sps_dict
# 
# for key in sps_dict.keys():
#     for tic in data_1["tic"].unique():
#         if tic not in sps_dict[key]:
#             print(f"Key: {key}, Tic: {tic}")

In [None]:
# Every ticker of data_1 gets an entry in every quarter; tickers without one
# receive an array of 30 zeros.
for per_tic in sps_dict.values():
    for tic in data_1["tic"].unique():
        per_tic.setdefault(tic, np.zeros(30))

In [None]:
for key in sps_dict.keys():
    for tic in data_1["tic"].unique():
        if tic not in sps_dict[key]:
            print(f"Key: {key}, Tic: {tic}")

In [None]:
sps_dict_copy = sps_dict.copy()

In [None]:
# Tickers of data_1 that have no row with a year in 2010-2017.
in_2010_2017 = data_1["year"].isin(range(2010, 2018))
synth_tics = set(data_1["tic"].unique()) - set(data_1.loc[in_2010_2017, "tic"].unique())
len(synth_tics)

In [None]:
import dcor
import itertools

# Dictionary to store dcorr values for each pair of companies on each date.
# Layout: {date: {"TIC1 TIC2": dcorr, "TIC2 TIC1": dcorr, ...}}
dense_dependence_matrix = {}
n = len(sps_dict_copy)

tics = data_1["tic"].unique()

# The pair list is the same for every date, so it is built once here instead
# of being regenerated on each iteration of the date loop.
ticker_pairs = list(itertools.combinations(tics, 2))

for i, (date, tickers) in enumerate(sps_dict_copy.items()):
    print(f"Date: {date}, {i}/{n}")

    year = int(date[:4])  # keys are "<year>Q<quarter>" strings

    # Scores are collected locally and attached only when non-empty, so a date
    # for which nothing was stored stays absent from the result.
    date_scores = {}

    for tic1, tic2 in ticker_pairs:

        # Pairs involving a ticker from synth_tics are fixed to 0 before 2018
        # without looking at the price series.
        if (tic1 in synth_tics or tic2 in synth_tics) and year < 2018:
            date_scores[f"{tic1} {tic2}"] = 0.0
            date_scores[f"{tic2} {tic1}"] = 0.0
            continue

        # Rolling-std arrays of the two tickers for this date
        X = tickers[tic1]
        Y = tickers[tic2]

        # Left-pad the shorter array with its own first element so both have
        # the same length
        len_diff = len(Y) - len(X)

        if len_diff > 0:
            X = np.pad(X, (len_diff, 0), mode='constant', constant_values=X[0])
        elif len_diff < 0:
            Y = np.pad(Y, (-len_diff, 0), mode='constant', constant_values=Y[0])

        # dcor is given column vectors
        X = X.reshape(-1, 1)
        Y = Y.reshape(-1, 1)

        dcorr = dcor.distance_correlation(X, Y)

        # A NaN result is reported and not stored
        if np.isnan(dcorr):
            print(f"Distance covariance between {tic1} and {tic2} on {date} is nan")
        else:
            date_scores[f"{tic1} {tic2}"] = dcorr
            date_scores[f"{tic2} {tic1}"] = dcorr

    if date_scores:
        dense_dependence_matrix[date] = date_scores

In [None]:
len(stock_data["tic"].unique()), len(data_1["tic"].unique())

In [None]:
# Split data_1 at the 2017/2018 boundary, then show the post-2017 rows of the
# tickers collected in `synth_tics`.
data_before_2018 = data_1.loc[data_1['year'] < 2018]
data_after_2017 = data_1.loc[data_1['year'] >= 2018]
data_after_2017.loc[
    data_after_2017['tic'].isin(synth_tics),
    ['tic', 'year', 'quarter', 'pastrating'],
]

In [None]:
import numpy as np

# Assuming dense_dependence_matrix is already populated

# Build one dense (num_companies x num_companies) array per date from the
# "TIC1 TIC2" -> dcorr entries of dense_dependence_matrix, together with the
# ticker -> row/column index mapping used to place the values.
dependency_matrices = {}
tics = stock_data["tic"].unique()
tic_to_index = {tic: i for i, tic in enumerate(tics)}
num_companies = len(tics)
n = len(dense_dependence_matrix)
print(tic_to_index)
# NOTE(review): the pair keys were generated from data_1["tic"], while this
# index is built from stock_data["tic"]; a ticker present only in data_1
# raises KeyError in the lookup below -- confirm both ticker sets match.
for i, date in enumerate(dense_dependence_matrix):
    print(f"Date: {date}, {i}/{n}")

    # Start from all-NaN; cells whose pair was never stored stay NaN
    dependence_array = np.full((num_companies, num_companies), np.nan)

    # The loop variable is deliberately not called `tics`: reusing that name
    # would overwrite the ticker array defined above with a "TIC1 TIC2" string.
    for pair_key, dcorr in dense_dependence_matrix[date].items():
        tic1, tic2 = pair_key.split()
        index1, index2 = tic_to_index[tic1], tic_to_index[tic2]
        dependence_array[index1, index2] = dcorr
        dependence_array[index2, index1] = dcorr  # Since it's a symmetric matrix

    # The diagonal is set to 1
    np.fill_diagonal(dependence_array, 1)

    # Every date shares the same tic_to_index mapping object
    dependency_matrices[date] = {'matrix': dependence_array, 'tic_indices': tic_to_index}


In [None]:
import pickle

def save_obj_pickle(obj, name):
    """Pickle ``obj`` to ``../Data/<name>.pkl`` with the highest pickle protocol.

    ``name`` is the file stem only; the ``../Data/`` prefix and the ``.pkl``
    suffix are added here. Later cells of this notebook redefine
    ``save_obj_pickle`` without the ``../Data/`` prefix.
    """
    target_path = "../Data/" + name + '.pkl'
    with open(target_path, 'wb+') as out_file:
        pickle.dump(obj, out_file, protocol=pickle.HIGHEST_PROTOCOL)
        
# save_obj_pickle(dependency_matrices, "dependency_matrices_new_final")

In [None]:
dependency_matrices["2022Q4"]

In [None]:
# Check what years and quarters are in data_1

data_1["year"].unique(), data_1["quarter"].unique()

In [None]:
# Remove everything after 2020Q4
# The keys are "<year>Q<quarter>" strings, so this is a plain string comparison
# (it orders four-digit years and single-digit quarters chronologically).

dependency_matrices = {k: v for k, v in dependency_matrices.items() if k <= "2020Q4"}

In [None]:
dependency_matrices.keys()

In [None]:
for date, matrix_info in dependency_matrices.items():
    for tic, idx in matrix_info['tic_indices'].items():
        print(f"Date: {date}, Tic: {tic}, Index: {idx}")

In [None]:
len(data_1["tic"].unique())

In [None]:
data_1[(data_1['year'] == 2018) & (data_1['quarter'] == 1) & (data_1['tic'] == "ET")]

In [None]:
import networkx as nx
from itertools import combinations

# Assuming dependency_matrices is already populated

# One networkx graph per date. Nodes are matrix indices carrying that ticker's
# data_1 row (minus `datadate`) as attributes; edges carry the matrix value as
# `weight`.
dependency_graphs = {}
n = len(dependency_matrices)
tics = data_1["tic"].unique()


def _placeholder_node_row(tic, year, quarter):
    """Return a one-row frame with data_1's columns: zeros everywhere except
    `tic`, `year`, `quarter` (as given) and `pastrating` (-1.0)."""
    row = pd.DataFrame(np.zeros((1, len(data_1.columns)), dtype=float), columns=data_1.columns)
    row['tic'] = tic
    row['pastrating'] = -1.0
    row['year'] = year
    row['quarter'] = quarter
    return row


for i, (date, matrix_info) in enumerate(dependency_matrices.items(), start=1):
    print(f"Date: {date}, {i}/{n}")

    G = nx.Graph()

    # Keys are in YYYYQN format
    year = int(date[:4])
    quarter = int(date[-1])

    # All data_1 rows of this year and quarter
    node_data = data_1[(data_1['year'] == year) & (data_1['quarter'] == quarter) & data_1['tic'].isin(tics)]

    tic_to_index = matrix_info['tic_indices']
    dependence = matrix_info['matrix']

    # Nodes: one per ticker that has a (real or placeholder) row
    for tic, idx in tic_to_index.items():
        tic_rows = node_data[node_data['tic'] == tic]
        # Tickers in synth_tics get a placeholder row before 2018, and also
        # afterwards when they have no row for this quarter.
        if tic in synth_tics and (year < 2018 or tic_rows.empty):
            tic_node_data = _placeholder_node_row(tic, year, quarter)
        else:
            tic_node_data = tic_rows
        if tic_node_data.empty:
            continue
        # The last matching row supplies the node attributes
        node_attribute_value = tic_node_data.iloc[-1].copy().drop(['datadate'])
        G.add_node(idx)
        nx.set_node_attributes(G, {idx: node_attribute_value})

    # Edges: every ticker pair whose matrix entry is not 0 (0 = no connection).
    # NOTE(review): `NaN != 0` is True, so NaN entries also become edges with a
    # NaN weight, and add_edge creates attribute-less nodes for tickers that
    # were skipped above -- confirm both are intended.
    for tic1, tic2 in combinations(tics, 2):
        index1, index2 = tic_to_index[tic1], tic_to_index[tic2]
        weight = dependence[index1, index2]
        if weight != 0:
            G.add_edge(index1, index2, weight=weight)

    dependency_graphs[date] = G

In [None]:
for date, graph in dependency_graphs.items():
    print(f"Date: {date}")
    year = graph.nodes[0]["year"]
    if year >= 2018:
        for node in graph.nodes:
            try:
                if graph.nodes[node]["tic"] in synth_tics:
                    if graph.nodes[node]['pastrating'] != -1.0:
                        print(f"Node: {node}, TIC: {graph.nodes[node]['tic']}, Rating: {graph.nodes[node]['pastrating']}")
                        print(f"Tic to index: {tic_to_index[graph.nodes[node]['tic']]}")
                        print()
            except KeyError:
                print("KEY ERROR")
                print(f"Node: {node}, Features: {graph.nodes[node]}")
                print()

In [None]:
sample_date = list(dependency_graphs.keys())[0]
sample_graph = dependency_graphs[sample_date]
pos = nx.spring_layout(sample_graph)
nx.draw_networkx(sample_graph, with_labels=True, pos=pos)

In [None]:
# import matplotlib.pyplot as plt
# 
# sample_graph_copy = sample_graph.copy()
# # remove nodes with no edges
# sample_graph_copy.remove_nodes_from(list(nx.isolates(sample_graph_copy)))
# fig, ax = plt.subplots(figsize=(80, 80))
# # pos = nx.spring_layout(sample_graph_copy)
# pos = nx.circular_layout(sample_graph_copy, scale=1)
# nx.draw_networkx(sample_graph_copy, with_labels=False, pos=pos)

In [None]:
from tmfg_corr import tmfg
import matplotlib.pyplot as plt

# Create a TMFG graph object
tmfg_graph = tmfg(dependency_matrices[sample_date]["matrix"])

# Visualize the maximum spanning tree
# pos = nx.spring_layout(tmfg_graph)
# nx.draw_networkx(tmfg_graph, with_labels=True, pos=pos)

fig, ax = plt.subplots(figsize=(20, 20))
pos = nx.spring_layout(tmfg_graph)
nx.draw_networkx(tmfg_graph, with_labels=True, pos=pos, ax=ax)
plt.show()

In [None]:
import pickle

def save_obj_pickle(obj, name):
    """Pickle ``obj`` to ``<name>.pkl`` with the highest pickle protocol.

    ``name`` is the path without extension; it is used as given (relative
    names resolve against the current working directory).
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb+') as out_file:
        pickle.dump(obj, out_file, protocol=pickle.HIGHEST_PROTOCOL)

save_obj_pickle(tmfg_graph, "tmfg_graph_new_3")
# save_obj_pickle(tic_to_index, "tic_to_index")
# save_obj_pickle(sample_graph_copy, "sample_graph_copy")

In [None]:
# Get node attributes and edge attributes for the sample graph

node_attributes = nx.get_node_attributes(sample_graph, 'tic')

node_attributes

In [None]:
dependency_graphs["2010Q1"].nodes[0]

In [None]:
edge_attributes = nx.get_edge_attributes(sample_graph, 'weight')
edge_attributes

In [None]:
# Apply the TMFG algorithm to the dependency matrix of each date

# Create a dictionary to store the TMFG graphs for each date
tmfg_graphs = {}
i = 1
n = len(dependency_graphs)
for date, matrix_info in dependency_matrices.items():
    print(f"Date: {date}, {i}/{n}")
    # Each value of dependency_matrices is {'matrix': ndarray, 'tic_indices': dict}.
    # The size test and the tmfg() call must use the array: len() of the
    # wrapper dict is always 2, so testing it against `> 3` skipped TMFG for
    # every date and silently stored the dense graph instead.
    dependence_array = matrix_info['matrix']
    if len(dependence_array) > 3:
        G = tmfg(dependence_array)
        # Later cells read node attributes ('tic', 'year', 'pastrating', ...)
        # from these graphs, so copy them over from the dense graph of the
        # same date.
        # NOTE(review): assumes tmfg() returns a networkx graph whose node
        # labels are the matrix row indices -- confirm against tmfg_corr.
        dense_nodes = dependency_graphs[date].nodes
        nx.set_node_attributes(
            G,
            {node: dict(dense_nodes[node]) for node in G.nodes if node in dense_nodes},
        )
    else:
        # Too few companies for TMFG: keep the dense graph as is
        G = dependency_graphs[date]

    tmfg_graphs[date] = G
    i += 1

In [None]:
len(tmfg_graphs)

In [None]:
# How many nodes are in each graph

for date, graph in tmfg_graphs.items():
    print(f"Date: {date}, Number of nodes: {len(graph.nodes)}")

In [None]:
for date, graph in tmfg_graphs.items():
    print(f"Date: {date}, Number of nodes: {len(graph.nodes)}")
    for node in graph.nodes:
        if len(graph.nodes[node]) != 204:
            print(node, len(graph.nodes[node]))
    print()

In [None]:
type(tmfg_graphs["2020Q3"].nodes[99].copy())

In [None]:
nx.set_node_attributes(tmfg_graphs["2020Q4"], {99: tmfg_graphs["2020Q3"].nodes[99].copy()})

In [None]:
tmfg_graphs["2020Q4"].nodes[99]

In [None]:
# What tic is 84?

tic_to_index = dependency_matrices["2020Q4"]["tic_indices"]
tic_to_index

In [None]:
# What nodes are missing in 2018Q1, 2018Q2, 2018Q3?

for date, graph in tmfg_graphs.items():
    if date in ["2018Q1", "2018Q2", "2018Q3"]:
        tics_in_graph = set([graph.nodes[node]['tic'] for node in graph.nodes])
        missing_tics = set(data_1[data_1['year'] == 2018]['tic'].unique()) - tics_in_graph
        if missing_tics:
            print(f"Date: {date}, Missing tics: {missing_tics}")

In [None]:
# How many edges are in each graph

for date, graph in tmfg_graphs.items():
    print(f"Date: {date}, Number of edges: {len(graph.edges)}")

In [None]:
tmfg_graphs["2010Q1"].nodes[0]

In [None]:
all_tics = set(data_1["tic"].unique())
len(all_tics)

In [None]:
tmfg_graphs["2010Q1"].nodes[0]['tic']

In [None]:
# Check if all tics are present in the TMFG graphs

for date, graph in tmfg_graphs.items():
    print(graph.nodes)
    tics_in_graph = set([graph.nodes[node]['tic'] for node in graph.nodes])
    missing_tics = all_tics - tics_in_graph
    if missing_tics:
        print(f"Date: {date}, Missing tics: {missing_tics}")

In [None]:
for date, graph in tmfg_graphs.items():
    print(f"Date: {date}")
    year = graph.nodes[0]["year"]
    if year >= 2018:
        for node in graph.nodes:
            if graph.nodes[node]["tic"] in synth_tics:
                if graph.nodes[node]['pastrating'] != -1.0:
                    print(f"Node: {node}, TIC: {graph.nodes[node]['tic']}, Rating: {graph.nodes[node]['pastrating']}")
                    print(f"Tic to index: {tic_to_index[graph.nodes[node]['tic']]}")
                    print()

In [None]:
import pickle

def save_obj_pickle(obj, name):
    """Pickle ``obj`` to ``<name>.pkl`` with the highest pickle protocol.

    ``name`` is the path without extension; it is used as given (relative
    names resolve against the current working directory).
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb+') as out_file:
        pickle.dump(obj, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj_pickle(name):
    """Load and return the object pickled in ``<name>.pkl``.

    ``name`` is the path without extension. Unpickling can execute arbitrary
    code, so only files from a trusted source should be loaded.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as in_file:
        loaded = pickle.load(in_file)
    return loaded

In [None]:
tics = stock_data["tic"].unique()
tic_to_index = {tic: i for i, tic in enumerate(tics)}

In [None]:
# Save the TMFG graphs

save_obj_pickle(tmfg_graphs, "../Data/tmfg_graphs_new_4")
# save_obj_pickle(tic_to_index, "../Data/tic_to_index")
# save_obj_pickle(synth_tics, "../Data/synth_tics")