In [149]:
import pandas as pd
import numpy as np
import openpyxl

In [150]:
df = pd.read_csv("/home/bethtian/fintech545/FinTech545_Spring2025/Projects/Project01/problem2.csv")
df_info = df.describe()
df_info

Unnamed: 0,x1,x2,x3,x4,x5
count,37.0,39.0,35.0,35.0,37.0
mean,0.399609,0.379761,0.416253,0.488401,0.265485
std,1.212635,1.118963,1.128018,1.347022,1.181603
min,-2.679666,-2.750918,-2.841926,-2.845942,-2.628937
25%,-0.36465,-0.18967,-0.230137,-0.095817,-0.368145
50%,0.700845,0.570253,0.564582,0.74101,0.475927
75%,0.931806,0.946296,1.102426,1.141382,1.037638
max,3.185588,3.415043,2.865721,3.256499,2.645745


# A. Calculate the pairwise covariance matrix of the data.

In [151]:
cov_matrix = df.cov()
cov_matrix.to_excel("/home/bethtian/fintech545/beth-fintech545/Project01/Problem_2_a.xlsx")
cov_matrix

Unnamed: 0,x1,x2,x3,x4,x5
x1,1.470484,1.454214,0.877269,1.903226,1.444361
x2,1.454214,1.252078,0.539548,1.621918,1.237877
x3,0.877269,0.539548,1.272425,1.171959,1.091912
x4,1.903226,1.621918,1.171959,1.814469,1.589729
x5,1.444361,1.237877,1.091912,1.589729,1.396186


# B. Is the Matrix at least positive semi-definite? Why?

In [152]:
def CheckSemi(matrix):
    #check all values are great or equal to 0
    if not np.all(matrix >= 0):
        print("The matrix is not positive semi-definite, because at least one value in the matrix is less than 0.")
        return False
    # Check the symmetry
    if not np.allclose(matrix,matrix.T):
        print("The matrix is not positive semi-definite, because the matrix is not symmetric")
        return False
    #calculate the eigen values
    eigenvalues = np.linalg.eigvals(matrix)
    if not np.all(eigenvalues >= -1e-10):
        print("The matrix is not positive semi-definite, because at least one value in eigen vectors is less than 0. ")
        return eigenvalues
    return True

In [153]:
CheckSemi(cov_matrix)

The matrix is not positive semi-definite, because at least one value in eigen vectors is less than 0. 


array([ 6.78670573,  0.83443367, -0.31024286,  0.02797828, -0.13323183])

Because there are some eigen values of the covariance matrix less than 0, we cannot prove the matrix is a semi-definite matrix.

# C. If not, find the nearest positive semi-definite matrix using Higham’s method and the near-psd method of Rebenato and Jackel.

## C.1 Higham's Method

In [154]:
from numpy import linalg as LA

In [155]:
def compute_gamma(A, C):
    return np.linalg.norm(A - C, 'fro')

def project_U(A, W):
    W_sqrt = np.sqrt(W)
    W_inv_sqrt = 1 / W_sqrt
    
    B = W_sqrt[:, None] * A * W_sqrt
    
    eigenvalues, eigenvectors = np.linalg.eigh(B)

    eigenvalues = np.maximum(eigenvalues, 0)

    B = eigenvectors @ np.diag(eigenvalues) @ eigenvectors.T


    return W_inv_sqrt[:, None] * B * W_inv_sqrt



def project_S(A, original_diag):
    P = A.copy()
    np.fill_diagonal(P, original_diag)
    return P


In [156]:

def weighted_nearest_correlation_matrix(A, W=None, max_iterations=100, tol=1e-5, debug=False):
    A = np.asarray(A)
    n = A.shape[0]
    original_diag = np.diag(A).copy()
    C = A.copy()  
    
    if W is None:
        W = np.ones(n)
    else:
        W = np.asarray(W)
    
    DS = np.zeros_like(A)
    Y = A.copy()
    prev_gamma = float('inf')

    for k in range(max_iterations):
        if debug:
            print(f"\nIteration {k+1}")
        
        R = Y - DS
        X = project_S(R, original_diag)
        DS = X - R
        Y = project_U(X, W)

        current_gamma = compute_gamma(Y, C)

        gamma_diff = abs(current_gamma - prev_gamma)
        if debug:
            print(f"Current gamma: {current_gamma}")
            print(f"Gamma difference: {gamma_diff}")
        
        if gamma_diff < tol:
            break
            
        prev_gamma = current_gamma
    
    return Y

In [158]:
Y = weighted_nearest_correlation_matrix(cov_matrix)
df_Y = pd.DataFrame(Y)
df_Y.to_excel('/home/bethtian/fintech545/beth-fintech545/Project01/Problem_2_c_Higham.xlsx')
print("\nThe nearest PSD matrix is:")
print(Y)
print("\nIs it a PSD matrix?", CheckSemi(Y))


The nearest PSD matrix is:
[[1.47049433 1.33384879 0.8983436  1.62994852 1.40333022]
 [1.33384879 1.2520842  0.64015263 1.46036423 1.2239064 ]
 [0.8983436  0.64015263 1.27242641 1.07081103 1.06033915]
 [1.62994852 1.46036423 1.07081103 1.81447807 1.57655974]
 [1.40333022 1.2239064  1.06033915 1.57655974 1.39619756]]

Is it a PSD matrix? True


## C.2 Rebenato and Jackel Method

In [159]:
def near_psd(a, epsilon=0.0):
    if isinstance(a, pd.DataFrame):
        a = a.to_numpy()
    
    n = a.shape[0]
    inv_sd = None
    out = a.copy()
    
    if not np.allclose(np.diag(out), np.ones(n)):
        inv_sd = np.diag(1.0 / np.sqrt(np.diag(out)))
        out = inv_sd @ out @ inv_sd

    vals, vecs = np.linalg.eigh(out)
    vals = np.maximum(vals, epsilon)
    
    T = 1.0 / np.sqrt(np.sum(vecs * vecs * vals, axis=1))
    T = np.diag(T)
    
    L = np.diag(np.sqrt(vals))
    
    B = T @ vecs @ L
    out = B @ B.T

    if inv_sd is not None:
        inv_sd = np.diag(1.0 / np.diag(inv_sd))
        out = inv_sd @ out @ inv_sd
    
    return out

In [160]:
near_cov_matrix = near_psd(cov_matrix)
df_near_cov_matrix = pd.DataFrame(near_cov_matrix)
df_near_cov_matrix .to_excel('/home/bethtian/fintech545/beth-fintech545/Project01/Problem_2_c_R_J.xlsx')
df_near_cov_matrix 

Unnamed: 0,0,1,2,3,4
0,1.470484,1.327009,0.842583,1.624464,1.364833
1,1.327009,1.252078,0.555421,1.433109,1.165906
2,0.842583,0.555421,1.272425,1.052789,1.060424
3,1.624464,1.433109,1.052789,1.814469,1.544993
4,1.364833,1.165906,1.060424,1.544993,1.396186


In [161]:
CheckSemi(near_cov_matrix)

True

#  D. Calculate the covariance matrix using only overlapping data.

In [162]:
overlap_df = df.dropna()
overlap_df

Unnamed: 0,x1,x2,x3,x4,x5
7,0.560968,0.570253,0.622116,0.570309,0.417396
21,0.804735,0.961988,1.010609,0.879315,1.037638
31,0.778427,0.626604,0.745432,0.688,0.614246
33,0.931806,1.242272,1.099453,1.022245,1.008071
34,0.905442,0.771031,1.070813,0.809276,0.846771
35,1.888583,1.745463,1.980359,2.160339,1.922086
43,0.250574,0.189652,0.164797,0.238005,0.198233
45,-0.388712,-0.319076,-0.225369,-0.080384,-0.396024


In [163]:
overlap_cov_matrix = overlap_df.cov()
overlap_cov_matrix.to_excel('/home/bethtian/fintech545/beth-fintech545/Project01/Problem_2_d.xlsx')
overlap_cov_matrix

Unnamed: 0,x1,x2,x3,x4,x5
x1,0.418604,0.394054,0.424457,0.416382,0.434287
x2,0.394054,0.396786,0.409343,0.398401,0.422631
x3,0.424457,0.409343,0.44136,0.428441,0.448957
x4,0.416382,0.398401,0.428441,0.437274,0.440167
x5,0.434287,0.422631,0.448957,0.440167,0.466272


#  E. Compare the results of the covariance matrices in C and D. Explain the differences.

## E.1 Covariance matrices in C

In [164]:
print("The covariance matrix in C (Rebenato and Jackel) is: ")
df_near_cov_matrix


The covariance matrix in C (Rebenato and Jackel) is: 


Unnamed: 0,0,1,2,3,4
0,1.470484,1.327009,0.842583,1.624464,1.364833
1,1.327009,1.252078,0.555421,1.433109,1.165906
2,0.842583,0.555421,1.272425,1.052789,1.060424
3,1.624464,1.433109,1.052789,1.814469,1.544993
4,1.364833,1.165906,1.060424,1.544993,1.396186


In [165]:
print("The covariance matrix in C (Higham) is: ")
df_Y

The covariance matrix in C (Higham) is: 


Unnamed: 0,0,1,2,3,4
0,1.470494,1.333849,0.898344,1.629949,1.40333
1,1.333849,1.252084,0.640153,1.460364,1.223906
2,0.898344,0.640153,1.272426,1.070811,1.060339
3,1.629949,1.460364,1.070811,1.814478,1.57656
4,1.40333,1.223906,1.060339,1.57656,1.396198


## E.2 Covariance matrices in D

In [166]:
print("The covariance matrix in D is: ")
overlap_cov_matrix


The covariance matrix in D is: 


Unnamed: 0,x1,x2,x3,x4,x5
x1,0.418604,0.394054,0.424457,0.416382,0.434287
x2,0.394054,0.396786,0.409343,0.398401,0.422631
x3,0.424457,0.409343,0.44136,0.428441,0.448957
x4,0.416382,0.398401,0.428441,0.437274,0.440167
x5,0.434287,0.422631,0.448957,0.440167,0.466272


**Explanation**

The values in two psd matrices is similar to each other, ranging 1.0 from 1.8. The PSD matrices show greater variation in their diagonal elements. The values vary with each other. The The vaues in overlapping covariance matrix is relative small, only ranging 0.39 from 0.47. It has relatively close diagonal elements. The values are more similar within the matrix.

The overlapping data only capture relationships with the overlapping window, lack lots of information. This lead to the underestimation for the true covariance values. 