# Datapreprocessing
* From CSV to Feather
* Drop NA values

In [7]:
# Install packages using pip in the current Jupyter kernel
import sys
!{sys.executable} -m pip install pyarrow
!{sys.executable} -m pip install python-louvain

Collecting python-louvain
  Downloading python-louvain-0.14.tar.gz (19 kB)
Building wheels for collected packages: python-louvain
  Building wheel for python-louvain (setup.py): started
  Building wheel for python-louvain (setup.py): finished with status 'done'
  Created wheel for python-louvain: filename=python_louvain-0.14-py3-none-any.whl size=9294 sha256=15434e6ae7922dc512c8df456dcea691385bdcc96e1e4ed4b356c064752a6015
  Stored in directory: c:\users\newbi\appdata\local\pip\cache\wheels\22\f9\ce\591ffa9b16851da50ca337c9ecfd44d79a7b87fcbd2a7a0021
Successfully built python-louvain
Installing collected packages: python-louvain
Successfully installed python-louvain-0.14


In [8]:
# this block does not need to be run if the feather file already exists
#usual lines
import numpy as np
import pandas as pd

# import path
input_path = 'Data/us-equities_logreturns.csv'
output_path = 'Data/us-equities_logreturns.feather'

#load data
df = pd.read_csv(input_path)

#delete columns with NaN values
df_clean = df.dropna(axis=1, how="any", thresh=None, subset=None, inplace=False)
df_clean.drop(columns="Unnamed: 0",inplace=True)

# keep track of how many columns we have lost
columns_losses = df_clean.shape[1] - df.shape[1]

#write to feather 
df_clean.to_feather(output_path)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


# Louvain clustering for correlation matrices


In [10]:
from numpy import linalg as LA
import numpy as np
import pandas as pd
import math
import networkx as nx
import community


def compute_C_minus_C0(lambdas,v,lambda_plus):
    N=len(lambdas)
    C_clean=np.zeros((N, N))
  
    # _s stands for _structure below. Note that range(N-1) means that we do not include the maximum eigenvalue
    for i in range(N-1):
        if lambdas[i]>lambda_plus:
            C_clean=C_clean + lambdas[i]*(np.outer(v[:,i],v[:,i]))
    np.fill_diagonal(C_clean,1)
    return C_clean    
    
    
def LouvainCorrelationClustering(R):   # R is a matrix of return
    N=R.shape[1]
    T=R.shape[0]

    q=N*1./T
    lambda_plus=(1.+np.sqrt(q))**2

    C=R.corr()
    lambdas, v = LA.eigh(C)
    
    C_s=compute_C_minus_C0(lambdas,v,lambda_plus)
    C_s=np.abs(C_s)
    
    mygraph= nx.from_numpy_matrix(C_s)
    partition = community.community_louvain.best_partition(mygraph)

    DF=pd.DataFrame.from_dict(partition,orient="index")
    return(DF)

In [30]:
# Define clustering Assets
R=pd.read_feather("Data/us-equities_logreturns.feather")     
myclusteringAssets=LouvainCorrelationClustering(R)
myclusteringAssets.index=R.columns
print("Assets:\n",myclusteringAssets)
print("\nAssets values are:",np.unique(myclusteringAssets.values))

Assets:
      0
AEP  0
FL   1
MRO  2
DTE  0
IP   3
..  ..
SWM  8
ITT  7
MSM  0
HIG  2
BCO  4

[481 rows x 1 columns]

Assets values are: [0 1 2 3 4 5 6 7 8]


# Define a local period and compare the assets 
* R.shape  = (4549, 481) 
* each row = one day
* after a week t0 = 0 and t1= 7, we have decend numbers.
* do we have to keep tracks like 0

In [29]:
# set the time period
t0=1 #starting period
t1=51 #50 days on memory
Rlocal=R[t0:t1].copy()
myclusteringDays=LouvainCorrelationClustering(Rlocal)
myclusteringDays.index=Rlocal.columns
myclusteringDays.columns=["state"]
string_states = ','.join(map(str, myclusteringDays['state']))
print("List of states:\n",myclusteringDays)
print("\nUnique state values are:",np.unique(myclusteringDays.values))
print("\nThe string of states from t0:",t0,"and t1:",t1,"is:\n",string_states)

List of states:
      state
AEP      0
FL       1
MRO      1
DTE      0
IP       2
..     ...
SWM      2
ITT      3
MSM      1
HIG      2
BCO      1

[481 rows x 1 columns]

Unique state values are: [0 1 2 3]

The string of states from t0: 1 and t1: 51 is:
 0,1,1,0,2,0,3,0,2,1,3,3,0,2,3,3,2,1,3,0,3,2,1,3,1,1,0,2,1,1,0,3,2,0,1,3,0,3,2,2,2,1,0,1,1,3,0,1,0,3,0,1,2,2,3,2,1,3,1,0,0,0,1,2,2,0,2,1,1,2,3,2,0,3,1,0,0,3,2,3,1,2,1,0,0,0,3,2,1,0,0,3,2,3,2,0,0,1,0,3,3,2,3,2,1,2,0,0,2,0,2,2,2,3,3,0,0,3,1,3,3,1,3,1,1,1,0,2,1,3,3,1,2,2,0,3,2,0,0,3,3,3,0,1,1,0,3,1,2,1,2,2,2,2,2,1,1,3,0,2,3,2,1,2,3,1,3,2,2,1,0,0,0,0,1,2,3,1,0,3,2,0,3,2,1,3,2,1,2,0,0,2,0,1,0,0,2,2,3,0,1,2,2,2,1,1,3,2,1,3,1,3,2,1,2,1,0,2,2,3,0,3,1,2,1,1,2,2,0,2,3,1,1,2,1,0,3,2,0,0,1,2,2,3,2,1,0,2,3,0,1,1,1,3,0,1,2,1,3,0,2,1,1,3,2,1,2,0,1,1,0,0,1,3,2,2,3,1,3,2,3,0,1,3,1,0,2,0,2,3,1,1,1,3,1,1,1,1,1,0,2,3,2,2,3,3,2,1,2,0,3,0,3,3,1,1,0,0,3,1,1,1,0,2,2,3,1,1,0,2,0,3,0,1,3,1,0,2,2,1,0,3,0,1,3,2,2,2,2,2,3,1,2,2,3,0,1,3,3,2,2,2,2,0,3,2,1,3,1,2,2,