# Library

In [1]:
import numpy as np
import pandas as pd
import random 
import tensorflow as tf
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Data

In [2]:
def load_data(path = 'sp500_final.xlsx') :
    
    """
    This function load the dataset
    """
    
    df = pd.read_excel(path)
    # drop the column with date
    df.drop('Unnamed: 0', axis = 1, inplace = True)
    return df

In [3]:
df_SP500 = load_data()
print('The dataset have ' + str(df_SP500.shape[0]) + " lines " + str(df_SP500.shape[1]) + " columns")

The dataset have 1259 lines 477 columns


In [4]:
def process_data(df = df_SP500) :
    """
    Process the dataframe
    """
    # Replace null value by the previous one
    df= df.fillna(method='ffill')
    # Replace null value by 0
    #df.fillna(0, inplace = True)
    df.dropna(axis=1, inplace=True)
    df_SP500.dropna(axis=1, inplace = True)
    # Change the type of SP500 to numeric
    df['SP500'] = pd.to_numeric(df['SP500'],errors='coerce')
    return df

In [5]:
df_SP500 = process_data()

In [6]:
df_SP500.head(5)

Unnamed: 0,SP500,MMM,AOS,ABT,ABBV,ABMD,ACN,ATVI,ADBE,AAP,...,WMB,WYNN,XEL,XRX,XLNX,XYL,YUM,ZBRA,ZBH,ZION
0,1462.42,84.319835,15.253797,28.796655,29.375119,13.69,61.870948,10.445002,38.34,71.832049,...,26.360638,102.828047,23.082288,25.542926,32.872246,25.553543,43.440158,40.96,64.964546,21.255891
1,1459.37,84.221975,15.015346,29.892814,29.132557,13.45,61.646973,10.507033,37.75,71.832049,...,26.159234,103.9341,23.014845,25.542926,32.810206,25.469914,43.711821,41.0,65.895354,21.284889
2,1466.47,84.844721,15.178248,29.713116,28.7652,13.41,61.987415,10.659724,38.13,72.951792,...,26.445847,105.188206,23.099148,25.258721,32.349339,25.432745,44.190464,40.6716,66.231211,21.893857
3,1461.89,84.951478,15.010624,29.955709,28.823081,13.2495,61.718645,10.669267,37.94,72.704061,...,26.329652,105.545279,22.854668,25.685028,32.216397,25.098226,43.912334,40.9,66.394347,21.78753
4,1457.15,84.960374,15.296293,29.964694,28.195765,13.21,62.077006,10.487947,38.14,71.514953,...,26.004308,105.454095,22.89682,25.649503,31.560547,24.791583,42.068908,40.93,66.490305,21.352552


In [7]:
def split_df(df, target = 'SP500'):
    """
    Inputs : Dataframe and the name of the target column set by default to SP550
    Output : dictionary of 4 DataFrames with keys : x_train,y_train,  x_test, y_test
    
    """
    
    # Create features and target
    x = df.drop(target, axis = 1)
    y = df[[target]].values

    # Echantillon de test (20%) et train (80%)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2 , random_state = 100, shuffle=False)
    
    return {'x_train' : x_train, 'x_test' : x_test, 'y_train' : y_train, 'y_test' : y_test}

In [8]:
# for example if we want get x_train 
x_train = split_df(df_SP500)['x_train']

# Select randomly K assets from index assets

In [9]:
# We fix the seed in order to have the same result
np.random.seed(100)
# We choose randomly 50 assets
random_columns = np.random.choice(df_SP500.columns[1:], size=50, replace=False)
# We get the value of these fifty assets
df_random_asset1_ = df_SP500[random_columns]

# selected sparse portfolio's asset returns
df_random_asset1 = df_random_asset1_.pct_change()

# Add the target columns SP500 in the dataframe with 50 assets
df_random_asset1['SP500'] = df_SP500['SP500']

In [10]:
# Drop the first line with NaN valus
df_random_asset = df_random_asset1[1 :]

In [11]:
df_random_asset.head(5)

Unnamed: 0,RJF,UHS,FISV,STX,DISCA,GOOGL,CBOE,MO,HBI,CMS,...,CAG,JPM,SYY,SLB,DLR,AVY,FLS,MS,TGT,SP500
1,-0.003502,0.003019,-0.007505,-0.017797,0.000764,0.000581,0.013276,0.000615,0.016297,0.002821,...,-0.002326,-0.002015,-0.004389,-0.006723,-0.004072,0.007046,0.002039,-0.002039,0.022781,1459.37
2,0.026857,0.00321,0.000372,0.002186,0.008393,0.01976,0.023583,-0.000614,0.025988,0.008842,...,0.007328,0.017725,0.002834,0.015369,0.003505,0.019871,0.008009,0.031154,0.006981,1466.47
3,0.002444,0.0242,0.00632,0.001008,-0.002421,-0.004363,0.00288,-0.002459,-0.008892,-0.011952,...,-0.002315,0.001102,-0.002826,0.002777,-0.001601,0.006586,0.001368,-0.019316,0.011885,1461.89
4,-0.001219,-0.018746,0.004802,-0.002786,0.029278,-0.001973,-0.00351,-0.009858,-0.007885,-0.001613,...,-0.000663,0.001982,-0.010076,0.004155,-0.001603,-0.019357,0.008325,-0.007576,-0.010767,1457.15
5,0.011475,0.011144,0.015196,0.066446,-0.001326,0.00657,0.011848,-0.007467,-0.00274,0.002019,...,0.008292,-0.000659,-0.017812,-0.004275,0.007299,0.006394,-0.002451,-0.001527,-0.007586,1461.02


In [12]:
# Split the dataset into train (80%) and test(20%)
x_train = split_df(df_random_asset)['x_train']
print(f'shape of x_train :{x_train.shape}')

y_train = split_df(df_random_asset)['y_train']
print(f'shape of y_train :{y_train.shape}')

x_test = split_df(df_random_asset)['x_test']
print(f'shape of x_test :{x_test.shape}')

y_test = split_df(df_random_asset)['y_test']
print(f'shape of y_test :{y_test.shape}')

shape of x_train :(1006, 50)
shape of y_train :(1006, 1)
shape of x_test :(252, 50)
shape of y_test :(252, 1)


# Loss Function

In [15]:
rb = tf.convert_to_tensor(np.array(split_df(df_random_asset)['y_train']), dtype = tf.float32)

x = tf.convert_to_tensor(np.array(split_df(df_random_asset)['x_train']), dtype = tf.float32)



def f(w, x = x ,rb = rb ):
    """
        This function compute the loss
        input :
            x : 2 D arrays 
            rb : 1D array
            w -> 1D arrays dim K = 50
        output : loss
    """
     
    
    loss = tf.constant(0 , dtype = tf.float32) # Inilizing loss function
    L = x.shape[0] 
    lam = 0.0001

    for i in range(L) :
        loss += ((rb - tf.matmul(x, tf.transpose(w)))**2 + lam*tf.matmul(w, tf.transpose(w)))
    return np.sum(loss)

In [16]:
# initialize weight w
w = tf.Variable(initial_value = [[0.02]*50], dtype = tf.float32)
#w = tf.convert_to_tensor(np.array([[0.2]*50]), dtype = tf.float64)
#w = tf.Variable(tf.ones([1, 50], dtype=tf.float64))

f(w)

3820138300000.0

# Gradient descent ???

In [None]:
N = 200

learning_rate = 0.1

optimizer = tf.keras.optimizers.SGD(learning_rate)


w = tf.Variable(initial_value = [[0.2]*50], dtype = tf.float32, trainable=True)
#x1=tf.Variable(4, dtype=tf.float32,trainable=True)


for n in range(N):

    with tf.GradientTape() as tape:

        value=f(w)

        gradient=tape.gradient(value,[w])

        optimizer.apply_gradients(zip(gradient,[w]))
        
print(f'w={w.numpy()}')

In [None]:
w = tf.Variable(initial_value = [[0.5]*50], dtype = tf.float32)
w

# Model

We seek to determine the weights w such that :

$ f(w) = \sum_{k=1}^{K} w_k X_{kt} = W^T X = X_{t}^{b} $


In [90]:
class Model:
    def __init__(self, learning_rate = 0.001, iters = 20):
        self.learning_rate = learning_rate
        self.iters = iters
        self.weights = None
        self.w = tf.Variable(initial_value = [[0.2]*50], dtype = tf.float32)
        
    def __call__(self, X):
        x = tf.convert_to_tensor(X, dtype = tf.float32)
        y_est = tf.matmul(self.w , tf.transpose(x))
        
        return y_est
    
    
    
    def loss_funct(w, x,rb ):
        """
            This function compute the loss
            input :
                x : 2 D arrays (dependant variable)
                rb : 1D array (y_predite)
                w -> 1D arrays dim K = 50 (weights)
            output : loss
        """
        loss = tf.constant(0 , dtype = tf.float32) # Inilizing loss function 
        L = x.shape[0] # L = 50 Numbers of assets
        lam = 0.0001



        for i in range(L) :
            loss += ((rb - tf.matmul(x, tf.transpose(w)))**2 + lam*tf.matmul(w, tf.transpose(w)))
        return np.sum(loss)
    
model = Model()

In [91]:
model(x)

<tf.Tensor: shape=(1, 1006), dtype=float32, numpy=
array([[ 0.00326998,  0.10093625, -0.02280224, ...,  0.02953928,
        -0.08914218,  0.00653147]], dtype=float32)>

In [93]:
def train(model, inputs, outputs, learning_rate):
    y_true = tf.convert_to_tensor(output)
    
    
    #GradientTape
    with tf.GrdientTape() as g:
        y_pred = model(inputs)
        current_loss = loss_func(y_true, y_pred)
        
        dw = g.gradient(current_loss, [model.w])
        # update
        model.w.assign_sub(dw*learning_rate)

In [94]:
model(x)

<tf.Tensor: shape=(1, 1006), dtype=float32, numpy=
array([[ 0.00326998,  0.10093625, -0.02280224, ...,  0.02953928,
        -0.08914218,  0.00653147]], dtype=float32)>

In [None]:
class model:
    def __init__(self, learning_rate = 0.001, iters = 20):
        self.learning_rate = learning_rate
        self.iters == iters
        self.weights = None
        
    def fit(self, X, y):
        # shape of the matix X
        n_samples, n_features = X.shape
        
        # init weights
        weights = np.zeros(n_features)
        
    
    def predict(self, X):
        output = np.dot(X, self.weights)
        return output

In [None]:
N = 200

learning_rate = 0.1

optimizer = tf.keras.optimizers.SGD(learning_rate)

w = np.array([[4.0]*50]) # Inilizing the weights w as array of dim len x_train


for n in range(N):

    with tf.GradientTape() as tape:

        value=fu(w)

        gradient=tape.gradient(value,[w])

        optimizer.apply_gradients(zip(gradient,[w]))
        
