In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
def resample_prices(close_prices, freq='M'):
    """
    Down-sample close prices, keeping the last observation of each period.

    Parameters
    ----------
    close_prices : DataFrame
        Close prices for each ticker (columns) and date (index)
    freq : str
        Pandas offset alias giving the target sampling frequency, see
        http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases

    Returns
    -------
    prices_resampled : DataFrame
        One row per period holding the final close price seen in that period
    """
    # Bucket the rows by period first, then pick the closing (last) value
    # of every bucket.
    grouped_by_period = close_prices.resample(freq)
    return grouped_by_period.last()

In [3]:
def compute_log_returns(prices):
    """
    Compute one-period log returns for each ticker.

    Parameters
    ----------
    prices : DataFrame
        Prices for each ticker and date

    Returns
    -------
    log_returns : DataFrame
        log(price_t) - log(price_{t-1}) for each ticker and date; the first
        row is NaN because it has no previous price
    """
    # Take the logarithm once and difference it against its own lag.
    log_prices = np.log(prices)
    previous_log_prices = log_prices.shift(1)
    return log_prices - previous_log_prices

In [4]:
def shift_returns(returns, shift_n):
    """
    Shift the returns along the date axis.

    Parameters
    ----------
    returns : DataFrame
        Returns for each ticker and date
    shift_n : int
        Number of periods to move; positive values push rows later in time,
        negative values pull them earlier

    Returns
    -------
    shifted_returns : DataFrame
        Returns shifted by ``shift_n`` periods, with NaN in the vacated rows
    """
    shifted_returns = returns.shift(periods=shift_n)
    return shifted_returns

In [5]:
def get_top_n(prev_returns, top_n):
    """
    Select the top performing stocks
    
    Parameters
    ----------
    prev_returns : DataFrame
        Previous shifted returns for each ticker and date
    top_n : int
        The number of top performing stocks to get
    
    Returns
    -------
    top_stocks : DataFrame
        Top stocks for each ticker and date marked with a 1
    """
    # TODO: Implement Function
    
    # First we create the empty dataframe
    top_stocks = pd.DataFrame(columns=prev_returns.columns)

    # Now we iterate for every row
    for index, row in prev_returns.iterrows():
        #and append to the empty database every old row but with only the largest numbers as data
        top_stocks.loc[index] = (row.nlargest(top_n))
    # At returns, we apply a boolean check and convert the result to integers numbers (true=1, False=0) 
    return top_stocks.notna().astype(np.int)

In [6]:
def portfolio_returns(df_long, df_short, lookahead_returns, n_stocks):
    """
    Compute expected returns for the portfolio, assuming an equal investment
    in each long/short stock.

    Parameters
    ----------
    df_long : DataFrame
        Top stocks for each ticker and date marked with a 1
    df_short : DataFrame
        Bottom stocks for each ticker and date marked with a 1
    lookahead_returns : DataFrame
        Lookahead returns for each ticker and date
    n_stocks : int
        The number of stocks chosen for each month

    Returns
    -------
    portfolio_returns : DataFrame
        Expected portfolio returns for each ticker and date
    """
    # +1 for a long position, -1 for a short position, 0 when not held.
    net_positions = df_long - df_short

    # Each held position earns its lookahead return; dividing by n_stocks
    # spreads the investment equally across the selected stocks.
    position_returns = net_positions * lookahead_returns
    return position_returns / n_stocks

In [7]:
def analyze_alpha(expected_portfolio_returns_by_date):
    """
    Perform a t-test with the null hypothesis being that the expected mean
    return is zero.

    Parameters
    ----------
    expected_portfolio_returns_by_date : Pandas Series
        Expected portfolio returns for each date

    Returns
    -------
    t_value
        T-statistic from the one-sample t-test
    p_value
        Half of the two-sided p-value reported by ``stats.ttest_1samp``
    """
    null_hypothesis_mean = 0
    test_result = stats.ttest_1samp(expected_portfolio_returns_by_date, null_hypothesis_mean)

    # The two-sided p-value is halved before being returned.
    one_sided_p_value = test_result.pvalue / 2
    return test_result.statistic, one_sided_p_value


In [8]:
def get_high_lows_lookback(high, low, lookback_days):
    """
    Get the highs and lows in a lookback window.

    Parameters
    ----------
    high : DataFrame
        High price for each ticker and date
    low : DataFrame
        Low price for each ticker and date
    lookback_days : int
        The number of days to look back

    Returns
    -------
    lookback_high : DataFrame
        Lookback high price for each ticker and date
    lookback_low : DataFrame
        Lookback low price for each ticker and date
    """
    # Lag both frames by one row so that the window for a given date only
    # contains the days strictly before it.
    prior_high = high.shift(1)
    prior_low = low.shift(1)

    # Rolling extreme over the previous `lookback_days` rows; rows without a
    # full window come out as NaN.
    lookback_high = prior_high.rolling(lookback_days).max()
    lookback_low = prior_low.rolling(lookback_days).min()

    return lookback_high, lookback_low


In [9]:
def get_long_short(close, lookback_high, lookback_low):
    """
    Generate the signals long, short, and do nothing.

    Parameters
    ----------
    close : DataFrame
        Close price for each ticker and date
    lookback_high : DataFrame
        Lookback high price for each ticker and date
    lookback_low : DataFrame
        Lookback low price for each ticker and date

    Returns
    -------
    long_short : DataFrame
        1 where the close is above the lookback high, -1 where it is below
        the lookback low, and 0 otherwise (int64)
    """
    # Comparisons against NaN are False, so dates without a full lookback
    # window produce neither signal.
    breaks_above = (lookback_high < close).astype('int64')
    breaks_below = (lookback_low > close).astype('int64')

    return breaks_above - breaks_below

In [10]:
def clear_signals(signals, window_size):
    """
    Clear out signals in a Series of just long or short signals.

    Remove the number of signals down to 1 within the window size time period.

    Parameters
    ----------
    signals : Pandas Series
        The long, short, or do nothing signals
    window_size : int
        The number of days to have a single signal

    Returns
    -------
    signals : Pandas Series
        Integer Series on the same index in which every kept signal is
        followed by ``window_size`` cleared (zero) entries
    """
    clean_signals = []
    # Number of upcoming entries that must still be suppressed because a
    # signal was kept within the previous `window_size` entries.
    days_blocked = 0

    for current_signal in signals:
        if days_blocked > 0:
            # A signal was kept recently: drop whatever arrives now.
            clean_signals.append(0)
            days_blocked -= 1
        else:
            clean_signals.append(current_signal)
            if current_signal:
                # Start the quiet period after a kept (non-zero) signal.
                days_blocked = window_size

    # Cast with the builtin `int`; the `np.int` alias was removed in
    # NumPy 1.24 and raises AttributeError there.
    return pd.Series(np.array(clean_signals).astype(int), signals.index)

In [11]:
def filter_signals(signal, lookahead_days):
    """
    Filter out signals in a DataFrame.

    Parameters
    ----------
    signal : DataFrame
        The long, short, and do nothing signals for each ticker and date
    lookahead_days : int
        The number of days to look ahead

    Returns
    -------
    filtered_signal : DataFrame
        The filtered long, short, and do nothing signals for each ticker and date
    """
    filtered_signal = signal.copy()

    # Each ticker is cleaned on its own, and long and short signals are
    # cleaned separately because clear_signals expects a single kind of
    # signal per call.
    for ticker in signal.columns:
        ticker_signal = signal[ticker]

        kept_longs = clear_signals(ticker_signal > 0, lookahead_days)
        kept_shorts = clear_signals(ticker_signal < 0, lookahead_days)

        # Recombine: kept longs count as +1, kept shorts as -1.
        filtered_signal[ticker] = kept_longs - kept_shorts

    return filtered_signal


In [12]:
def get_lookahead_prices(close, lookahead_days):
    """
    Get the lookahead prices for `lookahead_days` number of days.

    Parameters
    ----------
    close : DataFrame
        Close price for each ticker and date
    lookahead_days : int
        The number of days to look ahead

    Returns
    -------
    lookahead_prices : DataFrame
        For each ticker and date, the close price `lookahead_days` rows
        later; the trailing rows have no future price and are NaN
    """
    # A negative shift pulls future rows back onto the current date.
    return close.shift(periods=-lookahead_days)

In [13]:
def get_return_lookahead(close, lookahead_prices):
    """
    Calculate the log returns from the lookahead days to the signal day.

    Parameters
    ----------
    close : DataFrame
        Close price for each ticker and date
    lookahead_prices : DataFrame
        The lookahead prices for each ticker and date

    Returns
    -------
    lookahead_returns : DataFrame
        The lookahead log returns for each ticker and date
    """
    # Log return = log(future price) - log(current price).
    log_future = np.log(lookahead_prices)
    log_current = np.log(close)
    return log_future - log_current

In [14]:
def get_signal_return(signal, lookahead_returns):
    """
    Compute the signal returns.

    Parameters
    ----------
    signal : DataFrame
        The long, short, and do nothing signals for each ticker and date
    lookahead_returns : DataFrame
        The lookahead log returns for each ticker and date

    Returns
    -------
    signal_return : DataFrame
        Signal returns for each ticker and date
    """
    # Element-wise product: a long keeps the return, a short flips its sign,
    # and "do nothing" zeroes it out.
    return signal.mul(lookahead_returns)

In [15]:
from scipy.stats import kstest


def calculate_kstest(long_short_signal_returns):
    """
    Calculate the KS-Test against the signal returns with a long or short signal.

    Parameters
    ----------
    long_short_signal_returns : DataFrame
        The signal returns which have a signal.
        This DataFrame contains two columns, "ticker" and "signal_return"

    Returns
    -------
    ks_values : Pandas Series
        KS statistic for all the tickers
    p_values : Pandas Series
        P value for all the tickers
    """
    # Work on a copy so the caller's frame is left untouched.
    standardized = long_short_signal_returns.copy()

    # Standardize using the mean and population standard deviation (ddof=0)
    # of ALL signal returns, so each ticker is later compared against the
    # standard normal distribution on a common scale.
    raw_returns = standardized['signal_return']
    standardized['signal_return'] = (raw_returns - raw_returns.mean()) / raw_returns.std(ddof=0)

    # Run one KS test per ticker against the standard normal.
    per_ticker = standardized.groupby('ticker').signal_return
    outcomes = [(ticker, kstest(ticker_returns, 'norm')) for ticker, ticker_returns in per_ticker]

    tickers = [ticker for ticker, _ in outcomes]
    ks_values = pd.Series([result[0] for _, result in outcomes], index=tickers)
    p_values = pd.Series([result[1] for _, result in outcomes], index=tickers)

    return ks_values, p_values

In [16]:
def find_outliers(ks_values, p_values, ks_threshold, pvalue_threshold=0.05):
    """
    Find outlying symbols using KS values and P-values

    Parameters
    ----------
    ks_values : Pandas Series
        KS statistic for all the tickers
    p_values : Pandas Series
        P value for all the tickers
    ks_threshold : float
        The threshold for the KS statistic
    pvalue_threshold : float
        The threshold for the p-value

    Returns
    -------
    outliers : set of str
        Symbols whose KS statistic is above `ks_threshold` and whose p-value
        is below `pvalue_threshold`
    """
    # A ticker is an outlier only when BOTH conditions hold.
    return {
        ticker
        for ticker in ks_values.index
        if (ks_values.loc[ticker] > ks_threshold) and (p_values.loc[ticker] < pvalue_threshold)
    }

In [18]:
def generate_dollar_volume_weights(close, volume):
    """
    Generate dollar volume weights.

    Parameters
    ----------
    close : DataFrame
        Close price for each ticker and date
    volume : DataFrame
        Volume for each ticker and date

    Returns
    -------
    dollar_volume_weights : DataFrame
        The dollar volume weights for each ticker and date
    """
    assert close.index.equals(volume.index)
    assert close.columns.equals(volume.columns)

    # Dollar volume = price * shares traded, per ticker and date.
    dollar_volume = close * volume

    # Total dollar volume traded across all tickers on each date.
    total_per_date = dollar_volume.sum(axis=1)

    # Normalize each row by its own total.
    return dollar_volume.div(total_per_date, axis=0)

In [19]:
def calculate_dividend_weights(dividends):
    """
    Calculate dividend weights.

    Parameters
    ----------
    dividends : DataFrame
        Dividend for each stock and date

    Returns
    -------
    dividend_weights : DataFrame
        Weights for each stock and date
    """
    # Running total of the dividends paid by each stock up to each date.
    cumulative_dividends = dividends.cumsum()

    # Running total across all stocks on each date.
    total_per_date = cumulative_dividends.sum(axis=1)

    # Each stock's share of the cumulative dividends on that date.
    return cumulative_dividends.div(total_per_date, axis=0)

In [20]:
def generate_returns(prices):
    """
    Generate returns for ticker and date.

    Parameters
    ----------
    prices : DataFrame
        Price for each ticker and date

    Returns
    -------
    returns : Dataframe
        The simple returns for each ticker and date; the first row is NaN
        because it has no previous price
    """
    # Simple return: today's price over the previous row's price, minus one.
    previous_prices = prices.shift(1)
    return prices / previous_prices - 1

In [21]:
def generate_weighted_returns(returns, weights):
    """
    Generate weighted returns.

    Parameters
    ----------
    returns : DataFrame
        Returns for each ticker and date
    weights : DataFrame
        Weights for each ticker and date

    Returns
    -------
    weighted_returns : DataFrame
        Weighted returns for each ticker and date
    """
    assert returns.index.equals(weights.index)
    assert returns.columns.equals(weights.columns)

    # Element-wise product: each return is scaled by the weight of the same
    # ticker on the same date.
    weighted_returns = returns.mul(weights)
    return weighted_returns


In [22]:
def calculate_cumulative_returns(returns):
    """
    Calculate cumulative returns.

    Parameters
    ----------
    returns : DataFrame
        Returns for each ticker and date

    Returns
    -------
    cumulative_returns : Pandas Series
        Cumulative returns for each date
    """
    # Collapse the tickers into a single return per date.
    total_return_by_date = returns.sum(axis=1)

    # Turn each return into a growth factor and compound the factors over time.
    growth_factors = total_return_by_date + 1
    return growth_factors.cumprod(axis=0)

In [23]:
def tracking_error(benchmark_returns_by_date, etf_returns_by_date, n_trading_days_in_year=252):
    """
    Calculate the tracking error.

    Parameters
    ----------
    benchmark_returns_by_date : Pandas Series
        The benchmark returns for each date
    etf_returns_by_date : Pandas Series
        The ETF returns for each date
    n_trading_days_in_year : int
        Number of return periods per year used to annualize the result
        (defaults to 252 trading days)

    Returns
    -------
    tracking_error : float
        The annualized tracking error
    """
    assert benchmark_returns_by_date.index.equals(etf_returns_by_date.index)

    # Difference between the ETF and the benchmark on each date.
    active_returns = etf_returns_by_date - benchmark_returns_by_date

    # Sample standard deviation (ddof=1) of the differences, scaled by the
    # square root of the number of periods per year to annualize it.
    return np.sqrt(n_trading_days_in_year) * np.std(active_returns, ddof=1)

In [24]:
def get_covariance_returns(returns):
    """
    Calculate covariance matrices.

    Parameters
    ----------
    returns : DataFrame
        Returns for each ticker and date

    Returns
    -------
    returns_covariance  : 2 dimensional Ndarray
        The covariance of the returns
    """
    # Missing returns are treated as zero returns before estimating.
    zero_filled_returns = returns.fillna(0)

    # rowvar=False: every column (ticker) is a variable and every row (date)
    # is an observation.
    returns_covariance = np.cov(zero_filled_returns, rowvar=False)
    return returns_covariance

In [25]:
import cvxpy as cvx

def get_optimal_weights(covariance_returns, index_weights, scale=2.0):
    """
    Find the optimal weights.

    Parameters
    ----------
    covariance_returns : 2 dimensional Ndarray
        The covariance of the returns
    index_weights : Pandas Series
        Index weights for all tickers at a period in time
    scale : float
        The penalty factor for weights that deviate from the index

    Returns
    -------
    x : 1 dimensional Ndarray
        The solution for x
    """
    assert len(covariance_returns.shape) == 2
    assert len(index_weights.shape) == 1
    assert covariance_returns.shape[0] == covariance_returns.shape[1]  == index_weights.shape[0]

    # One decision variable (portfolio weight) per stock.
    n_assets = len(index_weights)
    x = cvx.Variable(n_assets)

    # Portfolio variance expressed as a quadratic form in the weights.
    variance_term = cvx.quad_form(x, covariance_returns)

    # Norm of the gap between the portfolio weights and the index weights.
    tracking_term = cvx.norm(x - index_weights)

    # Minimize variance plus the scaled distance to the index, subject to
    # non-negative weights that sum to one.
    problem = cvx.Problem(
        cvx.Minimize(variance_term + scale * tracking_term),
        [x >= 0, sum(x) == 1],
    )
    problem.solve()

    # After solving, the variable holds the optimal weights.
    return x.value

ModuleNotFoundError: No module named 'cvxpy'

In [26]:
def rebalance_portfolio(returns, index_weights, shift_size, chunk_size):
    """
    Get weights for each rebalancing of the portfolio.

    Parameters
    ----------
    returns : DataFrame
        Returns for each ticker and date
    index_weights : DataFrame
        Index weight for each ticker and date
    shift_size : int
        The number of days between each rebalance
    chunk_size : int
        The number of days to look in the past for rebalancing

    Returns
    -------
    all_rebalance_weights  : list of Ndarrays
        The ETF weights for each point they are rebalanced
    """
    assert returns.index.equals(index_weights.index)
    assert returns.columns.equals(index_weights.columns)
    assert shift_size > 0
    assert chunk_size >= 0

    all_rebalance_weights = []

    # Start at `chunk_size` so the first window already has a full history,
    # then step forward `shift_size` rows per rebalance.
    for window_end in range(chunk_size, len(returns), shift_size):
        # Covariance of the `chunk_size` rows that precede the rebalance point.
        lookback_returns = returns.iloc[window_end - chunk_size:window_end, :]
        covariance = get_covariance_returns(lookback_returns)

        # Index weights of the last row inside the lookback window.
        latest_index_weights = index_weights.iloc[window_end - 1, :]

        all_rebalance_weights.append(get_optimal_weights(covariance, latest_index_weights))

    return all_rebalance_weights




In [27]:
def get_portfolio_turnover(all_rebalance_weights, shift_size, rebalance_count, n_trading_days_in_year=252):
    """
    Calculate portfolio turnover.

    Parameters
    ----------
    all_rebalance_weights : list of Ndarrays
        The ETF weights for each point they are rebalanced
    shift_size : int
        The number of days between each rebalance
    rebalance_count : int
        Number of times the portfolio was rebalanced
    n_trading_days_in_year: int
        Number of trading days in a year

    Returns
    -------
    portfolio_turnover  : float
        The portfolio turnover
    """
    assert shift_size > 0
    assert rebalance_count > 0

    # One row per rebalance, one column per stock.
    weights_by_rebalance = pd.DataFrame(all_rebalance_weights)

    # Absolute weight change between each rebalance and the following one.
    # The last row has no successor; its NaN differences are skipped by sum().
    next_weights = weights_by_rebalance.shift(-1)
    weight_changes = (weights_by_rebalance - next_weights).abs()
    total_turnover = weight_changes.sum().sum()

    # Scale by the number of rebalances per year and average over the
    # rebalances performed.
    rebalances_per_year = n_trading_days_in_year / shift_size
    return total_turnover * rebalances_per_year / rebalance_count