In [158]:
import DataHub as hub

In [159]:
# Import numpy and pandas 
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import calendar

# If the observations are in a dataframe, you can use statsmodels.formulas.api to do the regression instead
from statsmodels import regression

from sklearn.linear_model import LinearRegression
linreg = LinearRegression()

In [213]:
name_list=['BetaExp', 'DividendYieldExp',
       'EarningsQualityExp', 'EarningsYieldExp', 'GrowthExp', 'LeverageExp',
       'LiquidityExp', 'LongTermReversalExp', 'ManagementQualityExp',
       'MidCapitalizationExp', 'MomentumExp', 'ProfitabilityExp',
       'ProspectExp', 'SizeExp', 'ValueExp', 'ResidualVolatilityExp',
       'AirlinesExp', 'AluminumSteelExp', 'ApparelandTextilesExp',
       'AutomobilesandComponentsExp', 'BanksExp', 'BeveragesTobaccoExp',
       'BiotechnologyLifeSciencesExp', 'BuildingProductsExp', 'ChemicalsExp',
       'CommercialandProfessionalServicesExp', 'CommunicationsEquipmentExp',
       'ComputersElectronicsExp', 'ConstructionMaterialsExp',
       'ConstructionandEngineeringExp', 'ConstructionandFarmMachineryExp',
       'ContainersandPackagingExp', 'DistributorsMultilineRetailExp',
       'DiversifiedFinancialsExp', 'DiversifiedTelecommunicationServicesExp',
       'ElectricUtilitiesExp', 'ElectricalEquipmentExp', 'FoodProductsExp',
       'FoodandStaplesRetailingExp', 'GasUtilitiesExp',
       'HealthCareEquipmentandTechnologyExp', 'HealthCareProvidersExp',
       'HomebuildingExp', 'HotelsLeisureandConsumerServicesExp',
       'HouseholdDurablesExp', 'HouseholdandPersonalProductsExp',
       'IndustrialConglomeratesExp', 'IndustrialMachineryExp',
       'InsuranceBrokersandReinsuranceExp', 'InternetSoftwareandITServicesExp',
       'InternetandCatalogRetailExp',
       'LeisureProductsTextilesApparelandLuxuryExp',
       'LifeHealthandMultilineInsuranceExp', 'ManagedHealthCareExp',
       'MediaExp', 'MultiUtilitiesWaterUtilitiesPowerExp',
       'OilGasandConsumableFuelsExp', 'OilandGasDrillingExp',
       'OilandGasEquipmentandServicesExp',
       'OilandGasExplorationandProductionExp', 'PaperandForestProductsExp',
       'PharmaceuticalsExp', 'PreciousMetalsGoldMiningExp', 'RealEstateExp',
       'RestaurantsExp', 'RoadandRailExp', 'SemiconductorEquipmentExp',
       'SemiconductorsExp', 'SoftwareExp', 'SpecialtyChemicalsExp',
       'SpecialtyRetailExp', 'SpecialtyStoresExp',
       'TradingCompaniesandDistributorsExp',
       'TransportationAirFreightandMarineExp',
       'WirelessTelecommunicationServicesExp']
style_factor=['BetaExp', 'DividendYieldExp',
       'EarningsQualityExp', 'EarningsYieldExp', 'GrowthExp', 'LeverageExp',
       'LiquidityExp', 'LongTermReversalExp', 'ManagementQualityExp',
       'MidCapitalizationExp', 'MomentumExp', 'ProfitabilityExp',
       'ProspectExp', 'SizeExp', 'ValueExp', 'ResidualVolatilityExp']
N18='return_shift'

In [237]:

# use eigenvalue1 to store the first eigen of residuals in each month        
eigenvalue1=[]
# use eigenvalue2 to store the first eigen of raw return in each month
eigenvalue2=[]
# use yymm to store the year and month for drawing data
yymm=[]

# 
eigenvalue_lack_factor=pd.DataFrame()

for year in range(2018,2019):
    for month in range(3,4):
        yymm.append(year*100+month)
        mrange=calendar.monthrange(year,month)
        start_date= '%d%02d%02d'%(year,month,1)
        end_date = '%d%02d%02d'%(year,month,mrange[1])
        # load monthly data
        loadings = h.read('DailyFactor', start=start_date, end=end_date)
        # count(distinct(date)) to get the number of available dates, N
        Ndays=loadings['date'].unique().shape[0]
        # count(*) group by TICKER_US to get a table with number of records (dates) for each ticker
        loadings_ndays=loadings[['date', 'ticker']].groupby('ticker').count().reset_index()
        tickers=loadings_ndays[loadings_ndays['date']==Ndays]['ticker']
        # Filter the table above by selecting the ticker with number of records (days) equal to N
        loadings=loadings.loc[loadings['ticker'].isin(list(tickers))]
        # Note that the Return data stored in risklab has left out the percentage
        loadings['ReturnPct']=loadings['ReturnPct']*0.01
        
        # shifting tomorrow's return to today's row
        loadings['return_shift'] = loadings.groupby('ticker')['ReturnPct'].shift(-1)
        loadings=loadings.dropna()
        # set multiindex       
        loadings=loadings.set_index(['date', 'ticker']).sort_index()
        # draw out the timestamp for every month
        M=loadings.index.get_level_values(0)
        Timestamp = []
        for i in M:
            if not i in Timestamp:
                Timestamp.append(i)



        residual = pd.DataFrame()
        for i in range(len(Timestamp)):
            Xi = pd.DataFrame()
            # For each month, we load 'retun_shift' 
            Yi = np.array((loadings.loc[(Timestamp[i],slice(None)),'return_shift']).reset_index(drop=True))

            for j in range(len(name_list)):
                # For each month, we load all the exposures and store them as a dataframe. The columns of the dataframe are the regressors we will use.
                Xij = np.array((loadings.loc[(Timestamp[i],slice(None)),name_list[j]]).reset_index(drop=True))
                Xi[j] = Xij
            # linear regression
            linreg.fit(Xi, Yi)
            residuali = np.array(Yi - linreg.predict(Xi))
            residual[i]=residuali
        residualT=residual.T
        residualT=residualT.fillna(method='ffill')
        residual=residualT.T
        P= np.dot(residualT.values,residual.values)
        P=P/(residualT.shape[1])
        eigenvalue12_1,featurevector=np.linalg.eig(P)
        eigenvalue12_1=sorted(list(eigenvalue12_1),reverse=True)
        eigenvalue1.append(np.max(eigenvalue12_1))

        # Construct a dataframe only for return
        Ri=pd.DataFrame()
        for i in range(len(Timestamp)):
            Rij = np.array((loadings.loc[(Timestamp[i],slice(None)),N18]).reset_index(drop=True))
            Ri[i]=Rij
        Return_shift=Ri.T
        Return_shift=Return_shift.fillna(method='ffill')
        Return_shiftT=Return_shift.T
        Q= np.dot(Return_shift.values,Return_shiftT.values)
        Q = Q/(Ri.shape[0])
        eigenvalue12_2,featurevector2=np.linalg.eig(Q)
        eigenvalue12_2=sorted(list(eigenvalue12_2),reverse=True)
        eigenvalue2.append(np.max(eigenvalue12_2))
       
    
        eigenvalue_lack_factor_array=[]
        for k in range(16):
            # Remove one style factor at a time
            name_list_lack_factor=name_list[:k]+name_list[k+1:]
            residual = pd.DataFrame()
            for i in range(len(Timestamp)):
                Xi = pd.DataFrame()
                Yi = np.array((loadings.loc[(Timestamp[i],slice(None)),N18]).reset_index(drop=True))

                for j in range(len(name_list_lack_factor)):
                    Xij = np.array((loadings.loc[(Timestamp[i],slice(None)),name_list_lack_factor[j]]).reset_index(drop=True))
                    Xi[j] = Xij
                linreg.fit(Xi, Yi)
                residuali = Yi - linreg.predict(Xi)
                residual[i]=residuali
            residualT=residual.T
            residualT=residualT.fillna(method='ffill')
            residual=residualT.T
            P= np.dot(residualT.values,residual.values)
            P=P/(residualT.shape[1])
            eigenvalue12_1_lack_factor,featurevector=np.linalg.eig(P)
            eigenvalue12_1_lack_factor=sorted(list(eigenvalue12_1_lack_factor),reverse=True)
            eigenvalue_lack_factor_array.append(np.max(eigenvalue12_1_lack_factor))
        
        for m in len(yymm):
            eigenvalue_lack_factor[m]=np.array(eigenvalue_lack_factor_array)

            
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


TypeError: 'int' object is not iterable

In [231]:
eigenvalue1

[0.0005326890526634271, 0.0006236050052613793]

In [232]:
eigenvalue2

[0.003255472085076872, 0.00216137135929158]

In [233]:
eigenvalue_lack_factor

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.000627,0.000624,0.000624,0.000624,0.000624,0.000624,0.000626,0.000624,0.000624,0.000626,0.000627,0.000624,0.000624,0.000624,0.000623,0.000628


In [None]:
residual_eigens=eigenvalue1
residual_eigens_mom=eigenvalue1_mom
residual_eigens_str=eigenvalue1_str
residual_eigens_size=eigenvalue1_size
residual_eigens_value=eigenvalue1_value
residual_eigens_vol=eigenvalue1_vol
residual_eigens_beta=eigenvalue1_beta
return_eigens=eigenvalue2

In [None]:
import matplotlib.pyplot as plt

date = pd.period_range("2006-01", freq="M", periods=156)
df1 = pd.DataFrame({"date":date, "first_eigens" : np.array(return_eigens)})
df2 = pd.DataFrame({"date":date, "first_eigens" : np.array(residual_eigens)})
df2_mom = pd.DataFrame({"date":date, "first_eigens" : np.array(residual_eigens_mom)})
df2_str = pd.DataFrame({"date":date, "first_eigens" : np.array(residual_eigens_str)})
df2_size = pd.DataFrame({"date":date, "first_eigens" : np.array(residual_eigens_size)})
df2_value = pd.DataFrame({"date":date, "first_eigens" : np.array(residual_eigens_value)})
df2_vol = pd.DataFrame({"date":date, "first_eigens" : np.array(residual_eigens_vol)})
df2_beta = pd.DataFrame({"date":date, "first_eigens" : np.array(residual_eigens_beta)})

df1['eigen_class'] = 'return_eigens'
df2_mom['eigen_class'] = 'residual_eigens_lack_mom'
df2_str['eigen_class'] = 'residual_eigens_lack_str'
df2_size['eigen_class'] = 'residual_eigens_lack_size'
df2_value['eigen_class'] = 'residual_eigens_lack_value'
df2_vol['eigen_class'] = 'residual_eigens_lack_vol'
df2_beta['eigen_class'] = 'residual_eigens_lack_beta'
df2['eigen_class'] = 'residual_eigens_complete'

df = pd.concat([df1,df2,df2_mom,df2_str,df2_beta,df2_value,df2_vol,df2_size])
x_col='date'
y_col = 'first_eigens'
fig = plt.figure(figsize=(50,4))
fig.add_subplot(1,1,1)
sns.pointplot(x=x_col,y=y_col,data=df,hue='eigen_class')

In [None]:
contribution=np.mean(ratio, axis=0)
print('The average contribution of %s is %.5f'% ('size', contribution[0]))
print('The average contribution of %s is %.5f'% ('mom', contribution[1]))
print('The average contribution of %s is %.5f'% ('str', contribution[2]))
print('The average contribution of %s is %.5f'% ('value', contribution[3]))
print('The average contribution of %s is %.5f'% ('vol', contribution[4]))
print('The average contribution of %s is %.5f'% ('beta', contribution[5]))


In [None]:

date = pd.period_range("2006-01", freq="M", periods=156)
df_size = pd.DataFrame({"date":date, "variance_ratio" : [i[0] for i in ratio]})
df_mom = pd.DataFrame({"date":date, "variance_ratio" : [i[1] for i in ratio]})
df_str = pd.DataFrame({"date":date, "variance_ratio" : [i[2] for i in ratio]})
df_value = pd.DataFrame({"date":date, "variance_ratio" : [i[3] for i in ratio]})
df_vol = pd.DataFrame({"date":date, "variance_ratio" : [i[4] for i in ratio]})
df_beta = pd.DataFrame({"date":date, "variance_ratio" : [i[5] for i in ratio]})

df_size['contribution'] = 'size_variance_ratio_contribution'
df_mom['contribution'] = 'mom_variance_ratio_contribution'
df_str['contribution'] = 'str_variance_ratio_contribution'
df_value['contribution'] = 'value_variance_ratio_contribution'
df_vol['contribution'] = 'vol_variance_ratio_contribution'
df_beta['contribution'] = 'beta_variance_ratio_contribution'


df = pd.concat([df_size, df_mom, df_str, df_value, df_vol, df_beta])
x_col='date'
y_col = 'variance_ratio'
sns.pointplot(x=x_col,y=y_col,data=df,alpha=0.0001,hue='contribution')