In [1]:
from technical_features import BuildFeatures
import pandas as pd
from tqdm import tqdm  # Import tqdm for progress bar
import warnings

warnings.filterwarnings("ignore")


In [2]:
sp1500 = pd.read_excel("../../data/SPR as of Feb 12 20251.xlsx")
tickers = sp1500['Ticker'].str.split(" ").str[0]

In [3]:
# Initialize a list to store IC results for all tickers
all_ic_results = []

# List to store error logs
error_logs = []

# Use tqdm to wrap the tickers list for a progress bar
for tick in tqdm(tickers, desc="Processing Tickers", unit="ticker"):

    # Initialize the BuildFeatures object for the current ticker
    obj = BuildFeatures(tick)
    obj.build_technical_features()  # Generate technical features for the ticker
    
    # Calculate daily returns from the stock's closing prices
    obj.stock["Return"] = obj.stock["Close"].pct_change()  # Daily percentage change
    
    # Merge technical features with the calculated returns
    data = obj.technical_features.copy()
    data["Return"] = obj.stock["Return"]
    
    # Remove rows with missing values
    data = data.dropna()
    
    # Calculate the IC (Information Coefficient) for each factor
    ic_values = {}
    for factor in data.columns[:-1]:  # Iterate over all columns except "Return"
        ic = data[factor].corr(data["Return"], method="spearman")  # Spearman correlation
        ic_values[factor] = ic  # Store the IC for the factor
    
    # Add the ticker information and store results in the list
    for factor, ic in ic_values.items():
        all_ic_results.append({"Ticker": tick, "Factor": factor, "IC": ic})


# Convert the results list to a DataFrame
all_ic_results_df = pd.DataFrame(all_ic_results)
all_ic_results_df.to_csv("all_factor_ic_results.csv", index=False)

Processing Tickers:   0%|          | 0/1506 [00:00<?, ?ticker/s]

Processing Tickers:   8%|▊         | 118/1506 [01:00<10:41,  2.16ticker/s]Failed to get ticker 'BRK/B' reason: Expecting value: line 1 column 1 (char 0)
$BRK/B: possibly delisted; no timezone found
Processing Tickers:   9%|▊         | 129/1506 [01:06<11:44,  1.95ticker/s]Failed to get ticker 'BF/B' reason: Expecting value: line 1 column 1 (char 0)
$BF/B: possibly delisted; no timezone found
Processing Tickers:  30%|███       | 454/1506 [04:19<10:57,  1.60ticker/s]Failed to get ticker 'MOG/A' reason: Expecting value: line 1 column 1 (char 0)
$MOG/A: possibly delisted; no timezone found
Processing Tickers:  89%|████████▉ | 1341/1506 [11:19<01:09,  2.39ticker/s]Failed to get ticker 'CWEN/A' reason: Expecting value: line 1 column 1 (char 0)
$CWEN/A: possibly delisted; no timezone found
Processing Tickers:  92%|█████████▏| 1378/1506 [11:36<00:55,  2.31ticker/s]Could not get exchangeTimezoneName for ticker 'SITC' reason: 'chart'
$SITC: possibly delisted; no timezone found
Processing Tickers:

       Ticker                              Factor        IC
0        VRSK                              SMA_50  0.010022
1        VRSK                             SMA_200  0.015788
2        VRSK                    Chaikin_ADI_Line  0.023624
3        VRSK              Chaikin_ADI_Oscillator  0.190907
4        VRSK                           ADM_Index  0.002483
...       ...                                 ...       ...
180715     RC          Midpoint_Price_over_period  0.000618
180716     RC            Lowest_value_over_period  0.002879
180717     RC   Index_of_lowest_value_over_period  0.011467
180718     RC           Highest_value_over_period -0.001741
180719     RC  Index_of_highest_value_over_period  0.011352

[180720 rows x 3 columns]


In [4]:
all_ic_results_df

Unnamed: 0,Ticker,Factor,IC
0,VRSK,SMA_50,0.010022
1,VRSK,SMA_200,0.015788
2,VRSK,Chaikin_ADI_Line,0.023624
3,VRSK,Chaikin_ADI_Oscillator,0.190907
4,VRSK,ADM_Index,0.002483
...,...,...,...
180715,RC,Midpoint_Price_over_period,0.000618
180716,RC,Lowest_value_over_period,0.002879
180717,RC,Index_of_lowest_value_over_period,0.011467
180718,RC,Highest_value_over_period,-0.001741


In [27]:
all_ic_results_df = pd.read_csv("all_factor_ic_results.csv")
factor_mean_ic = all_ic_results_df.groupby("Factor")["IC"].mean().reset_index()
factor_mean_ic.rename(columns={"IC": "Mean_IC"}, inplace=True)
top_30_factors = factor_mean_ic.sort_values(by="Mean_IC", ascending=False).head(30)


In [28]:
top_30_factors

Unnamed: 0,Factor,Mean_IC
6,Balance_of_Power,0.772767
57,Long_Line_Candle,0.514793
117,Williams_R,0.44308
14,Commodity_Channel_Index,0.358207
13,Chande_Momentum_Oscillator,0.324027
91,Relative_Strength_Index,0.324009
66,Marubozu,0.305188
19,Engulfing_Pattern,0.292736
89,Rate_of_Change_Ratio,0.275301
88,Rate_of_Change_Percentage,0.275301


In [3]:
zacks_fc = pd.read_csv("../../data/fundamental_data/ZACKS_FC.csv", delimiter=',')
zacks_fc['per_end_date'] = pd.to_datetime(zacks_fc['per_end_date'])
zacks_fc['per_end_date'] = zacks_fc['per_end_date'].dt.tz_localize('US/Eastern')
zacks_fc = zacks_fc.sort_values('per_end_date').set_index('per_end_date')

zacks_fr = pd.read_csv("../../data/fundamental_data/ZACKS_FR.csv", delimiter=',')
zacks_fr['per_end_date'] = pd.to_datetime(zacks_fr['per_end_date'])
zacks_fr['per_end_date'] = zacks_fr['per_end_date'].dt.tz_localize('US/Eastern')
zacks_fr = zacks_fr.sort_values('per_end_date').set_index('per_end_date')

# zacks_hdm = pd.read_csv("../data/fundamental_data/ZACKS_HDM.csv")
# zacks_mktval = pd.read_csv("../data/fundamental_data/ZACKS_MKTV.csv")
# zacks_mt = pd.read_csv("../data/fundamental_data/ZACKS_MT.csv")
# zacks_shrs = pd.read_csv("../data/fundamental_data/ZACKS_SHRS.csv")

zack_data = {}
zack_data["zacks_fc"] = zacks_fc
zack_data["zacks_fr"] = zacks_fr

In [5]:
from fundamental_features_new import BuildFeatures

In [19]:
all_ic_results = []

# List to store error logs
error_logs = []

# Use tqdm to wrap the tickers list for a progress bar
for tick in tqdm(tickers, desc="Processing Tickers", unit="ticker"):

    # Initialize the BuildFeatures object for the current ticker
    obj = BuildFeatures(zack_data, tick)
    obj.process_financial_statements()  # Generate technical features for the ticker
    
    # Merge technical features with the calculated returns
    data = obj.fundamental_features.copy()
    data["Return"] = data["Close"].pct_change()  

    # Calculate the IC (Information Coefficient) for each factor
    ic_values = {}
    for factor in data.columns[:-1]:  # Iterate over all columns except "Return"
        try:
            ic = data[factor].corr(data["Return"], method="spearman")  # Spearman correlation
        except: 
            continue
        ic_values[factor] = ic  # Store the IC for the factor
    
    # Add the ticker information and store results in the list
    for factor, ic in ic_values.items():
        all_ic_results.append({"Ticker": tick, "Factor": factor, "IC": ic})


# Convert the results list to a DataFrame
all_ic_results_df = pd.DataFrame(all_ic_results)
all_ic_results_df.to_csv("fundamental_ic_results.csv", index=False)

Processing Tickers:   8%|▊         | 118/1506 [00:39<07:06,  3.25ticker/s]Failed to get ticker 'BRK/B' reason: Expecting value: line 1 column 1 (char 0)
$BRK/B: possibly delisted; no timezone found
Processing Tickers:   9%|▊         | 129/1506 [00:44<08:50,  2.60ticker/s]Failed to get ticker 'BF/B' reason: Expecting value: line 1 column 1 (char 0)
$BF/B: possibly delisted; no timezone found
Processing Tickers:  13%|█▎        | 195/1506 [01:08<07:41,  2.84ticker/s]$ETR: possibly delisted; no price data found  (1d 1926-03-10 -> 2025-02-13)
Processing Tickers:  30%|███       | 454/1506 [02:58<06:52,  2.55ticker/s]  Failed to get ticker 'MOG/A' reason: Expecting value: line 1 column 1 (char 0)
$MOG/A: possibly delisted; no timezone found
Processing Tickers:  46%|████▋     | 698/1506 [04:19<03:55,  3.43ticker/s]$LXP: possibly delisted; no price data found  (1d 1926-03-10 -> 2025-02-13)
Processing Tickers:  58%|█████▊    | 880/1506 [05:12<03:16,  3.19ticker/s]$OUT: possibly delisted; no pric

In [21]:
all_ic_results_df

Unnamed: 0,Ticker,Factor,IC
0,VRSK,total revenue,-0.008774
1,VRSK,ebitda,0.020056
2,VRSK,eps,-0.048773
3,VRSK,Dividend Payout Ratio,
4,VRSK,total_current_assets,-0.043230
...,...,...,...
57304,RC,ret_invst,0.172869
57305,RC,free_cash_flow_per_share,0.057096
57306,RC,book_val_per_share,-0.113314
57307,RC,oper_cash_flow_per_share,0.040846


In [22]:
factor_mean_ic = all_ic_results_df.groupby("Factor")["IC"].mean().reset_index()
factor_mean_ic.rename(columns={"IC": "Mean_IC"}, inplace=True)


In [23]:
factor_mean_ic

Unnamed: 0,Factor,Mean_IC
0,Close,0.197943
1,Dividend Payout Ratio,
2,asset_turn,0.007277
3,book_val_per_share,0.001526
4,comb_ratio,-0.069644
5,comp_name,
6,comp_name_2,
7,curr_ratio,0.02119
8,currency_code,
9,day_sale_rcv,-0.002087
