In [1]:
import sys
import os

# Resolve the project root as the parent of the current working directory.
# NOTE: this depends on the directory the kernel was started in.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Path of the `src` directory under the project root.
src_path = os.path.join(project_root, "src")

# Make `src` importable. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
if src_path not in sys.path:
    sys.path.append(src_path)

In [2]:
import importlib
from Features import technical_features_new

# Reload the module so that edits made to its source are picked up without
# restarting the kernel, then (re-)import the class from the reloaded module.
importlib.reload(technical_features_new)
from Features.technical_features_new import BuildFeatures

In [3]:
import pandas as pd
from tqdm import tqdm  # Import tqdm for progress bar
import warnings
import numpy as np

# NOTE(review): this installs a blanket "ignore" filter, so every warning
# (not only known-noisy ones) is hidden for the rest of the session.
warnings.filterwarnings("ignore")

In [4]:
# Read the constituent spreadsheet, then keep only the part of each "Ticker"
# value that comes before the first space.
sp1500 = pd.read_excel("../data/SPR as of Feb 12 20251.xlsx")
ticker_fields = sp1500["Ticker"].str.split(" ")
tickers = ticker_fields.str.get(0)

In [5]:
# Display the extracted ticker symbols.
tickers

0       VRSK
1       EVTC
2         AA
3        PFG
4       NXPI
        ... 
1501     EHC
1502    EMBC
1503     EXC
1504    TMUS
1505      RC
Name: Ticker, Length: 1506, dtype: object

In [11]:
import pickle as pkl
import gzip

# One record per (ticker, factor) pair: {"Ticker", "Factor", "IC"}.
all_ic_results = []
# Tickers that were skipped for an unexpected reason (unreadable file, missing
# target column), kept so that real problems stay visible after the run.
skipped_tickers = []

data_path = "../data/technical_features"

# Only rows whose index is strictly before this date are used.
cutoff_date = "2014-01-01"
# Tickers with fewer complete rows than this are skipped.
min_rows = 20

# tqdm wraps the ticker list to show a progress bar.
for tick in tqdm(tickers, desc="Processing Tickers", unit="ticker"):
    file_path = os.path.join(data_path, f"{tick}.pkl.gz")

    # Load the stored feature table for the current ticker.
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    try:
        with gzip.open(file_path, "rb") as f:
            data = pkl.load(f)
    except FileNotFoundError:
        # No feature file for this ticker: nothing to do.
        continue
    except Exception as exc:
        # Still best-effort, but unlike a bare `except:` this does not swallow
        # KeyboardInterrupt/SystemExit, and the failure is recorded.
        skipped_tickers.append({"Ticker": tick, "Reason": repr(exc)})
        continue

    data = data[data.index < cutoff_date]

    # Remove rows with missing values
    data = data.dropna()

    if len(data) < min_rows:
        continue

    # Without the target column the correlation below would raise KeyError.
    if "Return" not in data.columns:
        skipped_tickers.append({"Ticker": tick, "Reason": "no 'Return' column"})
        continue

    # Calculate the IC (Information Coefficient) of every factor: the Spearman
    # correlation between the factor and "Return". The target is excluded by
    # name, so the result does not depend on "Return" being the last column.
    target = data["Return"]
    for factor in data.columns.drop("Return"):
        ic = data[factor].corr(target, method="spearman")
        all_ic_results.append({"Ticker": tick, "Factor": factor, "IC": ic})


# Convert the results list to a DataFrame. The explicit column list keeps the
# expected schema even when no ticker produced any result.
all_ic_results_df = pd.DataFrame(all_ic_results, columns=["Ticker", "Factor", "IC"])
# all_ic_results_df.to_csv("all_factor_ic_results.csv", index=False)

Processing Tickers: 100%|██████████| 1506/1506 [00:28<00:00, 52.73ticker/s]


In [12]:
# Display the per-ticker, per-factor IC table.
all_ic_results_df

Unnamed: 0,Ticker,Factor,IC
0,VRSK,SMA_50,-0.04383761
1,VRSK,SMA_200,-0.01661130
2,VRSK,Chaikin_ADI_Line,-0.05064419
3,VRSK,Chaikin_ADI_Oscillator,-0.09926262
4,VRSK,ADM_Index,-0.18985496
...,...,...,...
140113,TMUS,Lowest_value_over_period,-0.19418339
140114,TMUS,Index_of_lowest_value_over_period,0.08175721
140115,TMUS,Highest_value_over_period,-0.19737416
140116,TMUS,Index_of_highest_value_over_period,0.08175721


In [13]:
# To reuse previously saved results instead of the in-memory frame:
# all_ic_results_df = pd.read_csv("all_factor_ic_results.csv")

# Average each factor's IC over all tickers.
factor_mean_ic = (
    all_ic_results_df.groupby("Factor")["IC"]
    .mean()
    .rename("Mean_IC")
    .reset_index()
)

# The 30 factors with the highest (signed) mean IC.
ranked_factors = factor_mean_ic.sort_values(by="Mean_IC", ascending=False)
top_30_factors = ranked_factors.head(30)


In [14]:
# Display the mean IC of every factor.
factor_mean_ic

Unnamed: 0,Factor,Mean_IC
0,ADM_Index,-0.00942805
1,ADM_Index_Rating,-0.00492322
2,Absolute_Price_Oscillator,-0.00485795
3,All_Moving_Average,-0.09689396
4,Average_Price,-0.10389486
...,...,...
116,Weighted_Close_Price,-0.10488385
117,Weighted_Moving_Average,-0.09769146
118,Williams_R,-0.06466734
119,one_day_rate_of_change_of_a_triple_smooth_ema,-0.01110176


In [15]:
# Look up the mean IC row of the "Close" factor.
factor_mean_ic.loc[factor_mean_ic["Factor"] == "Close"]

Unnamed: 0,Factor,Mean_IC
14,Close,-0.10607177


In [45]:
# Display the factors ranked highest by mean IC.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; confirm the output with Restart Kernel -> Run All.
top_30_factors

Unnamed: 0,Factor,Mean_IC
68,Mat_Hold,0.04529421
73,Minus_Directional_Indicator,0.03676635
48,Inverted_Hammer,0.01928142
25,Gravestone_Doji,0.01813985
79,Normalized_Average_True_Range,0.01396811
17,Doji_Star,0.0111366
111,Unique_3_River,0.00773571
47,Index_of_lowest_value_over_period,0.00735988
46,Index_of_highest_value_over_period,0.00734633
103,Stick_Sandwich,0.00716104


In [68]:
# One record per (ticker, factor) pair: {"Ticker", "Factor", "IC"}.
all_ic_results = []
# Tickers that were skipped for an unexpected reason (unreadable file, missing
# target column), kept so that real problems stay visible after the run.
skipped_tickers = []

data_path = "../data/fundamental_features"

# Only rows whose index is strictly before this date are used.
cutoff_date = "2018-01-01"
# A column is kept only if at least this fraction of its rows is non-missing.
min_non_null_fraction = 0.3

# tqdm wraps the ticker list to show a progress bar.
for tick in tqdm(tickers, desc="Processing Tickers", unit="ticker"):
    file_path = os.path.join(data_path, f"{tick}.pkl.gz")

    # Load the stored feature table for the current ticker.
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    try:
        with gzip.open(file_path, "rb") as f:
            data = pkl.load(f)
            data = data.select_dtypes(include=[float, int])  # Keeps only numeric columns
    except FileNotFoundError:
        # No feature file for this ticker: nothing to do.
        continue
    except Exception as exc:
        # Still best-effort, but unlike a bare `except:` this does not swallow
        # KeyboardInterrupt/SystemExit, and the failure is recorded.
        skipped_tickers.append({"Ticker": tick, "Reason": repr(exc)})
        continue

    # Drop columns that are mostly empty. `thresh` is the minimum number of
    # non-missing values a column needs; rounding up keeps exactly the same
    # columns as the fractional threshold while passing the documented int.
    # NOTE(review): the threshold is evaluated on the full history, i.e. before
    # the cutoff filter below - confirm this is intended.
    threshold = int(np.ceil(len(data) * min_non_null_fraction))
    data = data.dropna(axis=1, thresh=threshold)

    # The target can be removed by the dtype filter or the column filter above;
    # without it the correlation below would raise KeyError.
    if "Return" not in data.columns:
        skipped_tickers.append({"Ticker": tick, "Reason": "no usable 'Return' column"})
        continue

    # Remove rows with missing values
    data = data.dropna()
    if data.empty:
        continue

    data = data[data.index < cutoff_date]
    if data.empty:
        continue

    # Calculate the IC (Information Coefficient) of every factor: the Spearman
    # correlation between the factor and "Return". The target is excluded by
    # name, so the result does not depend on "Return" being the last column.
    target = data["Return"]
    for factor in data.columns.drop("Return"):
        ic = data[factor].corr(target, method="spearman")
        all_ic_results.append({"Ticker": tick, "Factor": factor, "IC": ic})

# Convert the results list to a DataFrame. The explicit column list keeps the
# expected schema even when no ticker produced any result.
all_ic_results_df = pd.DataFrame(all_ic_results, columns=["Ticker", "Factor", "IC"])
# all_ic_results_df.to_csv("all_factor_ic_results.csv", index=False)

Processing Tickers: 100%|██████████| 1506/1506 [00:08<00:00, 178.48ticker/s]


In [69]:
# Display the per-ticker, per-factor IC table for the fundamental features.
all_ic_results_df

Unnamed: 0,Ticker,Factor,IC
0,VRSK,total revenue,-0.04363976
1,VRSK,ebitda,-0.08187753
2,VRSK,eps,-0.13735382
3,VRSK,total_current_assets,-0.08377601
4,VRSK,per_fisc_year,-0.04442607
...,...,...,...
37183,RC,ret_asset,0.06048457
37184,RC,ret_invst,0.05763678
37185,RC,free_cash_flow_per_share,0.12623135
37186,RC,book_val_per_share,-0.10382049


In [70]:
# Average each factor's IC over all tickers.
factor_mean_ic = (
    all_ic_results_df.groupby("Factor")["IC"]
    .mean()
    .rename("Mean_IC")
    .reset_index()
)


In [71]:
# Display the mean IC of every fundamental factor.
factor_mean_ic

Unnamed: 0,Factor,Mean_IC
0,Close,-0.10398485
1,asset_turn,-0.01148144
2,book_val_per_share,-0.01657695
3,comb_ratio,0.03402489
4,curr_ratio,0.00211287
5,day_sale_rcv,-0.00890317
6,ebit_margin,-0.0272258
7,ebitda,-0.00501802
8,eps,-0.00787459
9,exp_ratio,0.02687046
