In [1]:
# Loading packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Load the daily price file (with 'datadate' parsed as datetimes) and order the
# rows by ticker, then by date, so each ticker's rows are in chronological order.
data = (
    pd.read_csv('daily.csv', parse_dates=['datadate'], low_memory=False)
    .sort_values(by=['tic', 'datadate'])
)

# Per-ticker percentage change of 'prccd' between consecutive rows; the first
# row of every ticker has no predecessor and therefore gets NaN.
# to_numeric(errors='coerce') turns anything non-numeric in the result into NaN.
data['ret'] = pd.to_numeric(
    data.groupby('tic')['prccd'].pct_change(),
    errors='coerce',
)


In [3]:
# Choose stocks which have full trading dates during this period.
# A ticker is kept only when its row count equals 105.
# NOTE(review): 105 is presumably the number of trading dates in daily.csv —
# confirm and update if the file changes.
data_summary = (
    data.groupby('tic')
    .agg(number=('tic', 'size'), gic=('gsector', 'mean'))
    .reset_index()
)
data_summary = data_summary[data_summary['number'] == 105]

print(data_summary)

# Stocks that experienced a consecutive three-day decline in the last seven days are ruled out.
# Take the last 9 rows per ticker: the 7 screened days plus the 2 preceding days
# that the earliest screened day needs in order to detect a three-day streak.
# .copy() makes data_1 an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
data_1 = (
    data[data['tic'].isin(data_summary['tic'])]
    .sort_values(by=['tic', 'datadate'])
    .groupby('tic')
    .tail(9)
    .copy()
)

# Shift within each ticker so a streak can never be assembled from the rows of
# two different tickers (NaN comparisons evaluate to False).
ret_by_tic = data_1.groupby('tic')['ret']
data_1['consecutive_neg'] = (
    (ret_by_tic.shift(2) < 0)
    & (ret_by_tic.shift(1) < 0)
    & (data_1['ret'] < 0)
)

# Keep only the 7 screened days, then drop every ticker that has a flagged day.
data_1 = data_1.groupby('tic').tail(7)
data_1 = data_1.groupby('tic').filter(lambda x: not x['consecutive_neg'].any())

      tic  number   gic
0       A     105  35.0
1     AAL     105  20.0
2    AAPL     105  45.0
3    ABBV     105  35.0
4    ABNB     105  25.0
..    ...     ...   ...
640   XYL     105  20.0
641   YUM     105  25.0
642   ZBH     105  35.0
643  ZBRA     105  45.0
644   ZTS     105  35.0

[620 rows x 3 columns]


# Method 1: Using Cumulative Returns to select stocks

In [4]:
# Calculate cumulative returns
# Restrict to the screened tickers and drop rows without a return; .copy() makes
# data_2 an independent frame so the new column can be added safely.
data_2 = data[data['tic'].isin(data_1['tic'])].dropna(subset=['ret']).copy()
# Running product of (1 + daily return) within each ticker.
data_2['cumret'] = (1 + data_2['ret']).groupby(data_2['tic']).cumprod()
# Keep each ticker's last row (its final cumulative value), highest first.
data_2 = data_2.groupby('tic').tail(1).sort_values(by='cumret', ascending=False)


# Select stocks based on cumulative returns
cumret_base_select = []
count = {}  # sector code -> number of ranked candidates seen so far in that sector
gic_all = data['gsector'].unique()
# Sector codes that must each reach two candidates before the selection stops.
required_sectors = [int(g) for g in gic_all if pd.notna(g)]

# Walk the ranking from best to worst over every candidate row, reading the
# ticker and sector by column name rather than by column position.
for tic, sector in zip(data_2['tic'], data_2['gsector']):
    gic_index = int(sector)
    count[gic_index] = count.get(gic_index, 0) + 1
    if count[gic_index] > 5:
        # This sector already contributed five stocks; skip further ones.
        continue
    cumret_base_select.append(tic)
    # Stop as soon as every sector found in the data has at least two candidates.
    if all(count.get(g, 0) >= 2 for g in required_sectors):
        break
cumret_base_select = [x for x in cumret_base_select if pd.notna(x)]

# Data normalization and standardization
scaler = MinMaxScaler()
standardizer = StandardScaler()
data_cum = data.loc[
    data['tic'].isin(cumret_base_select),
    ['datadate', 'tic', 'gsector', 'prccd', 'ret'],
].copy()
# NOTE(review): each scaler is fit on the column pooled across all selected
# tickers, not per ticker — confirm this is the intended scaling.
# fit_transform returns an (n, 1) array; ravel() flattens it to one column.
data_cum['n_prccd'] = scaler.fit_transform(data_cum[['prccd']]).ravel()
data_cum['n_ret'] = scaler.fit_transform(data_cum[['ret']]).ravel()
data_cum['s_prccd'] = standardizer.fit_transform(data_cum[['prccd']]).ravel()
data_cum['s_ret'] = standardizer.fit_transform(data_cum[['ret']]).ravel()
# Drop rows with any missing value (e.g. each ticker's first row, which has no return).
data_cum = data_cum.dropna()



# Method 2: Using Standard Deviation of Returns to select stocks

In [5]:
# Calculate standard deviations
# Restrict to the screened tickers and drop rows without a return; .copy() makes
# data_3 an independent frame so the new column can be added safely.
data_3 = data[data['tic'].isin(data_1['tic'])].dropna(subset=['ret']).copy()
# Standard deviation of each ticker's returns, broadcast to all of its rows.
data_3['sd'] = data_3.groupby('tic')['ret'].transform('std')
# One row per ticker, least volatile first.
data_3 = data_3.groupby('tic').tail(1).sort_values(by='sd', ascending=True)

# Select stocks based on standard deviations
sd_base_select = []
count = {}  # sector code -> number of ranked candidates seen so far in that sector
# Sector codes that must each reach two candidates before the selection stops.
required_sectors = [int(g) for g in gic_all if pd.notna(g)]

# Walk the ranking from lowest to highest volatility over every candidate row,
# reading the ticker and sector by column name rather than by column position.
for tic, sector in zip(data_3['tic'], data_3['gsector']):
    gic_index = int(sector)
    count[gic_index] = count.get(gic_index, 0) + 1
    if count[gic_index] > 5:
        # This sector already contributed five stocks; skip further ones.
        continue
    sd_base_select.append(tic)
    # Stop as soon as every sector found in the data has at least two candidates.
    if all(count.get(g, 0) >= 2 for g in required_sectors):
        break


sd_base_select = [x for x in sd_base_select if pd.notna(x)]


# Data normalization and standardization
data_sd = data.loc[
    data['tic'].isin(sd_base_select),
    ['datadate', 'tic', 'gsector', 'prccd', 'ret'],
].copy()
# NOTE(review): each scaler is fit on the column pooled across all selected
# tickers, not per ticker — confirm this is the intended scaling.
# fit_transform returns an (n, 1) array; ravel() flattens it to one column.
data_sd['n_prccd'] = scaler.fit_transform(data_sd[['prccd']]).ravel()
data_sd['n_ret'] = scaler.fit_transform(data_sd[['ret']]).ravel()
data_sd['s_prccd'] = standardizer.fit_transform(data_sd[['prccd']]).ravel()
data_sd['s_ret'] = standardizer.fit_transform(data_sd[['ret']]).ravel()
# Drop rows with any missing value (e.g. each ticker's first row, which has no return).
data_sd = data_sd.dropna()


# Method 3: Using Training Set (70) Average Returns to select stocks

In [6]:
# Calculate average returns
# Only rows dated on or before 2024-04-12 are used for the ranking.
data_4 = data[(data['tic'].isin(data_1['tic'])) & (data['datadate'] <= '2024-04-12')].copy()
# Mean return of each ticker over that window (NaN returns are ignored by mean).
data_4['avg'] = data_4.groupby('tic')['ret'].transform('mean')
# One row per ticker, highest average return first.
data_4 = data_4.groupby('tic').tail(1).sort_values(by='avg', ascending=False)



# Select stocks based on average returns
avg_base_select = []
count = {}  # sector code -> number of ranked candidates seen so far in that sector
# Sector codes that must each reach two candidates before the selection stops.
required_sectors = [int(g) for g in gic_all if pd.notna(g)]

# Walk the ranking from best to worst over every candidate row, reading the
# ticker and sector by column name rather than by column position.
for tic, sector in zip(data_4['tic'], data_4['gsector']):
    gic_index = int(sector)
    count[gic_index] = count.get(gic_index, 0) + 1
    if count[gic_index] > 5:
        # This sector already contributed five stocks; skip further ones.
        continue
    avg_base_select.append(tic)
    # Stop as soon as every sector found in the data has at least two candidates.
    if all(count.get(g, 0) >= 2 for g in required_sectors):
        break

avg_base_select = [x for x in avg_base_select if pd.notna(x)]

# Data normalization and standardization
data_avg = data.loc[
    data['tic'].isin(avg_base_select),
    ['datadate', 'tic', 'gsector', 'prccd', 'ret'],
].copy()
# NOTE(review): each scaler is fit on the column pooled across all selected
# tickers and over all dates, not per ticker or on the training window only —
# confirm this is the intended scaling.
# fit_transform returns an (n, 1) array; ravel() flattens it to one column.
data_avg['n_prccd'] = scaler.fit_transform(data_avg[['prccd']]).ravel()
data_avg['n_ret'] = scaler.fit_transform(data_avg[['ret']]).ravel()
data_avg['s_prccd'] = standardizer.fit_transform(data_avg[['prccd']]).ravel()
data_avg['s_ret'] = standardizer.fit_transform(data_avg[['ret']]).ravel()
# Drop rows with any missing value (e.g. each ticker's first row, which has no return).
data_avg = data_avg.dropna()
print(data_avg)

        datadate  tic  gsector   prccd       ret   n_prccd     n_ret  \
43969 2024-01-03  ALL       40  145.00  0.008275  0.044751  0.403731   
43970 2024-01-04  ALL       40  148.50  0.024138  0.045831  0.420275   
43971 2024-01-05  ALL       40  149.98  0.009966  0.046288  0.405495   
43972 2024-01-08  ALL       40  149.38 -0.004001  0.046103  0.390928   
43973 2024-01-09  ALL       40  150.98  0.010711  0.046597  0.406271   
...          ...  ...      ...     ...       ...       ...       ...   
46754 2024-05-24  WRK       15   54.13  0.020935  0.016698  0.416935   
46755 2024-05-28  WRK       15   53.32 -0.014964  0.016448  0.379494   
46756 2024-05-29  WRK       15   53.43  0.002063  0.016482  0.397252   
46757 2024-05-30  WRK       15   54.24  0.015160  0.016732  0.410911   
46758 2024-05-31  WRK       15   53.64 -0.011062  0.016547  0.383564   

        s_prccd     s_ret  
43969 -0.282289  0.210486  
43970 -0.274591  0.795834  
43971 -0.271336  0.272903  
43972 -0.272656 -0.2424

In [8]:
# Number of distinct tickers selected by Method 3
data_avg['tic'].nunique()

48

In [12]:
# Distinct tickers selected by Method 3, in order of first appearance
data_avg['tic'].unique()

array(['ALL', 'AMD', 'AXP', 'AZO', 'BALL', 'BRO', 'BSX', 'CEG', 'CELG.R',
       'CHD', 'CMI', 'COST', 'CTLT', 'DECK', 'DIS', 'DLR', 'EBAY', 'ETN',
       'EW', 'GE', 'GOOG', 'GOOGL', 'HIG', 'HLT', 'IRM', 'JNPR', 'LLY',
       'MLM', 'MPC', 'NFLX', 'NRG', 'NTAP', 'NVDA', 'OKE', 'ORLY',
       'PARAA', 'PEG', 'PGR', 'STLD', 'STZ', 'TRGP', 'TT', 'UBER', 'VMC',
       'VST', 'WDC', 'WMB', 'WRK'], dtype=object)

In [14]:
# Show the Method 3 frame: raw price/return plus the min-max (n_*) and standardized (s_*) columns
data_avg

Unnamed: 0,datadate,tic,gsector,prccd,ret,n_prccd,n_ret,s_prccd,s_ret
43969,2024-01-03,ALL,40,145.00,0.008275,0.044751,0.403731,-0.282289,0.210486
43970,2024-01-04,ALL,40,148.50,0.024138,0.045831,0.420275,-0.274591,0.795834
43971,2024-01-05,ALL,40,149.98,0.009966,0.046288,0.405495,-0.271336,0.272903
43972,2024-01-08,ALL,40,149.38,-0.004001,0.046103,0.390928,-0.272656,-0.242473
43973,2024-01-09,ALL,40,150.98,0.010711,0.046597,0.406271,-0.269137,0.300379
...,...,...,...,...,...,...,...,...,...
46754,2024-05-24,WRK,15,54.13,0.020935,0.016698,0.416935,-0.482154,0.677664
46755,2024-05-28,WRK,15,53.32,-0.014964,0.016448,0.379494,-0.483935,-0.647023
46756,2024-05-29,WRK,15,53.43,0.002063,0.016482,0.397252,-0.483693,-0.018728
46757,2024-05-30,WRK,15,54.24,0.015160,0.016732,0.410911,-0.481912,0.464550
