# Feature creation with yfinance data

### Setup

In [99]:
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
import pandas_ta as ta
import warnings

## pandas-ta and ts-fresh.

The pandas-ta documentation is [here](https://twopirllc.github.io/pandas-ta/).

The ts-fresh documentation is [here](https://tsfresh.readthedocs.io/en/latest/text/list_of_features.html)

In [100]:
def create_features(ticker=None, data=None):
    if ticker is None or data is None:
        return None

    aberration_df = ta.aberration(data['High'], data['Low'], data['Close'], length=5, \
                                    atr_length=14)
    data = pd.concat([data, aberration_df], axis=1)

    bbands_df = ta.bbands(data['Close'], length=5, std=2, mamode='sma')
    data = pd.concat([data, bbands_df], axis=1)

    data['AO_5_34'] = ta.ao(data['High'], data['Low'], fast=5, slow=34)
    data['BOP'] = ta.bop(data['Open'], data['High'], data['Low'], data['Close'])
    data['MOM_2'] = ta.mom(data['Close'], length=2)
    data['MOM_4'] = ta.mom(data['Close'], length=4)
    data['MOM_6'] = ta.mom(data['Close'], length=6)
    data['SMA_30'] = ta.sma(data['Close'], length=30)
    data['SMA_60'] = ta.sma(data['Close'],length=60)
    data['SMA_90'] = ta.sma(data['Close'],length=90)
    data['RSI_14'] = ta.rsi(close=data['Close'], length=14)
    # Volume indicators
    data['CMF_20'] = ta.cmf(high=data['High'], low=data['Low'], close=data['Close'], \
                            volume=data['Volume'], length=20)
    data['AD'] = ta.ad(high=data['High'], low=data['Low'], close=data['Close'], \
                            volume=data['Volume'])

    # Calculate the stochastic oscillator
    stoch = ta.stoch(high=data['High'], low=data['Low'], close=data['Close'], k=14, d=3, smooth_k=3)
    # Add the stochastic oscillator values to the DataFrame
    data['%K_14_3_3'] = stoch['STOCHk_14_3_3']
    data['%D_14_3_3'] = stoch['STOCHd_14_3_3']


    data = data.astype('float64')
    data = data.add_prefix(f'{ticker}_')
    # Print the updated DataFrame with Aberration values
    data_filtered = data.iloc[100:]
    

    return data_filtered


In [111]:
from sklearn.feature_selection import mutual_info_classif
from pathlib import Path

# tickers = ['QCOM']
# tickers = ['QCOM', 'ASML', 'MU', 'FSLR', 'ON', 'TSM', 'AMD', 'NVDA', 'INTC']
# tickers = ['MSFT', 'AAPL', 'AMZN', 'GOOGL', 'JNJ', 'JPM', 'SPY', 'BRK-B', 'XOM', 'V', 'PG', 'HD', 'NVDA', 'CVX', 'META', 'PFE', 'MRK', 'PEP', 'ABBV', 'UNH', 'DIS', 'CSCO', 'COST', 'MCD', 'TSLA', 'VZ', 'KO', 'WMT', 'BAC', 'ABT', 'MA', 'LLY', 'INTC', 'IVV', 'QQQ', 'BMY', 'IBM', 'AMGN', 'UNP', 'T', 'ADBE', 'NEE', 'QCOM', 'ORCL', 'RTX', 'TMO', 'NKE', 'SBUX', 'HON', 'CVS', 'LOW', 'CMCSA', 'CAT', 'LMT', 'UPS', 'CRM', 'AVGO', 'BA', 'TXN', 'COP', 'DE', 'ACN', 'DHR', 'WFC', 'MMM', 'AXP', 'PM', 'ADP', 'MDT', 'NFLX', 'TGT', 'GLD', 'AMD', 'MO', 'GS', 'MDLZ', 'BLK', 'DUK', 'EMR', 'AMAT', 'TJX', 'WM', 'SO', 'AMT', 'MS', 'GILD', 'F', 'USB', 'SYK', 'C', 'SCHW', 'CL', 'FDX', 'KMB', 'BX', 'INTU', 'SPGI', 'GIS', 'GE']
tickers  = ['AAPL', 'ADBE', 'ADI', 'ADP', 'ADPT', 'ADSK', 'AMAT', 'AMBA', 'AMD', 'AMZN', 'ANET', 'ARKK', 'ASML', 'ATER', 'AVAV', 'AVGO', 'AYX', 'BABA', 'BB', 'BIDU', 'BILI', 'BKNG', 'BL', 'BLUE', 'BOX', 'BSX', 'BYND', 'CCJ', 'CDNS', 'CDW', 'CHGG', 'CHKP', 'CHWY', 'CMCSA', 'CORT', 'CRM', 'CRSP', 'CRWD', 'CSCO', 'CSIQ', 'CVNA', 'CYBR', 'DBX', 'DIS', 'DKNG', 'DNN', 'DOCU', 'DT', 'DXCM', 'EA', 'EB', 'EBAY', 'EDIT', 'ENPH', 'ESTC', 'ETSY', 'EXAS', 'EXPE', 'FATE', 'FCEL', 'FI', 'FIS', 'FSLY', 'FTCH', 'FTNT', 'FUBO', 'FUTU', 'FVRR', 'GDS', 'GLOB', 'GME', 'GNRC', 'GOGO', 'GOOGL', 'GPRO', 'GRPN', 'HIMX', 'HPE', 'HUBS', 'IAC', 'ILMN', 'IMAX', 'INTC', 'INTU', 'IONS', 'ISRG', 'JD', 'KLAC', 'KOPN', 'KURA', 'KWEB', 'LC', 'LITE', 'LOGI', 'LRCX', 'LULU', 'LYFT', 'MARA', 'MCHP', 'MDB', 'MELI', 'META', 'MGNI', 'MRNA', 'MRVL', 'MSFT', 'MSTR', 'MU', 'MVIS', 'MVST', 'NFLX', 'NICE', 'NIO', 'NKLA', 'NOW', 'NTAP', 'NTDOY', 'NTES', 'NTLA', 'NTNX', 'NVAX', 'NVDA', 'NVTA', 'NXPI', 'NYT', 'OKTA', 'ON', 'ORCL', 'PACB', 'PANW', 'PARA', 'PAYC', 'PD', 'PDD', 'PENN', 'PINS', 'PLUG', 'PSTG', 'PYPL', 'QCOM', 'RDFN', 'REAL', 'RNG', 'ROKU', 'RVLV', 'SABR', 'SAP', 'SBGI', 'SE', 'SFIX', 'SFTBY', 'SGBI', 'SGML', 'SHOP', 'SMAR', 'SMCI', 'SMH', 'SNAP', 'SNPS', 'SOHU', 'SONO', 'SONY', 'SPCE', 'SPLK', 'SPOT', 'SQ', 'STM', 'T', 'TDC', 'TDOC', 'TEAM', 'TENB', 'TIGR', 'TNDM', 'TSLA', 'TTD', 'TTWO', 'TWLO', 'TXN', 'UBER', 'UPWK', 'VEEV', 'VIPS', 'VZ', 'W', 'WB', 'WBD', 'WDAY', 'WDC', 'WIX', 'XBI', 'YELP', 'YEXT', 'Z', 'ZM', 'ZS']
# Exclude ATVI, NEWR, WWE, TRUE
index_tickers = ["^DJI", "^GSPC", "^IXIC"]
treas_tickers = ["^TNX", "^FVX"]
period = "10y"

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    i = 0
    for ticker in tickers:
        i = i + 1
        print(f'{i:3d}:{ticker}')
        data = yf.download(ticker, period=period)
        data_filtered = create_features(ticker=ticker, data=data)
        # data_filtered.loc[:, 'Name'] = ticker

        all_df = data_filtered

        # # Add indices
        # for idx in index_tickers:
        #     data = yf.download(idx, period=period)
        #     data_filtered = create_features(ticker=idx, data=data)
        #     assert all_df.shape[0] == data_filtered.shape[0]
        #     all_df = pd.concat([all_df, data_filtered], axis=1)

        # # Add treasury yields
        # for treas in treas_tickers:
        #     data = yf.download(treas, period=period)
        #     data_filtered = create_features(ticker=treas, data=data)
        #     # print(all_df.shape[0])
        #     # print(data_filtered.shape[0])

        #     # Get the difference between the indexes
        #     index_difference = all_df.index.difference(data_filtered.index)
        #     # Delete the rows with different indexes from df1
        #     all_df = all_df.drop(index_difference)

        #     # Print the difference
        #     # print(index_difference)
        #     assert all_df.shape[0] == data_filtered.shape[0]
        #     all_df = pd.concat([all_df, data_filtered], axis=1)

        
        # Get the columns with only one unique value
        single_value_cols = all_df.columns[all_df.nunique() == 1]

        # Drop the columns with only one unique value
        all_df = all_df.drop(single_value_cols, axis=1)

        all_df = all_df.dropna(axis=1)
        # print(all_df.isnull().sum())

        y = (all_df[f'{ticker}_Close'].pct_change().shift(-1) > 0).astype(int)
        X = all_df.drop([f'{ticker}_Close'], axis=1)


        # Compute mutual information
        mi_scores = mutual_info_classif(X, y)

        # Sort features by mutual information value
        sorted_features = sorted(zip(X.columns, mi_scores), key=lambda x: x[1], reverse=True)

        # Print the sorted features and their mutual information scores
        # for feature, score in sorted_features:
        #     print(f'{feature}: {score}')

        # Get the columns with mutual information above 0.01, excluding the off-limits column
        selected_columns = X.columns[mi_scores > 0.005]

        # Filter the DataFrame based on selected columns
        df_filtered = X[selected_columns]
        df_filtered = df_filtered.copy()

        # df_filtered.loc[:, 'Name'] = ticker
        df_filtered.loc[:, 'Name'] = ticker
        df_filtered[f'{ticker}_Close'] = all_df[f'{ticker}_Close']
        Path(f'./data/data-2d/{ticker}').mkdir(parents=True, exist_ok=True)
        df_filtered.to_csv(f'./data/data-2d/{ticker.upper()}/{ticker.upper()}.csv', index=True)


  1:AAPL
[*********************100%%**********************]  1 of 1 completed
  2:ADBE
[*********************100%%**********************]  1 of 1 completed
  3:ADI
[*********************100%%**********************]  1 of 1 completed
  4:ADP
[*********************100%%**********************]  1 of 1 completed
  5:ADPT
[*********************100%%**********************]  1 of 1 completed
  6:ADSK
[*********************100%%**********************]  1 of 1 completed
  7:AMAT
[*********************100%%**********************]  1 of 1 completed
  8:AMBA
[*********************100%%**********************]  1 of 1 completed
  9:AMD
[*********************100%%**********************]  1 of 1 completed
 10:AMZN
[*********************100%%**********************]  1 of 1 completed
 11:ANET
[*********************100%%**********************]  1 of 1 completed
 12:ARKK
[*********************100%%**********************]  1 of 1 completed
 13:ASML
[*********************100%%**********************]  1 of 1

In [86]:
# Specify the file path
file_path = './data/top-100-tickers.txt'

# Read the file and modify each line
with open(file_path, 'r') as file:
    lines = file.readlines()
    lines = [f'{line.strip()}' for line in lines]

# Split the modified lines by commas and create a Python list
lines_list = ','.join(lines).split(',')

# Print the resulting list
print(lines_list)

['MSFT', 'AAPL', 'AMZN', 'GOOGL', 'JNJ', 'JPM', 'SPY', 'BRK-B', 'XOM', 'V', 'PG', 'HD', 'NVDA', 'CVX', 'META', 'PFE', 'MRK', 'PEP', 'ABBV', 'UNH', 'DIS', 'CSCO', 'COST', 'MCD', 'TSLA', 'VZ', 'KO', 'WMT', 'BAC', 'ABT', 'MA', 'LLY', 'INTC', 'IVV', 'QQQ', 'BMY', 'IBM', 'AMGN', 'UNP', 'T', 'ADBE', 'NEE', 'QCOM', 'ORCL', 'RTX', 'TMO', 'NKE', 'SBUX', 'HON', 'CVS', 'LOW', 'CMCSA', 'CAT', 'LMT', 'UPS', 'CRM', 'AVGO', 'BA', 'TXN', 'COP', 'DE', 'ACN', 'DHR', 'WFC', 'PYPL', 'MMM', 'AXP', 'PM', 'ADP', 'MDT', 'NFLX', 'TGT', 'GLD', 'AMD', 'MO', 'GS', 'MDLZ', 'BLK', 'DUK', 'EMR', 'AMAT', 'TJX', 'WM', 'SO', 'AMT', 'MS', 'GILD', 'F', 'USB', 'SYK', 'C', 'SCHW', 'CL', 'FDX', 'KMB', 'BX', 'INTU', 'SPGI', 'GIS', 'GE']


In [None]:
data_filtered.count()

In [None]:
data_filtered.info()

In [None]:
# Write the DataFrame to a CSV file
data_filtered.loc[:, 'Name'] = 'NVDA'
data_filtered.to_csv('./Dataset/NVDA.csv', index=True)

In [112]:
import yfinance as yf

# Get the ticker data
ticker = yf.Ticker("AAPL")

# Get the insider trading data
insider_trading = ticker.get_insider_trades()

AttributeError: 'Ticker' object has no attribute 'get_insider_trades'