This program imports ETF information from the various CSV files downloaded from ETFdb.com. The database table is created manually in MySQL because Pandas seems to have trouble keeping field data types consistent. Not all columns from the CSV files are imported, only the ones relevant to selecting ETFs for backtesting.

In [1]:
import os

import datetime
import pandas as pd
import numpy as np

from tqdm import tqdm

import pymysql
import sqlalchemy as db
from sqlalchemy import create_engine

# Connection URL for the local MySQL server (PyMySQL driver, "trading" schema).
# NOTE(review): the user name and password are hard-coded in this URL.
DB_URL = "mysql+pymysql://root:root@127.0.0.1:8889/trading?unix_socket=/Applications/MAMP/tmp/mysql/mysql.sock"

# Engine shared by every cell that talks to the database.
engine = create_engine(DB_URL)

In [2]:
"""
    Import Info for Tickers
    
    'Symbol': db.types.VARCHAR(length=12),
    'ETFName': db.types.VARCHAR(length=100),
    'AssetClass': db.types.VARCHAR(length=50),
    'TotalAssets': db.types.BIGINT,
    'Inverse': db.types.VARCHAR(12),
    'Leveraged': db.types.VARCHAR(12),
    'Category': db.types.VARCHAR(length=50),
    'Inception': db.DateTime(),
    'ER': db.types.NUMERIC(6,2),
    'AnnualDividendRate': db.types.NUMERIC(8,2),
    'DividendDate': db.DateTime(),
    'Dividend': db.types.NUMERIC(8,2),
    'AnnualDividendYield': db.types.NUMERIC(8,2),
    'PERatio': db.types.NUMERIC(12,2),
    'Beta': db.types.INTEGER,
    'NumofHoldings': db.types.INTEGER,
    'InTop10': db.types.NUMERIC(6,2),
    'STCapGainRate': db.types.NUMERIC(6,2),
    'LTCapGainRate': db.types.NUMERIC(6,2),
    'LiquidityRating': db.types.VARCHAR(12),
    'ExpensesRating': db.types.VARCHAR(12),
    'ReturnsRating': db.types.VARCHAR(12),
    'VolatilityRating': db.types.VARCHAR(12),
    'DividendRating': db.types.VARCHAR(12),
    'ConcentrationRating': db.types.VARCHAR(12)
"""
def _to_sql_number(value):
    """Convert a numeric cell to a ``Decimal``, or ``None`` for the 'null' placeholder."""
    from decimal import Decimal

    # The caller fills missing cells with the string 'null'; for numeric
    # columns that placeholder must reach the database as SQL NULL.
    if isinstance(value, str) and value == 'null':
        return None
    # Go through str() so both cleaned-up strings and numpy scalars become a
    # plain Decimal that the DB driver knows how to encode.
    return Decimal(str(value).strip())


def import_info(Symbol, row, db_table):
    """Insert one ETF record into ``db_table``, updating it if it already exists.

    Builds a single ``INSERT ... ON DUPLICATE KEY UPDATE`` statement covering
    the 25 ETF columns and runs it through the module-level ``engine``.
    Values are sent as bound parameters instead of being formatted into the
    SQL text, so quotes or other special characters in the data (for example
    an ETF name containing ``"``) can no longer break or alter the statement.

    Args:
        Symbol: Ticker symbol, stored in the ``Symbol`` column.
        row: Object exposing one attribute per remaining column (e.g. a row
            yielded by ``DataFrame.iterrows()``).
        db_table: Name of the target table. It is formatted directly into the
            SQL text because identifiers cannot be bound, so it must come
            from trusted code, never from external input.

    Returns:
        None.

    Raises:
        decimal.InvalidOperation: If a numeric column holds a value that is
            neither a number nor the 'null' placeholder.
    """
    # Column order matches the table definition used by the INSERT.
    columns = [
        'Symbol', 'ETFName', 'AssetClass', 'TotalAssets', 'Inverse',
        'Leveraged', 'Category', 'Inception', 'ER', 'AnnualDividendRate',
        'DividendDate', 'Dividend', 'AnnualDividendYield', 'PERatio', 'Beta',
        'NumofHoldings', 'InTop10', 'STCapGainRate', 'LTCapGainRate',
        'LiquidityRating', 'ExpensesRating', 'ReturnsRating',
        'VolatilityRating', 'DividendRating', 'ConcentrationRating',
    ]
    # Columns that were previously written without quotes: a 'null'
    # placeholder in these becomes SQL NULL.
    numeric_columns = {
        'TotalAssets', 'ER', 'AnnualDividendRate', 'Dividend',
        'AnnualDividendYield', 'PERatio', 'Beta', 'NumofHoldings', 'InTop10',
        'STCapGainRate', 'LTCapGainRate',
    }

    params = {'Symbol': str(Symbol)}
    for column in columns[1:]:
        value = getattr(row, column)
        if column in numeric_columns:
            params[column] = _to_sql_number(value)
        else:
            # Text and date columns keep the previous behavior: the value is
            # stringified as-is (a 'null' placeholder stays the text 'null').
            params[column] = str(value)

    column_list = ', '.join(columns)
    placeholders = ', '.join(':' + column for column in columns)
    # On a duplicate key, refresh every column except the key itself.
    updates = ', '.join(
        '{0} = VALUES({0})'.format(column) for column in columns[1:])

    query = db.text(
        'INSERT INTO {} ({}) VALUES ({}) ON DUPLICATE KEY UPDATE {};'.format(
            db_table, column_list, placeholders, updates))

    # engine.begin() commits on success and rolls back on error.
    with engine.begin() as connection:
        connection.execute(query, params)

In [3]:
# Load one ETFdb.com CSV export, normalize its column names and values, and
# upsert every row into the 'etfdb_info' table.
file_name = 'etfs_details_type_fund_flow-10'
data_path = '../data/processed/'

# Only these CSV columns are read; names are exactly as they appear in the
# file header (note the trailing space in 'Total Assets ').
columns_retained = ['Symbol', 'ETF Name', 'Asset Class', 'Total Assets ', 'Inverse', 
                    'Leveraged', 'ETFdb.com Category', 'Inception', 'ER', 
                    'Annual Dividend Rate', 'Dividend Date', 'Dividend', 
                    'Annual Dividend Yield %', 'P/E Ratio', 'Beta', '# of Holdings', 
                    '% In Top 10', 'ST Cap Gain Rate', 'LT Cap Gain Rate', 
                    'Liquidity Rating', 'Expenses Rating', 'Returns Rating', 
                    'Volatility Rating', 'Dividend Rating','Concentration Rating'
                   ]

# Import CSV file
df = pd.read_csv(os.path.join(data_path, file_name + '.csv'),
                 index_col='Symbol', usecols=columns_retained)

# Turn the CSV headers into the DB column names. Every replacement is a
# literal one (regex=False), so '%', '$' and '.' are never interpreted as
# regex syntax and the result does not depend on the pandas default.
header_replacements = [(' ', ''), ('%', ''), ('/', ''), ('#', 'Num'),
                       ('ETFdb.com', '')]
for old, new in header_replacements:
    df.columns = df.columns.str.replace(old, new, regex=False)

# Remove $ sign and comma
amntval_mod = ['TotalAssets', 'AnnualDividendRate', 'Dividend']
for value in amntval_mod:
    df[value] = (df[value].str.replace(',', '', regex=False)
                          .str.replace('$', '', regex=False))

# Remove % sign
pctval_mod = ['ER', 'InTop10', 'AnnualDividendYield', 'STCapGainRate', 'LTCapGainRate']
for value in pctval_mod:
    df[value] = df[value].str.replace('%', '', regex=False)

# Empty date fields get the sentinel date 1900-01-01 (not NULL)
date_mod = ['Inception', 'DividendDate']
for value in date_mod:
    df[value] = df[value].replace({np.nan: '1900-01-01'})

# Fill every remaining NaN with the placeholder string 'null'
df.fillna(value='null', inplace=True)

# insert in DB
try:
    # total lets tqdm show a real progress bar instead of a bare counter
    for Symbol, row in tqdm(df.iterrows(), total=len(df), desc='Processing ...'):
        import_info(Symbol, row, 'etfdb_info')
finally:
    # Disconnect from DB, even when an insert fails part-way through
    engine.dispose()

Processing ...: 43it [00:00, 535.85it/s]


In [4]:
# Ask the engine's dialect whether the 'etfdb_info' table exists; the result
# is displayed as the output of this cell.
engine.dialect.has_table(engine, 'etfdb_info')

True