In [100]:
import pandas as pd
from gensim.models import Word2Vec
import numpy

In [101]:
STOCKS_DIR = '../Data Files/Stocks'
STOCKS_EXT = '.txt'
COMPANIES_FP = '../Data Files/companies.csv'
FUNDAMENTALS_FP = '../Data Files/fundamentals_dataset.csv'

In [102]:
def shared_tickers(df1,df2):
    df1_tickers = pd.unique(df1['ticker'])
    df2_tickers = pd.unique(df2['ticker'])
    return df1_tickers[[ticker in df2_tickers for ticker in df1_tickers]]

def reduce(df, tickers):
    return df[[ticker in tickers for ticker in df['ticker']]]

In [103]:
# Read Data Files into Pandas DataFrames
comp_df = pd.read_csv(COMPANIES_FP)
fund_df = pd.read_csv(FUNDAMENTALS_FP)
#stock_ticker_list

In [104]:
# Remove unneeded companies data
comp_df = comp_df[['ticker','industry','sector','tag 1','tag 2','tag 3']]

In [105]:
# Remove bad companies data
comp_df_clean_targets = comp_df[['ticker','industry','sector','tag 1']]
comp_df = comp_df[comp_df_clean_targets.notna().all(axis=1)]
comp_df[:10]

Unnamed: 0,ticker,industry,sector,tag 1,tag 2,tag 3
0,A,Medical Diagnostics & Research,Healthcare,Healthcare,Diagnostics & Research,Medical Diagnostics & Research
1,AA,Metals & Mining,Basic Materials,Basic Materials,Aluminum,Metals & Mining
2,AABA,Asset Management,Financial Services,Financial Services,Asset Management,
3,AAC,Health Care Providers,Healthcare,Healthcare,Medical Care,Health Care Providers
5,AAL,Airlines,Industrials,Industrials,Airlines,
6,AAMC,Asset Management,Financial Services,Financial Services,Asset Management,
7,AAME,Insurance - Life,Financial Services,Financial Services,Insurance - Life,
8,AAN,Consulting & Outsourcing,Industrials,Industrials,Rental & Leasing Services,Consulting & Outsourcing
9,AAOI,Semiconductors,Technology,Technology,Semiconductors,
10,AAON,Building Materials,Basic Materials,Basic Materials,Building Materials,


In [115]:
# Generate Companies DataFrame Format 1
# ticker, industry, sector, tag 1, tag 2, tag 3
comp_df_out = comp_df

In [108]:
# Generate Companies DataFrame Format 2
# ticker, data_name, data_val
comp_df_out = pd.DataFrame(columns=['ticker','data_name','data_val'])
for row in comp_df.iterrows():
    new_rows_df = pd.DataFrame([
        [row[1]['ticker'],'industry',row[1]['industry']],
        [row[1]['ticker'],'sector',row[1]['sector']],
        [row[1]['ticker'],'tag 1',row[1]['tag 1']],
        [row[1]['ticker'],'tag 2',row[1]['tag 2']],
        [row[1]['ticker'],'tag 3',row[1]['tag 3']]
    ],columns=['ticker','data_name','data_val'])
    comp_df_out = comp_df_out.append(new_rows_df)

In [116]:
comp_df_out[:5]

Unnamed: 0,ticker,industry,sector,tag 1,tag 2,tag 3
0,A,Medical Diagnostics & Research,Healthcare,Healthcare,Diagnostics & Research,Medical Diagnostics & Research
1,AA,Metals & Mining,Basic Materials,Basic Materials,Aluminum,Metals & Mining
2,AABA,Asset Management,Financial Services,Financial Services,Asset Management,
3,AAC,Health Care Providers,Healthcare,Healthcare,Medical Care,Health Care Providers
5,AAL,Airlines,Industrials,Industrials,Airlines,


In [95]:
# Remove unneeded Fundamentals data
fund_df = fund_df[['period','tickers','indicator','amount']]

In [96]:
# Remove bad data
# Many have 0 for the amount when it doesn't make sense (e.g. $0 final revenue)
# Some 0 amounts may be legitimate, too many bad to sort through
# Tossing all 0 amounts
#fund_df[fund_df['amount'] == '0'][:20]
fund_df = fund_df[fund_df['amount'] != '0']

In [97]:
# Create expanded fundamentals.

# Some records have multiple tickers. Will split tickers column on ','
fund_df_out = pd.DataFrame(columns=['period','ticker','data_name','data_val'])
for row in fund_df.iterrows():
    row[1]['amount'] = "${}".format(row[1]['amount'])
for row in fund_df[[',' in x for x in fund_df['tickers']]].iterrows():
    for ticker in row[1]['tickers'].split(', '):
        new_row_df = pd.DataFrame([
            [row[1]['period'],ticker,row[1]['indicator'],row[1]['amount']]
        ], columns=['period','ticker','data_name','data_val'])
        fund_df_out = fund_df_out.append(new_row_df)
fund_df_out = fund_df_out.append(fund_df[[',' not in x for x in fund_df['tickers']]].rename(columns={'tickers':'ticker','indicator':'data_name','amount':'data_val'}))

In [98]:
print(fund_df_out[:20])
print(fund_df_out.dtypes)

    period ticker                                          data_name  \
0  2014 Q1   DMJT                                  Net Income (Loss)   
0  2014 Q1   MARK                                  Net Income (Loss)   
0  2014 Q1   DMJT                                       Total Equity   
0  2014 Q1   MARK                                       Total Equity   
0  2014 Q2   DMJT                                  Net Income (Loss)   
0  2014 Q2   MARK                                  Net Income (Loss)   
0  2014 Q2   DMJT                            Operating Income (Loss)   
0  2014 Q2   MARK                            Operating Income (Loss)   
0  2014 Q3   DMJT                                             Assets   
0  2014 Q3   MARK                                             Assets   
0  2014 Q3   DMJT       Cash and Cash Equivalents, at Carrying Value   
0  2014 Q3   MARK       Cash and Cash Equivalents, at Carrying Value   
0  2014 Q3   DMJT  Cash and Cash Equivalents, Period Increase (D

In [117]:
shared = shared_tickers(comp_df_out,fund_df_out)

In [118]:
comp_df_out = reduce(comp_df_out, shared)
fund_df_out = reduce(fund_df_out, shared)

In [119]:
# Combine Companies and Fundamentals dataframes
combined_df_out = comp_df_out.append(fund_df_out)

In [120]:
combined_df_out.to_csv("../Data Files/Preformatted/preformatted.csv",index=False)

In [121]:
comp_df_out.to_csv("../Data Files/Preformatted/preformatted_companies.csv",index=False)
fund_df_out.to_csv("../Data Files/Preformatted/preformatted_fundamentals.csv",index=False)

In [None]:
# Load stock data

# Create multiple data formats
    # ticker, date, delta_open, delta_close, delta_low, delta_high
    # date, ticker, delta_open, delta_close, delta_low, delta_high
    # date, delta_open, delta_close, ticker, delta_low, delta_high
    # date, delta_open, delta_close, delta_low, delta_high, ticker
    # ticker, date, data_name, data_val
    # date, ticker, data_name, data_val

In [None]:
# Filter dataframes to contain only same tickers