# <span style="color:maroon">**NRP Stock Simulation Software**</span>

###### <span style="color:green">**Import modules and define utility functions**</span>

In [41]:
import pandas as pd
import numpy as np
import math
import glob
from matplotlib import pyplot as plt
%matplotlib inline

In [42]:
# Utility Functions
def combine_dataframes(pattern, names, index_col, na_values, header=1, parse_dates=True, ignore_index=True):
    """Read every CSV file matching ``pattern`` and stack the rows into one DataFrame.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the CSV files to load.
    names, index_col, na_values, header, parse_dates
        Forwarded unchanged to ``pandas.read_csv`` for every matched file.
    ignore_index : bool, default True
        Forwarded to ``pandas.concat``; pass False to keep the per-file index
        produced by ``index_col``.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated in sorted path order.

    Raises
    ------
    ValueError
        If no file matches ``pattern``.
    """
    # glob.glob returns paths in arbitrary, OS-dependent order; sorting keeps
    # the row order of the combined frame reproducible between runs.
    paths = sorted(glob.glob(pattern))
    if not paths:
        # Without this guard pd.concat fails with the much less helpful
        # "No objects to concatenate".
        raise ValueError("No files match pattern {!r}; nothing to combine".format(pattern))

    frames = [
        pd.read_csv(path, names=names, index_col=index_col, na_values=na_values,
                    header=header, parse_dates=parse_dates)
        for path in paths
    ]

    # stack all per-file frames into a single dataframe
    return pd.concat(frames, ignore_index=ignore_index)

def fillgaps(df, mode="fill_zero"):
    """Fill the NaN gaps inside each column's valid range, leaving the edges alone.

    Every column is trimmed to the span between its first and last non-NaN
    value, the NaNs inside that span are filled, and the trimmed columns are
    re-aligned side by side. Rows outside a column's span stay NaN for that
    column; rows outside every column's span are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    mode : {"fill_zero", "fill_forward"}, default "fill_zero"
        "fill_zero" replaces interior NaNs with 0, "fill_forward" repeats the
        previous valid value.

    Returns
    -------
    pandas.DataFrame
        One column per input column, indexed by the union of the trimmed spans.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values (previously an empty
        frame was returned silently).
    """
    fillers = {
        "fill_zero": lambda col: col.fillna(0),
        "fill_forward": lambda col: col.ffill(),
    }
    if mode not in fillers:
        raise ValueError(
            "Unknown mode {!r}; expected one of {}".format(mode, sorted(fillers))
        )
    fill = fillers[mode]

    # Iterate explicitly instead of appending from inside DataFrame.apply:
    # apply is not meant for side effects (older pandas releases could call the
    # function twice on the first column, which would duplicate that column).
    trimmed = []
    for _, col in df.items():
        first = col.first_valid_index()
        if first is None:
            # Column has no valid value at all, so there is no span to fill.
            # (.loc[None:None] would select the whole column and fill_zero
            # would turn it entirely into zeros.)
            trimmed.append(col.iloc[0:0])
        else:
            trimmed.append(fill(col.loc[first:col.last_valid_index()]))

    if not trimmed:
        # pd.concat rejects an empty list; a frame without columns maps to an empty frame.
        return pd.DataFrame()
    return pd.concat(trimmed, axis=1)

def df_eda(df):
    """Print a quick exploratory summary of ``df`` to stdout.

    Shows, in order: the first and last rows, the shape, the column labels,
    the ``DataFrame.info()`` report, the dtypes and the ``describe()`` statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise; it is not modified.

    Returns
    -------
    None
    """
    print(df.head(), "\n")
    print(df.tail(), "\n")
    print(df.shape, "\n")
    print(df.columns, "\n")
    # info() writes its report itself and returns None, so wrapping it in
    # print() would add a stray "None" line to the output.
    df.info()
    print("\n")
    print(df.dtypes, "\n")
    print(df.describe())

###### <span style="color:green">**Load raw stock data files**</span>

In [3]:
# Read every ./data/stock_data_*.csv file into one long-format frame: stocks.
# ignore_index=False keeps the "date" index built by index_col.
cols = ["ticker", "date", "close", "cap", "volume"]
stocks = combine_dataframes(
    "./data/stock_data_*.csv",
    names=cols,
    index_col="date",
    na_values=" #N/A N/A ",
    ignore_index=False,
)

###### <span style="color:green">**Pivot stock data columns into tidy dataframes**</span>

In [40]:
# "date" is already the index of stocks, so leaving index unset gives one row
# per date and one column per ticker for each value column.
close = pd.pivot(stocks, columns="ticker", values="close")
close = fillgaps(close, "fill_forward")

# pd.pivot_table without index= aggregates (mean) over all dates and yields a
# single row, so cap and volume use pd.pivot exactly like close.
cap = pd.pivot(stocks, columns="ticker", values="cap")
cap = fillgaps(cap, "fill_forward")

# Missing volume inside a ticker's trading span is filled with 0 rather than carried forward.
volume = pd.pivot(stocks, columns="ticker", values="volume")
volume = fillgaps(volume, "fill_zero")

###### <span style="color:violet">**Examining data in the daily stock data dataframe**</span>

In [None]:
# `prices` is never defined in this notebook; the raw daily frame loaded above is `stocks`.
df_eda(stocks)

In [None]:
# Shapes of the three pivoted frames first, then the last rows of each one.
print(*(frame.shape for frame in (close, cap, volume)), "\n")
print(close.tail(), cap.tail(), volume.tail(), sep=" \n\n")

In [None]:
# plot histogram of volume - note the use of logarithmic transforms of both the x and y axis
# (uses `stocks`: `df` is only assigned further down, so it does not exist yet on a fresh run)
stocks["volume"].plot(kind="hist", rot=70, logx=True, logy=True, figsize=(12, 6))

In [None]:
# Boxplots are great when you have a numeric column that you want to compare across different categories.
# (uses `stocks`: `df` is only assigned further down, so it does not exist yet on a fresh run)
stocks.boxplot(column="volume", by="ticker", rot=90, figsize=(12, 6))

In [None]:
# When you want to visualize two numeric columns, scatter plots are ideal.
# Notice the fan-shaped pattern - why is that the case?
# (uses `stocks`: `df` is only assigned further down, so it does not exist yet on a fresh run)
stocks[stocks["ticker"] == "MU"].plot(kind="scatter", x="close", y="cap", rot=90, figsize=(12, 6))

#### <span style="color:orange">**Learning Exercises**</span>

In [49]:
# Load the stock CSV files into a separate frame, df, for the exercises below;
# ignore_index=False keeps "date" as the index.
cols = ["ticker", "date", "close", "cap", "volume"]
df = combine_dataframes(
    "./data/stock_data_*.csv",
    names=cols,
    index_col="date",
    na_values=" #N/A N/A ",
    ignore_index=False,
)

In [28]:
# "date" is the index
# "ticker", "close", "cap", and "volume" are the columns
# head() with no argument previews the first five rows
df.head()

Unnamed: 0_level_0,ticker,close,cap,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1985-01-02,IBM,13.3649,74134.9,3490000.0
1985-01-03,IBM,13.2545,73522.3,5552400.0
1985-01-04,IBM,13.2269,73369.1,4028000.0
1985-01-07,IBM,13.2959,73752.0,4671200.0
1985-01-08,IBM,13.2269,73369.1,4492400.0


In [50]:
# Passing index="date" would fail here, because "date" is already the index of df
# and not one of its columns:
#   ERROR --> df_pivot_1 = df.pivot(index="date", columns="ticker", values="close")
#
# Leaving index unset makes the pivot reuse the existing index for the rows,
# giving one "close" column per ticker.
df_pivot_1 = df.pivot(columns="ticker", values="close")

In [51]:
# first rows of the wide frame: dates down the side, one close-price column per ticker
df_pivot_1.head()

ticker,AAPL,AMAT,AMD,CSCO,IBM,INTC,MSFT,MU,NVDA,ORCL,WDC
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1984-12-31,0.413,0.1617,,,,0.3688,,2.6757,,,3.5353
1985-01-02,0.395,0.1572,14.496,,13.3649,0.3622,,2.6634,,,3.4328
1985-01-03,0.402,0.1572,14.4335,,13.2545,0.3688,,2.6021,,,3.5865
1985-01-04,0.402,0.1557,14.496,,13.2269,0.3754,,2.5039,,,3.689
1985-01-07,0.401,0.1527,14.3086,,13.2959,0.3853,,2.4057,,,3.8939


In [53]:
# With no `values` given, every remaining column (close, cap, volume) is pivoted
# at once, so the result has two-level columns: (value name, ticker).
df_pivot_2 = df.pivot(columns="ticker")

In [57]:
# first rows of the two-level frame; the top column level is the value name, the second the ticker
df_pivot_2.head()

Unnamed: 0_level_0,close,close,close,close,close,close,close,close,close,close,...,volume,volume,volume,volume,volume,volume,volume,volume,volume,volume
ticker,AAPL,AMAT,AMD,CSCO,IBM,INTC,MSFT,MU,NVDA,ORCL,...,AMAT,AMD,CSCO,IBM,INTC,MSFT,MU,NVDA,ORCL,WDC
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1984-12-31,0.413,0.1617,,,,0.3688,,2.6757,,,...,2291200.0,,,,27748800.0,,807000.0,,,130600.0
1985-01-02,0.395,0.1572,14.496,,13.3649,0.3622,,2.6634,,,...,7052800.0,435800.0,,3490000.0,27259200.0,,1174000.0,,,12800.0
1985-01-03,0.402,0.1572,14.4335,,13.2545,0.3688,,2.6021,,,...,512000.0,821200.0,,5552400.0,31075200.0,,632000.0,,,99600.0
1985-01-04,0.402,0.1557,14.496,,13.2269,0.3754,,2.5039,,,...,422400.0,710000.0,,4028000.0,11688000.0,,632000.0,,,256600.0
1985-01-07,0.401,0.1527,14.3086,,13.2959,0.3853,,2.4057,,,...,4403200.0,579800.0,,4671200.0,12465600.0,,2348000.0,,,186600.0


In [47]:
# NOTE(review): the saved output under this cell is a Series with a (ticker, date)
# index, not the frame previewed earlier - it looks like it came from a different
# kernel state; re-run the notebook top to bottom to refresh it.
df.head()

        date      
ticker  1985-01-02    IBM
        1985-01-03    IBM
        1985-01-04    IBM
        1985-01-07    IBM
        1985-01-08    IBM
dtype: object