# Lesson 3: Working with multiple stocks

## Create an empty data frame

In [1]:
import pandas as pd

def test_run():
    """Print a daily date range, its first entry, and an empty frame indexed by it."""
    first_day, last_day = '2010-01-22', '2010-01-26'
    dates = pd.date_range(start=first_day, end=last_day)

    # Show the whole index first, then just its first timestamp.
    print(dates)
    print(dates[0])

    # A frame built from an index alone has rows but no columns yet.
    empty_frame = pd.DataFrame(index=dates)
    print(empty_frame)


if __name__ == '__main__':
    test_run()

DatetimeIndex(['2010-01-22', '2010-01-23', '2010-01-24', '2010-01-25',
               '2010-01-26'],
              dtype='datetime64[ns]', freq='D')
2010-01-22 00:00:00
Empty DataFrame
Columns: []
Index: [2010-01-22 00:00:00, 2010-01-23 00:00:00, 2010-01-24 00:00:00, 2010-01-25 00:00:00, 2010-01-26 00:00:00]


## Join SPY data

In [2]:
import pandas as pd

def test_run():
    #Define data range
    start_date = '2010-01-22'
    end_date = '2010-01-26'
    dates = pd.date_range(start_date, end_date)
    
    #Create an empty dataframe
    df1 = pd.DataFrame(index=dates)
    
    #Read SPY data into temporary dataframe
    # this uses integer as index, but df1 uses date
    # dfSPY = pd.read_csv("data/SPY.csv") 
    # so we should specify the index_col
    dfSPY = pd.read_csv("data/SPY.csv", index_col="Date", parse_dates=True)
    
    #Join the two dataframes using DataFrame.join()
    df1 = df1.join(dfSPY)
    print(df1)
    
if __name__ == '__main__':
    test_run()

                  Open        High         Low       Close  Adj Close  \
2010-01-22  111.199997  111.739998  109.089996  109.209999  86.620804   
2010-01-23         NaN         NaN         NaN         NaN        NaN   
2010-01-24         NaN         NaN         NaN         NaN        NaN   
2010-01-25  110.209999  110.410004  109.410004  109.769997  87.065025   
2010-01-26  109.339996  110.470001  109.040001  109.309998  86.700111   

                 Volume  
2010-01-22  345942400.0  
2010-01-23          NaN  
2010-01-24          NaN  
2010-01-25  186937500.0  
2010-01-26  211168800.0  


To remove the NaN rows, call `dropna()` on the joined dataframe. Here we also load only the `Adj Close` column:

In [3]:
import pandas as pd

def test_run():
    #Define data range
    start_date = '2010-01-22'
    end_date = '2010-01-26'
    dates = pd.date_range(start_date, end_date)
    
    #Create an empty dataframe
    df1 = pd.DataFrame(index=dates)
    
    #Read SPY data into temporary dataframe
    # this uses integer as index, but df1 uses date
    # dfSPY = pd.read_csv("data/SPY.csv") 
    # so we should specify the index_col
    dfSPY = pd.read_csv("data/SPY.csv", index_col="Date", 
                        parse_dates=True, usecols=['Date', 'Adj Close'],
                        na_values=['nan'])
    
    #Join the two dataframes using DataFrame.join()
    df1 = df1.join(dfSPY)
    
    #Drop NaN values
    df1 = df1.dropna()
    print(df1)
    
if __name__ == '__main__':
    test_run()

            Adj Close
2010-01-22  86.620804
2010-01-25  87.065025
2010-01-26  86.700111


## Types of "join"

Alternatively, pass `how='inner'` to `DataFrame.join`. An inner join keeps only the dates present in both dataframes, so here it gives the same result without a separate `dropna()` call.

In [4]:
import pandas as pd

def test_run():
    #Define data range
    start_date = '2010-01-22'
    end_date = '2010-01-26'
    dates = pd.date_range(start_date, end_date)
    
    #Create an empty dataframe
    df1 = pd.DataFrame(index=dates)
    
    #Read SPY data into temporary dataframe
    # this uses integer as index, but df1 uses date
    # dfSPY = pd.read_csv("data/SPY.csv") 
    # so we should specify the index_col
    dfSPY = pd.read_csv("data/SPY.csv", index_col="Date", 
                        parse_dates=True, usecols=['Date', 'Adj Close'],
                        na_values=['nan'])
    
    #Join the two dataframes using DataFrame.join()
    df1 = df1.join(dfSPY, how='inner')
    
    # #Drop NaN values
    # df1 = df1.dropna()
    print(df1)
    
if __name__ == '__main__':
    test_run()

            Adj Close
2010-01-22  86.620804
2010-01-25  87.065025
2010-01-26  86.700111


## Read in more stocks

In [6]:
import pandas as pd

def test_run():
    #Define data range
    start_date = '2010-01-22'
    end_date = '2010-01-26'
    dates = pd.date_range(start_date, end_date)
    
    #Create an empty dataframe
    df1 = pd.DataFrame(index=dates)
    
    #Read SPY data into temporary dataframe
    # this uses integer as index, but df1 uses date
    # dfSPY = pd.read_csv("data/SPY.csv") 
    # so we should specify the index_col
    dfSPY = pd.read_csv("data/SPY.csv", index_col="Date", 
                        parse_dates=True, usecols=['Date', 'Adj Close'],
                        na_values=['nan'])
    
    #Rename 'Adj Close' column to 'SPY' to prevent clash
    dfSPY = dfSPY.rename(columns={'Adj Close': 'SPY'})
    
    #Join the two dataframes using DataFrame.join()
    df1 = df1.join(dfSPY, how='inner')
    
    #Read in more stocks
    symbols = ['GOOG', 'IBM', 'GLD']
    for symbol in symbols:
        df_temp = pd.read_csv("data/{}.csv".format(symbol), index_col='Date',
                             parse_dates=True, usecols=['Date', 'Adj Close'],
                             na_values=['nan'])
        # rename to prevent clash
        df_temp = df_temp.rename(columns={'Adj Close': symbol})
        
        df1 = df1.join(df_temp)
        
    print(df1)
    
if __name__ == '__main__':
    test_run()

                  SPY        GOOG        IBM         GLD
2010-01-22  86.620804  273.978058  80.869118  107.169998
2010-01-25  87.065025  268.991760  81.268616  107.480003
2010-01-26  86.700111  270.197235  81.030205  107.559998
