# Exercises: Chapter 4

In [None]:
import pandas as pd
import numpy as np

#### 1. With the `earthquakes.csv` file, select all the earthquakes in Japan with a magnitude of 4.9 or greater using the mb magnitude type.

In [None]:
!ls exercises/

In [None]:
!head -5 exercises/earthquakes.csv

In [None]:
quakes = pd.read_csv('exercises/earthquakes.csv')

In [None]:
quakes.head()

In [None]:
quakes.dtypes

In [None]:
quakes.shape

In [None]:
quakes.loc[lambda x: (x.mag >= 4.9) & (x.magType == 'mb') & (x.parsed_place == 'Japan') ]

In [None]:
quakes[quakes.mag.ge(4.9) & (quakes.magType.eq('mb')) & (quakes.parsed_place.eq('Japan'))]

In [None]:
quakes[quakes.mag.ge(4.9) & quakes.magType.eq('mb')]

In [None]:
quakes.query('(mag >= 4.9) and (magType == "mb") and (parsed_place == "Japan")')

#### 2. Create bins for each full number of earthquake magnitude (for instance, the first bin is (0, 1], the second is (1, 2], and so on) with the ml magnitude type and count how many are in each bin.

In [None]:
##??pd.cut

In [None]:
# Magnitudes of the earthquakes measured with the "ml" magnitude type,
# kept as a one-column DataFrame (rather than a Series) so the dict-style
# `agg` below can address the column by name.
ml_mags = (
    quakes
    .query('magType == "ml"')['mag']
    .to_frame()
)

# Find the min and max magnitude to decide which bin edges are needed;
# the transpose puts min/max side by side on a single 'mag' row.
ml_mags.agg({'mag':['min', 'max']}).T

In [None]:
# Bin the "ml" magnitudes into unit-wide intervals. `squeeze()` turns the
# one-column frame back into a Series; the integer edges -2..6 are
# hard-coded, and `pd.cut` closes each interval on the right by default,
# giving (-2, -1], (-1, 0], ..., (5, 6].
ml_mag_bins = pd.cut(ml_mags.squeeze(), bins = range(-2,7))
ml_mag_bins

In [None]:
ml_mag_bins.value_counts()

In [None]:
ml_mag_bins.value_counts(normalize=True)

#### 3. Using the faang.csv file, group by the ticker and resample to monthly frequency. Make the following aggregations:

a) Mean of opening price

b) Maximum of the high price

c) Minimum of the low price

d) Mean of the closing price

e) Sum of the volume traded

In [None]:
!head -5 exercises/faang.csv

In [None]:
faang = pd.read_csv('exercises/faang.csv')
faang.head()

In [None]:
faang.shape

In [None]:
faang.info() # to group by month, want to make date the index as a datetime

In [None]:
faang = pd.read_csv('exercises/faang.csv', index_col='date', parse_dates=True)
faang.head()

In [None]:
faang.info()

In [None]:
# Monthly summary per ticker: group by ticker, resample each group's
# DatetimeIndex to monthly frequency, then aggregate every column with the
# statistic the exercise asks for.
# NOTE(review): pandas 2.2+ deprecates the 'M' alias in favour of 'ME'
# (month end) — confirm against the installed pandas version before changing.
( faang
 .groupby('ticker')
 .resample('M')
 .agg({
     'open':'mean',    # a) mean of the opening price
     'high':'max',     # b) maximum of the high price
     'low':'min',      # c) minimum of the low price
     'close':'mean',   # d) mean of the closing price
     'volume':'sum'})  # e) total volume traded
)

#### 4. Build a crosstab with the earthquake data between the tsunami column and the magType column. Rather than showing the frequency count, show the maximum magnitude that was observed for each combination. Put the magnitude type along the columns.


In [None]:
quakes.head()

In [None]:
# First attempt: magnitude type on the rows, tsunami flag on the columns,
# each cell holding the maximum magnitude observed for that combination.
# The aggregation is named by string ('max') instead of passing the Python
# builtin `max`, which recent pandas versions warn about.
pd.crosstab(
    index=quakes.magType,
    columns=quakes.tsunami,
    values=quakes.mag,
    aggfunc='max',
)

In [None]:
# Orientation the exercise asks for: tsunami flag on the rows, magnitude type
# along the columns, each cell holding the maximum observed magnitude.
# The aggregation is named by string ('max') instead of passing the Python
# builtin `max`, which recent pandas versions warn about.
pd.crosstab(
    index=quakes.tsunami,
    columns=quakes.magType,
    values=quakes.mag,
    aggfunc='max',
)

In [None]:
quakes.query('magType=="mwb" and tsunami==0')['mag'].max()

#### 5. Calculate the rolling 60-day aggregations of the OHLC data by ticker for the FAANG data. Use the same aggregations as exercise 3.

In [None]:
# Rolling 60-day aggregations per ticker. The window is given as the offset
# string '60D' (a span of calendar time on the datetime index) rather than
# the integer 60, which would mean "the last 60 rows" and therefore cover
# more than 60 days of trading data.
( faang
 .groupby('ticker')
 .rolling('60D')
 .agg({
     'open':'mean',    # mean of the opening price
     'high':'max',     # maximum of the high price
     'low':'min',      # minimum of the low price
     'close':'mean',   # mean of the closing price
     'volume':'sum'})  # total volume traded
)

#### <div class="alert alert-info"><span style="color:green">    ***INTERESTING, CONSEQUENTIAL ERROR ON MY PART. I ORIGINALLY SPECIFIED `rolling(60)`, WHICH WILL DO THE LAST 60 OBSERVATIONS. WHICH IS A SPAN OF MORE THAN 60 DAYS WITH TRADING DATA. I SHOULD HAVE SPECIFIED `rolling('60D')`.*** </span></div>

 #### 6. Create a pivot table of the FAANG data that compares the stocks. Put the ticker in the rows and show the averages of the OHLC and volume traded data.

In [None]:
faang.head()

In [None]:
pd.pivot_table(faang, index='ticker', aggfunc='mean', values=['open', 'high', 'low', 'close', 'volume'])

#### 7. Calculate the Z-scores for each numeric column of Amazon's data (ticker is AMZN) in Q4 2018 using `apply()`.

In [None]:
# Z-scores of Amazon's Q4 2018 data: restrict to the AMZN rows, drop the
# non-numeric `ticker` column, slice the quarter out of the datetime index,
# then standardise each remaining column with its own mean and standard
# deviation via `apply()`.
(
    faang
    .loc[lambda df: df['ticker'] == 'AMZN']
    .drop(columns='ticker')
    .loc['2018-Q4']
    .apply(lambda col: (col - col.mean()) / col.std())
)
    

#### 8. Add event descriptions:   
a) Create a dataframe with the following three columns: ticker, date, and event. The columns should have the following values:    
> i) ticker: 'FB'    
ii) date: ['2018-07-25', '2018-03-19', '2018-03-20']    
iii) event: ['Disappointing user growth announced after close.', 'Cambridge Analytica story', 'FTC investigation']

In [None]:
faang.head(3)

In [None]:
event_df = pd.DataFrame({
    'ticker': 'FB',
    'date':  ['2018-07-25', '2018-03-19', '2018-03-20'],
    'event': ['Disappointing user growth announced after close.', 'Cambridge Analytica story', 'FTC investigation']
    })

event_df.head()

In [None]:
event_df.info() # first time through, I hadn't realized `date` wasn't a datetime in this dataframe

In [None]:
# make date datetime
event_df['date'] = pd.to_datetime(event_df['date'])
event_df.info()

In [None]:
# put date and ticker into the index
event_df = event_df.set_index(['date', 'ticker'])

In [None]:
event_df

- event_df has a multi index
- faang has date only in the index
- Here's how to move `ticker` into faang's index without replacing `date`, but rather augmenting it

In [None]:
faang.info() # faang's index IS datetime

In [None]:
# date was already in the index, reset_index kicks the existing index into the values
# to ADD ticker to the index, need to specify append=True

faang.set_index('ticker', append=True).sample(n=5)

In [None]:
faang_w_events= (
    faang
    .set_index('ticker', append=True) # add ticker to date in the index
    .merge(
        event_df, #needs to have date and ticker in the index...see above
        how='outer',
        left_index=True, # to merge on an index as opposed to a dataframe column
        right_index=True # ditto
    )
)

faang_w_events.head()

In [None]:
event_df.index

In [None]:
# Sanity check: pull every row of the merged frame whose date matches one of
# the event dates, to confirm the events landed in the merged dataframe.
faang_w_events.loc[list(event_df.index.get_level_values('date')), :]

#### <span style="color:green"> 9. Use the `transform()` method on the FAANG data to represent all the values in terms of the first date in the data. To do so, divide all the values for each ticker by the values for the first date in the data for that ticker. This is referred to as an index, and the data for the first date is the base (https://ec.europa.eu/eurostat/statistics-explained/index.php/Beginners:Statistical_concept_-_Index_and_base_year). When data is in this format, we can easily see growth over time. Hint: transform() can take a function name.</span>

#### <div class="alert alert-info"><span style="color:green">    ***Very, very useful exercise. Worth reviewing in the future and documenting in Notion.*** </span></div>

In [None]:
faang.groupby('ticker').head(2)

In [None]:
faang.shape

##### <span style="color:green" >To get the denominator for this calculation, we need to:</span>
##### <span style="color:green" >- Group by ticker</span>      
##### <span style="color:green" >- Find the first valid value for each ticker for each series</span>

##### <span style="color:green" >The first part isn't terribly difficult. Group by `ticker` and then get the `first` value for each series and use `transform()` to have it applied to each date for each ticker </span>

In [None]:
(
    faang
    .groupby('ticker')
    .transform('first')
    .loc[lambda x: x.index <= '2018-01-04']
    
)

In [None]:
(
    faang
    .set_index('ticker', append=True)
    .groupby('ticker')
    .transform('first')
)

In [None]:
# (
#     faang
#     .reset_index().set_index(['ticker', 'date'])
#     .groupby('ticker')
#     .transform('first')
# )

##### <span style="color:green" > When I saw this I had two thoughts:  </span>
##### <span style="color:green" >- This problem might not be that hard...divide all the values in `faang` by the values above...and we're done!</span>      
##### <span style="color:green" >- But....the `groupby` I did above has lost the tickers. Even if the calculations were correct, losing the ticker information makes the data close to useless in a practical sense and it also makes the accuracy difficult to confirm. </span>

##### <span style="color:green" > Still...I went ahead and performed the calculation to make sure the general method would work. I had to drop the `ticker` column to make the columns lined up, but this *looks* like the calculation you'd want.   </span>

In [None]:
faang.drop(columns='ticker').div(faang.groupby('ticker').transform('first'))

##### <span style="color:green" > To make this work, before doing any of the grouping and transforming...I needed to move the ticker into index along w/the date...</span>     
##### <span style="color:green" > ...`set_index` to `ticker` would drop `date` and replace it with `ticker`. So I had to find the `append=True` option for `set_index` to put both `date` and `ticker` in the index.</span>     

In [None]:
faang.set_index('ticker', append=True).sample(10)

#### <span style="color:green" > Repeating what I did above, but with `date` and `ticker` in the index was straightforward.</span>    

In [None]:
(
    faang
    .set_index('ticker', append=True)
    .apply(
        lambda x: x.groupby(['ticker']).transform('first')
    )
)
        
        

##### <span style="color:green" > Finally, I put it all together:</span>    
##### <span style="color:green" > - The numerator is just `faang` with both `date` and `ticker` in index.</span>    
##### <span style="color:green" > - The denominators are the first values for each ticker/series that we created above, again with both `date` and `ticker` in the index.</span>    

In [None]:
# Express every value relative to the first value of its ticker (the base).
# Put both `date` and `ticker` into the index once, then divide the frame by
# each ticker's first value broadcast back onto all of that ticker's rows.
# Row order is left untouched; `first` takes the first value in row order
# within each ticker group.
faang_indexed = (
    faang
    .set_index('ticker', append=True)
    .pipe(lambda df: df.div(df.groupby('ticker').transform('first')))
)

faang_indexed.head()


##### <span style="color:green" > (After reviewing things, a much more intuitive solution is below:)</span>  

In [None]:
(faang
 .set_index('ticker', append=True) # creates the same multi-index w/date and ticker in the index as the denominator
 .div(
     faang
    .set_index('ticker', append=True)
    .groupby('ticker')
    .transform('first')
 )
).query('date <= "2018-01-04"')


#### <span style="color:green" > So I did it! </span>  
#### <span style="color:green" > But I also needed to confirm it worked.  </span>    
#### <span style="color:green" > One thing that needed to be true if this solution was correct was that the first date for each ticker for each series would be equal to 1.0000. </span> 
#### <span style="color:green" > Doing this with regular indexing wasn't very easy. What I started with is further down.  </span>  
#### <span style="color:green" > However, doing this with `query` turns out to be blessedly easy and straightforward. See the code and output below. </span>  
#### <span style="color:green" > ALSO NOTE THAT `first` TAKES THE FIRST NON-NULL VALUE IN ROW ORDER WITHIN EACH GROUP, SO THIS ONLY WORKS IF EACH TICKER'S ROWS ARE SORTED BY `date`.  </span>  

#### <span style="color:green" > `query()` makes it very easy to select rows, in the index or from the columns.  </span>  
#### <span style="color:green" > (Apparently, selecting columns with `query()` is strongly discouraged.)  </span>  

### <span style="color:green" > **Pick out single values of `date` and `ticker`.** </span>  

In [None]:
faang_indexed.loc["12-17-18", "AMZN"]

#### <span style="color:green" > This works as well. The whole query clause has to be placed in quotes and each conditional needs to be placed in parentheses () in order to avoid confusion on the part of python regarding operator precedence.  </span>  

In [None]:
faang_indexed.query('(date == "12-17-18") & (ticker == "AMZN")')

### <span style="color:green" > **Pick out multiple values of `date` and `ticker`.** </span>  

##### <span style="color:green" > Key details: </span>  
##### <span style="color:green" > - The index choices must be enclosed in parentheses </span>  
##### <span style="color:green" > - The indexing for both the index and the columns MUST be included </span>  
##### <span style="color:green" > - In this example, that's why `(("12-17-18", "12-21-18"), ("AMZN", "GOOG"))` is followed by `, :`. The latter is the specification of "all columns" </span>  

In [None]:
faang_indexed.loc[(("12-17-18", "12-21-18"), ("AMZN", "GOOG")), :]

##### <span style="color:green" > A *nearly* equivalent formulation with `query()`.</span>  

In [None]:
faang_indexed.query('(date in  ["12-17-18", "12-21-18"]) & (ticker in ["AMZN", "GOOG"])')

##### <span style="color:green" > Adding `sort_index` addresses the difference.</span> 

In [None]:
faang_indexed.query('(date in  ["12-17-18", "12-21-18"]) & (ticker in ["AMZN", "GOOG"])').sort_index(level=0, axis=0)

### <span style="color:green" > **Pick out multiple `date` values but ALL `ticker`s.** </span>  

In [None]:
faang_indexed.loc[(("12-17-18", "12-19-18"), slice(None)), :]

##### <span style="color:green" > `IndexSlice` allows you to stop using `slice(None)` and parentheses and return to using the more natural `:` and `[]`s.</span> 

In [None]:
idx = pd.IndexSlice
faang_indexed.loc[ idx[["12-17-18", "12-19-18"], :], idx[:]]

In [None]:
faang_indexed.loc[ idx[["12-17-18", "12-19-18"], :], idx['high':'open']]

##### <span style="color:green" > The `query` version of this is probably the easiest of them all, but would require sorting the index to produce the same ordering as the other two solutions: </span> 

In [None]:

faang_indexed.query('date in("12-17-18", "12-19-18")')

### <span style="color:green" > **Pick out a `date` range.** </span>  

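##### <span style="color:green" > Interestingly...this will NOT work while the index is unsorted (pandas typically raises an `UnsortedIndexError`). The output shown below was captured after the index had already been sorted in place by the next cell (note the execution counts): </span> 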

In [802]:
faang_indexed.loc[(slice("2018-01","2018-03"), slice(None)), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,high,low,open,close,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,AAPL,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,AMZN,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,FB,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,GOOG,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,NFLX,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...
2018-03-29,AAPL,0.996808,0.986057,0.986189,0.973993,1.502530
2018-03-29,AMZN,1.223084,1.166329,1.199659,1.217265,4.669178
2018-03-29,FB,0.888975,0.868150,0.873199,0.880774,3.274274
2018-03-29,GOOG,0.977562,0.959502,0.964983,0.968817,2.203297


##### <span style="color:green" > `slice`ing on dates requires the index to be sorted </span> 

In [794]:
# Slicing a MultiIndex by a label range needs the index itself to be sorted,
# so sort the index directly (date first, then ticker) with `sort_index`
# instead of going through `sort_values` on the index level names.
faang_indexed.sort_index(inplace=True)
faang_indexed.loc[(slice("2018-01","2018-03"), slice(None)), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,high,low,open,close,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,AAPL,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,AMZN,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,FB,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,GOOG,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,NFLX,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...
2018-03-29,AAPL,0.996808,0.986057,0.986189,0.973993,1.502530
2018-03-29,AMZN,1.223084,1.166329,1.199659,1.217265,4.669178
2018-03-29,FB,0.888975,0.868150,0.873199,0.880774,3.274274
2018-03-29,GOOG,0.977562,0.959502,0.964983,0.968817,2.203297


##### <span style="color:green" > `IndexSlice` again lets us use clearer syntax, or at least syntax similar to other Pandas and Python idioms </span> 

In [796]:
faang_indexed.loc[idx["2018-01" : "2018-03", :],  idx[:]]

Unnamed: 0_level_0,Unnamed: 1_level_0,high,low,open,close,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,AAPL,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,AMZN,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,FB,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,GOOG,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,NFLX,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...
2018-03-29,AAPL,0.996808,0.986057,0.986189,0.973993,1.502530
2018-03-29,AMZN,1.223084,1.166329,1.199659,1.217265,4.669178
2018-03-29,FB,0.888975,0.868150,0.873199,0.880774,3.274274
2018-03-29,GOOG,0.977562,0.959502,0.964983,0.968817,2.203297


##### <span style="color:green" > This might be a case where `query()` isn't as handy as it was in other applications. There may be a way to get it to handle date shorthand like "2018-Q1" or  "2018-01":"2018-03", but I couldn't find a good reference for how to do it. </span> 

In [803]:
start_date = '2018-01-01'
end_date = '2018-03-31'
faang_indexed.query('(date >= @start_date) & (date <= @end_date)')

Unnamed: 0_level_0,Unnamed: 1_level_0,high,low,open,close,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,AAPL,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,AMZN,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,FB,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,GOOG,1.000000,1.000000,1.000000,1.000000,1.000000
2018-01-02,NFLX,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...
2018-03-29,AAPL,0.996808,0.986057,0.986189,0.973993,1.502530
2018-03-29,AMZN,1.223084,1.166329,1.199659,1.217265,4.669178
2018-03-29,FB,0.888975,0.868150,0.873199,0.880774,3.274274
2018-03-29,GOOG,0.977562,0.959502,0.964983,0.968817,2.203297


#### 10. The European Centre for Disease Prevention and Control (ECDC) provides an open dataset on COVID-19 cases called daily number of new reported cases of COVID-19 by country worldwide (https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide). This dataset is updated daily, but we will use a snapshot that contains data through September 18, 2020. Complete the following tasks to practice the skills you've learned up to this point in the book: 

a) Prepare the data:   
i) Read in the data in the covid19_cases.csv file.   
ii) Create a date column by parsing the dateRep column into a datetime.    
iii) Set the date column as the index.     
iv) Use the replace() method to update all occurrences of United_States_of_America and United_Kingdom to USA and UK, respectively.      
v) Sort the index.     
b) For the five countries with the most cases (cumulative), find the day with the largest number of cases.


In [None]:
!head -5 ~/Downloads/data.csv

In [None]:
# Load the downloaded ECDC export, using `dateRep` as the index and parsing
# it into datetimes. dayfirst=True makes ambiguous dates parse as
# day/month/year (presumably dateRep is formatted DD/MM/YYYY — confirm
# against the raw file shown by the `head` cell).
covid = pd.read_csv('~/Downloads/data.csv', index_col='dateRep', parse_dates=True, dayfirst=True)
covid.head()

In [None]:
covid.info()

In [None]:
#?pd.read_csv

In [None]:
#??pd.DataFrame.replace

#### The instructions say the data is through 9/18/2020, so we'll cap the new data at that date so my results can match hers

In [None]:
covid.shape

In [None]:
covid = covid.query('dateRep <= "2020-09-18"')
covid.shape

In [None]:
covid.query('countriesAndTerritories in ["United_States_of_America", "United_Kingdom"]').sample(8)

In [None]:
covid_u = covid.replace({'United_States_of_America':'USA', 'United_Kingdom':'UK'})

In [None]:
covid_u.query('countriesAndTerritories in ["United_States_of_America", "United_Kingdom"]')

In [None]:
covid_u.sort_index(inplace=True)

In [None]:
covid_u.head()

In [None]:
covid_u.tail()

In [None]:
covid_u.index.max()

In [None]:
# Rows for the five countries with the most cumulative cases.
# Built from `covid_u` (countries renamed to USA/UK, index sorted) rather
# than the raw `covid` frame, so part (b) actually analyses the data
# prepared in part (a).
covid_5 = (
    covid_u
    .assign(
        # total cases per country, broadcast back onto every row of that country
        tot_cases=lambda x: x.groupby('countriesAndTerritories').cases.transform('sum'),
        # dense rank: all rows of a country share one rank, 1 = most cases
        case_rank=lambda x: x.tot_cases.rank(method='dense', ascending=False),
    )
    .query('case_rank <= 5')
)

In [None]:
covid_5.groupby('countriesAndTerritories').cases.max()

In [None]:
covid_5.groupby('countriesAndTerritories').cases.idxmax()

In [None]:
covid_5.groupby('countriesAndTerritories').cases.idxmax()

In [None]:
# Cross-check of the top five countries by cumulative cases, computed from
# the prepared `covid_u` frame so the country labels (USA/UK) match the
# rest of the analysis.
covid_u.groupby('countriesAndTerritories').cases.sum().nlargest(5)