Aggregating Data with Pandas

In [1]:
import pandas as pd
# import numpy as py

earthquakesdf = pd.read_csv('earthquakes.csv')
earthquakesdf


Unnamed: 0,mag,magType,time,place,tsunami,parsed_place
0,1.35,ml,1539475168010,"9km NE of Aguanga, CA",0,California
1,1.29,ml,1539475129610,"9km NE of Aguanga, CA",0,California
2,3.42,ml,1539475062610,"8km NE of Aguanga, CA",0,California
3,0.44,ml,1539474978070,"9km NE of Aguanga, CA",0,California
4,2.16,md,1539474716050,"10km NW of Avenal, CA",0,California
...,...,...,...,...,...,...
9327,0.62,md,1537230228060,"9km ENE of Mammoth Lakes, CA",0,California
9328,1.00,ml,1537230135130,"3km W of Julian, CA",0,California
9329,2.40,md,1537229908180,"35km NNE of Hatillo, Puerto Rico",0,Puerto Rico
9330,1.10,ml,1537229545350,"9km NE of Aguanga, CA",0,California


In [2]:
#Filtering the column magType to find the 'ml' using query()
earthquakes_ml = earthquakesdf.query('magType == "ml" ')
earthquakes_ml

Unnamed: 0,mag,magType,time,place,tsunami,parsed_place
0,1.35,ml,1539475168010,"9km NE of Aguanga, CA",0,California
1,1.29,ml,1539475129610,"9km NE of Aguanga, CA",0,California
2,3.42,ml,1539475062610,"8km NE of Aguanga, CA",0,California
3,0.44,ml,1539474978070,"9km NE of Aguanga, CA",0,California
6,1.70,ml,1539473176017,"105km W of Talkeetna, Alaska",0,Alaska
...,...,...,...,...,...,...
9325,0.51,ml,1537230344890,"4km WNW of Julian, CA",0,California
9326,1.82,ml,1537230230260,"4km W of Julian, CA",0,California
9328,1.00,ml,1537230135130,"3km W of Julian, CA",0,California
9330,1.10,ml,1537229545350,"9km NE of Aguanga, CA",0,California


In [3]:
#Creating bins using the filtered dataset

minMag = int(earthquakesdf['mag'].min())
maxMag = int(earthquakesdf['mag'].max())

bins = [i for i in range(minMag, maxMag + 2)]
bins

[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8]

In [4]:
#Creating the bins for min and max
magCounts = pd.cut(earthquakesdf['mag'], bins= bins, right= False).value_counts()
magCounts


mag
[1, 2)     3827
[0, 1)     2802
[2, 3)     1277
[4, 5)      536
[-1, 0)     477
[3, 4)      247
[5, 6)      139
[6, 7)       10
[7, 8)        2
Name: count, dtype: int64

In [5]:
fngCsv = pd.read_csv('faang.csv', parse_dates=['date'])
fngCsv .set_index('date', inplace= True)
fngCsv

Unnamed: 0_level_0,ticker,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,FB,177.68,181.58,177.5500,181.42,18151903
2018-01-03,FB,181.88,184.78,181.3300,184.67,16886563
2018-01-04,FB,184.90,186.21,184.0996,184.33,13880896
2018-01-05,FB,185.59,186.90,184.9300,186.85,13574535
2018-01-08,FB,187.20,188.90,186.3300,188.28,17994726
...,...,...,...,...,...,...
2018-12-24,GOOG,973.90,1003.54,970.1100,976.22,1590328
2018-12-26,GOOG,989.01,1040.00,983.0000,1039.46,2373270
2018-12-27,GOOG,1017.15,1043.89,997.0000,1043.88,2109777
2018-12-28,GOOG,1049.62,1055.56,1033.1000,1037.08,1413772


In [6]:
# Grouping by 'ticker' and resampling to monthly end frequency
resmpled=fngCsv.resample('ME').first()
resmpled

Unnamed: 0_level_0,ticker,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-31,FB,177.68,181.58,177.55,181.42,18151903
2018-02-28,FB,188.22,195.32,187.89,193.09,54211293
2018-03-31,FB,179.01,180.12,174.41,175.94,23201626
2018-04-30,FB,157.81,159.2,154.111,155.39,36795991
2018-05-31,FB,172.0,174.02,170.23,173.86,26025932
2018-06-30,FB,193.065,194.5492,192.07,193.99,17307245
2018-07-31,FB,193.37,197.45,192.22,197.36,13961578
2018-08-31,FB,173.93,175.08,170.9,171.65,34042109
2018-09-30,FB,173.5,173.89,168.8,171.16,29808971
2018-10-31,FB,163.03,165.88,161.26,162.44,26407677


In [7]:
import statistics as stat

#Finding the mean of the open price column

openCols = fngCsv['open']
mean = stat.mean(openCols)
print(mean)

687.1480807968128


In [8]:
#Finding the Highest price in the data set

maxPrice = fngCsv['high'].max()
maxPrice

2050.5

In [9]:
#Finding the Lowest price in the data set

minPrice = fngCsv['low'].min()
minPrice

123.02

In [10]:
#Finding the mean of the closing price
closeCols = fngCsv['close']
newMean = stat.mean(closeCols)
print(newMean)

686.5477534661354


In [11]:
fngCsv['volume'].sum()

20223555523

In [12]:
#Building a crosstab between the magType and tsunami column

crossTab = pd.crosstab(index = earthquakesdf['tsunami'], 
                       columns = earthquakesdf['magType'], 
                       values=earthquakesdf['mag'], aggfunc='max')
crossTab


magType,mb,mb_lg,md,mh,ml,ms_20,mw,mwb,mwr,mww
tsunami,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,5.6,3.5,4.11,1.1,4.2,,3.83,5.8,4.8,6.0
1,6.1,,,,5.1,5.7,4.41,,,7.5


In [13]:
"""
Applying the 60D rolling function using the same aggregations found in no. 3
for each OHLC and ticker using rolling() and agg()

"""
rollStats = fngCsv.groupby('ticker').apply(lambda x: x.rolling(window=60).agg({
    'open': 'mean',
    'high': 'max',
    'low': 'min',
    'close': 'mean',
    'volume': 'sum'
}))

rollStats


  rollStats = fngCsv.groupby('ticker').apply(lambda x: x.rolling(window=60).agg({


Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2018-01-02,,,,,
AAPL,2018-01-03,,,,,
AAPL,2018-01-04,,,,,
AAPL,2018-01-05,,,,,
AAPL,2018-01-08,,,,,
...,...,...,...,...,...,...
NFLX,2018-12-24,306.018050,386.7999,233.68,303.239833,811001766.0
NFLX,2018-12-26,303.596050,386.7999,231.23,301.232167,818289623.0
NFLX,2018-12-27,301.500383,386.7999,231.23,299.134417,822148280.0
NFLX,2018-12-28,299.393050,380.9300,231.23,297.116750,824496849.0


In [15]:
#Creating pivot table

pivotTable = pd.pivot_table(data= fngCsv, index= ['ticker'])
pivotTable

Unnamed: 0_level_0,close,high,low,open,volume
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAPL,186.986218,188.906858,185.135729,187.038674,34021450.0
AMZN,1641.726175,1662.839801,1619.840398,1644.072669,5649563.0
FB,171.510936,173.615298,169.30311,171.454424,27687980.0
GOOG,1113.225139,1125.777649,1101.001594,1113.554104,1742645.0
NFLX,319.290299,325.224583,313.187273,319.620533,11470300.0


In [54]:
#Calculating the z-scores of NFLX
import numpy as np
import scipy.stats as stats

#
pivotTable.loc['NFLX', ['close', 'high', 'low', 'open']].apply(lambda x : stats.zscore(x))


AxisError: axis 0 is out of bounds for array of dimension 0

In [64]:
#Since mine kept on saying that there were no data found (dimension 0) I tried getting the zscores of the OHLC for each ticker
zScores = pivotTable.loc[:, ['close', 'high', 'low', 'open']].apply(lambda x : stats.zscore(x))
zScores #Stored the zscore data here

Unnamed: 0_level_0,close,high,low,open
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAPL,-0.845835,-0.84701,-0.844539,-0.845665
AMZN,1.617265,1.618472,1.615403,1.618121
FB,-0.872037,-0.872589,-0.871685,-0.872017
GOOG,0.722431,0.720115,0.725803,0.721035
NFLX,-0.621824,-0.618988,-0.624982,-0.621474


In [65]:
#After that I showed the row in which all the OHLC of the NFLX ticker has its own zscores.
zNFLX = zScores.loc['NFLX']
zNFLX

close   -0.621824
high    -0.618988
low     -0.624982
open    -0.621474
Name: NFLX, dtype: float64

In [68]:
#Creating a new dataframe

newD = {'ticker' : 'FB' , 
        'date' : ['2018-07-25' , '2018-03-19' , '2018-03-20'], 
        'event' : ['Disappointing user growth announced after close' , 'Cambridge Analytica Story', 'FTC investigation']}
newDataFrame = pd.DataFrame(data= newD).set_index(['date' , 'ticker'])
newDataFrame

Unnamed: 0_level_0,Unnamed: 1_level_0,event
date,ticker,Unnamed: 2_level_1
2018-07-25,FB,Disappointing user growth announced after close
2018-03-19,FB,Cambridge Analytica Story
2018-03-20,FB,FTC investigation


In [72]:
newDataFrame.iloc['ticker']

TypeError: Cannot index by location index with a non-integer key

In [76]:
newDataFrame

Unnamed: 0_level_0,Unnamed: 1_level_0,event
date,ticker,Unnamed: 2_level_1
2018-07-25,FB,Disappointing user growth announced after close
2018-03-19,FB,Cambridge Analytica Story
2018-03-20,FB,FTC investigation


In [77]:
#Using Outer join
resultDf = pd.merge(newDataFrame, rollStats, on = 'ticker', how = 'outer')
resultDf

Unnamed: 0_level_0,event,open,high,low,close,volume
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,,,,,,
AAPL,,,,,,
AAPL,,,,,,
AAPL,,,,,,
AAPL,,,,,,
...,...,...,...,...,...,...
NFLX,,306.018050,386.7999,233.68,303.239833,811001766.0
NFLX,,303.596050,386.7999,231.23,301.232167,818289623.0
NFLX,,301.500383,386.7999,231.23,299.134417,822148280.0
NFLX,,299.393050,380.9300,231.23,297.116750,824496849.0


In [79]:
transFormedData = fngCsv.groupby('ticker').transform(lambda x: x / x.iloc[0])

print(transFormedData)

                open      high       low     close    volume
date                                                        
2018-01-02  1.000000  1.000000  1.000000  1.000000  1.000000
2018-01-03  1.023638  1.017623  1.021290  1.017914  0.930292
2018-01-04  1.040635  1.025498  1.036889  1.016040  0.764707
2018-01-05  1.044518  1.029298  1.041566  1.029931  0.747830
2018-01-08  1.053579  1.040313  1.049451  1.037813  0.991341
...              ...       ...       ...       ...       ...
2018-12-24  0.928993  0.940578  0.928131  0.916638  1.285047
2018-12-26  0.943406  0.974750  0.940463  0.976019  1.917695
2018-12-27  0.970248  0.978396  0.953857  0.980169  1.704782
2018-12-28  1.001221  0.989334  0.988395  0.973784  1.142383
2018-12-31  1.002499  0.986653  0.979296  0.972404  1.206986

[1255 rows x 5 columns]
