In [1]:
import sqlite3
import os
import pandas as pd
from datetime import datetime
import numpy as np
import os

#Start from a clean slate: delete any database file left behind by an earlier run of this lesson
if os.path.exists("Stock.db"):
    os.remove("Stock.db")


#Build the sample data: IDs 1-100 repeated for each of the 10 days from 2019-01-01 to 2019-01-10
ids = list(range(1, 101))
days = pd.date_range(datetime(2019, 1, 1), datetime(2019, 1, 10))
sectors = ["Healthcare", "Financials", "Energy", "Consumer Staples", "Consumer Discretionary"]

data = pd.DataFrame({"ID": ids * 10})
#Every day appears 100 times in a row, once for each ID
data['Date'] = [day for day in days for _ in range(100)]
#The five sectors cycle in order across all 1000 rows
data['Sector'] = sectors * 200
#Seed the generator so the simulated returns are the same on every run
np.random.seed(1)
data['Return'] = np.random.normal(0, .01, 1000)
print(data.head())

#Write the frame to a "Returns" table; index=False keeps the DataFrame index out of the SQL table
conn = sqlite3.connect('Stock.db')
data.to_sql("Returns", conn, index=False)

   ID       Date                  Sector    Return
0   1 2019-01-01              Healthcare  0.016243
1   2 2019-01-01              Financials -0.006118
2   3 2019-01-01                  Energy -0.005282
3   4 2019-01-01        Consumer Staples -0.010730
4   5 2019-01-01  Consumer Discretionary  0.008654


In [2]:
#Recap: AVG with no grouping collapses the whole table into a single mean return
query = (
    "SELECT AVG(Return) "
    "FROM Returns"
)
df = pd.read_sql(sql=query, con=conn)
print(df)

   AVG(Return)
0     0.000388


In [3]:
#With GROUP BY the aggregate is computed once per group - here one average return per ID
query = (
    "SELECT AVG(Return) "
    "FROM Returns "
    "GROUP BY ID"
)
df = pd.read_sql(sql=query, con=conn)
print(df)

    AVG(Return)
0     -0.001680
1      0.002803
2     -0.003813
3      0.003057
4     -0.004590
5     -0.003555
6     -0.001688
7     -0.001561
8     -0.000292
9      0.003888
10     0.002635
11    -0.001818
12     0.004797
13    -0.004476
14     0.003047
15    -0.001718
16     0.001358
17    -0.001475
18     0.002744
19    -0.002742
20     0.001226
21     0.000562
22     0.005691
23     0.002663
24    -0.000880
25     0.004959
26     0.002273
27    -0.000344
28    -0.000660
29     0.001787
..          ...
70    -0.001657
71     0.001088
72     0.006655
73     0.004894
74     0.002682
75    -0.001377
76    -0.002454
77    -0.002681
78     0.000419
79     0.002018
80     0.001837
81    -0.004058
82    -0.003919
83     0.005823
84     0.005576
85    -0.003824
86     0.002121
87     0.004864
88    -0.000601
89    -0.003235
90    -0.002409
91     0.002454
92    -0.001091
93     0.003813
94    -0.000996
95    -0.001327
96    -0.001118
97    -0.000082
98    -0.001981
99     0.003341

[100 rows x 1 columns]

In [4]:
#Select the grouping column as well, otherwise the result does not say which ID each average belongs to
query = (
    "SELECT ID, AVG(Return) "
    "FROM Returns "
    "GROUP BY ID"
)
df = pd.read_sql(sql=query, con=conn)
print(df)

     ID  AVG(Return)
0     1    -0.001680
1     2     0.002803
2     3    -0.003813
3     4     0.003057
4     5    -0.004590
5     6    -0.003555
6     7    -0.001688
7     8    -0.001561
8     9    -0.000292
9    10     0.003888
10   11     0.002635
11   12    -0.001818
12   13     0.004797
13   14    -0.004476
14   15     0.003047
15   16    -0.001718
16   17     0.001358
17   18    -0.001475
18   19     0.002744
19   20    -0.002742
20   21     0.001226
21   22     0.000562
22   23     0.005691
23   24     0.002663
24   25    -0.000880
25   26     0.004959
26   27     0.002273
27   28    -0.000344
28   29    -0.000660
29   30     0.001787
..  ...          ...
70   71    -0.001657
71   72     0.001088
72   73     0.006655
73   74     0.004894
74   75     0.002682
75   76    -0.001377
76   77    -0.002454
77   78    -0.002681
78   79     0.000419
79   80     0.002018
80   81     0.001837
81   82    -0.004058
82   83    -0.003919
83   84     0.005823
84   85     0.005576
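85   86    -0.003824
86   87     0.002121
87   88     0.004864
88   89    -0.000601
89   90    -0.003235
90   91    -0.002409
91   92     0.002454
92   93    -0.001091
93   94     0.003813
94   95    -0.000996
95   96    -0.001327
96   97    -0.001118
97   98    -0.000082
98   99    -0.001981
99  100     0.003341

[100 rows x 2 columns]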

In [5]:
#The same pattern works for any column - grouping on Date gives one average return per day
query = (
    "SELECT Date, AVG(Return) "
    "FROM Returns "
    "GROUP BY Date"
)
df = pd.read_sql(sql=query, con=conn)
print(df)

                  Date  AVG(Return)
0  2019-01-01 00:00:00     0.000606
1  2019-01-02 00:00:00     0.001528
2  2019-01-03 00:00:00     0.000098
3  2019-01-04 00:00:00    -0.000204
4  2019-01-05 00:00:00     0.000643
5  2019-01-06 00:00:00     0.001256
6  2019-01-07 00:00:00    -0.000575
7  2019-01-08 00:00:00    -0.000974
8  2019-01-09 00:00:00     0.001069
9  2019-01-10 00:00:00     0.000433


In [6]:
#Several columns can be grouped together: one average for every (Date, Sector) combination
query = (
    "SELECT Date, Sector, AVG(Return) "
    "FROM Returns "
    "GROUP BY Date, Sector"
)
df = pd.read_sql(sql=query, con=conn)
print(df)

                   Date                  Sector  AVG(Return)
0   2019-01-01 00:00:00  Consumer Discretionary     0.003899
1   2019-01-01 00:00:00        Consumer Staples     0.001985
2   2019-01-01 00:00:00                  Energy     0.000709
3   2019-01-01 00:00:00              Financials    -0.000629
4   2019-01-01 00:00:00              Healthcare    -0.002935
5   2019-01-02 00:00:00  Consumer Discretionary    -0.000042
6   2019-01-02 00:00:00        Consumer Staples     0.005859
7   2019-01-02 00:00:00                  Energy     0.003048
8   2019-01-02 00:00:00              Financials     0.002243
9   2019-01-02 00:00:00              Healthcare    -0.003469
10  2019-01-03 00:00:00  Consumer Discretionary    -0.001522
11  2019-01-03 00:00:00        Consumer Staples     0.001333
12  2019-01-03 00:00:00                  Energy     0.001264
13  2019-01-03 00:00:00              Financials    -0.000820
14  2019-01-03 00:00:00              Healthcare     0.000236
15  2019-01-04 00:00:00 

In [7]:
#Starting point for the view: average return per sector, with the aggregate aliased back to "Return"
query = (
    "SELECT Sector, AVG(Return) as Return "
    "FROM Returns "
    "GROUP BY Sector"
)
df = pd.read_sql(sql=query, con=conn)
print(df)

                   Sector    Return
0  Consumer Discretionary -0.000082
1        Consumer Staples  0.001278
2                  Energy  0.000615
3              Financials  0.000080
4              Healthcare  0.000049


In [8]:
#Turn it into a view: a saved SELECT that can afterwards be queried like a table
#IF NOT EXISTS keeps this cell re-runnable - a plain CREATE VIEW raises sqlite3.OperationalError
#when the cell is executed a second time because the view is already there
cur = conn.cursor()
query = "CREATE VIEW IF NOT EXISTS [Sector Average] AS SELECT Sector, AVG(Return) as Return FROM Returns GROUP BY Sector"
cur.execute(query)

<sqlite3.Cursor at 0x11644f490>

In [9]:
#The view is now queried by name, exactly as if it were a table
query = (
    "SELECT * "
    "FROM [Sector Average]"
)
df = pd.read_sql(sql=query, con=conn)
print(df)

                   Sector    Return
0  Consumer Discretionary -0.000082
1        Consumer Staples  0.001278
2                  Energy  0.000615
3              Financials  0.000080
4              Healthcare  0.000049


In [10]:
#A view always reflects the current contents of the table underneath it:
#remove every row with a negative return...
cur.execute("DELETE FROM Returns WHERE Return < 0")

#...and the sector averages reported by the view change accordingly
query = (
    "SELECT * "
    "FROM [Sector Average]"
)
df = pd.read_sql(sql=query, con=conn)
print(df)

                   Sector    Return
0  Consumer Discretionary  0.008219
1        Consumer Staples  0.007831
2                  Energy  0.007885
3              Financials  0.007660
4              Healthcare  0.008150


In [11]:
#Restore the original rows by overwriting the Returns table with the full DataFrame again
data.to_sql(name="Returns", con=conn, if_exists='replace', index=False)

In [12]:
#With the table rebuilt, the view reports the original sector averages once more
query = (
    "SELECT * "
    "FROM [Sector Average]"
)
df = pd.read_sql(sql=query, con=conn)
print(df)

                   Sector    Return
0  Consumer Discretionary -0.000082
1        Consumer Staples  0.001278
2                  Energy  0.000615
3              Financials  0.000080
4              Healthcare  0.000049


In [13]:
#IN keeps the rows whose column value matches any entry of the given list
#Here it selects the two consumer sectors from the view and averages their two sector-level averages
query = (
    "SELECT AVG(Return) "
    "FROM [Sector Average] "
    "WHERE Sector IN ('Consumer Discretionary', 'Consumer Staples')"
)
df = pd.read_sql(sql=query, con=conn)
print(df)

   AVG(Return)
0     0.000598


In [14]:
#A query can be nested inside another query

#Query 1: average return for each sector
query = (
    "SELECT Sector, AVG(Return) as Return "
    "FROM Returns "
    "GROUP BY Sector"
)
df = pd.read_sql(sql=query, con=conn)
print(df)
#Two empty lines separate the two result tables
print("\n")

#Query 2: treat query 1 (inside the parentheses) as the source table and keep only
#the sectors whose average return is greater than 0
query = (
    "SELECT Sector "
    "FROM (SELECT Sector, AVG(Return) as Return FROM Returns GROUP BY Sector) "
    "WHERE Return > 0"
)
df = pd.read_sql(sql=query, con=conn)
print(df)

                   Sector    Return
0  Consumer Discretionary -0.000082
1        Consumer Staples  0.001278
2                  Energy  0.000615
3              Financials  0.000080
4              Healthcare  0.000049


             Sector
0  Consumer Staples
1            Energy
2        Financials
3        Healthcare


In [15]:
#Building the nested query from two strings makes the structure easier to see:
#query1 is dropped in between the parentheses of query2
query1 = (
    "SELECT Sector, AVG(Return) as Return "
    "FROM Returns "
    "GROUP BY Sector"
)
query2 = "SELECT Sector FROM (" + query1 + ") WHERE Return > 0"
df = pd.read_sql(sql=query2, con=conn)
print(df)

             Sector
0  Consumer Staples
1            Energy
2        Financials
3        Healthcare


In [16]:
#HAVING gives the same result without a nested query
#Its condition is evaluated after the aggregation, so this keeps only the sectors
#whose grouped average return is positive
query2 = (
    "SELECT Sector "
    "FROM Returns "
    "GROUP BY Sector "
    "HAVING AVG(Return) > 0"
)
df = pd.read_sql(sql=query2, con=conn)
print(df)

             Sector
0  Consumer Staples
1            Energy
2        Financials
3        Healthcare


In [17]:
#Count, for each sector, how many individual rows have a positive return
#WHERE comes before GROUP BY, so the filter is applied to the single records and not to the groups
query3 = (
    "SELECT Sector, COUNT(Return) as Positive "
    "FROM Returns "
    "WHERE Return>0 "
    "GROUP BY Sector"
)
df = pd.read_sql(sql=query3, con=conn)
print(df)

                   Sector  Positive
0  Consumer Discretionary        96
1        Consumer Staples       113
2                  Energy       106
3              Financials       105
4              Healthcare        94


In [18]:
#Nest both earlier queries: query3 provides the positive-return counts per sector, and query2
#restricts the result to the sectors whose average return is positive
query4 = "SELECT Sector, Positive FROM ({counts}) WHERE Sector IN ({kept})".format(
    counts=query3,
    kept=query2,
)
df = pd.read_sql(sql=query4, con=conn)
print(df)

             Sector  Positive
0  Consumer Staples       113
1            Energy       106
2        Financials       105
3        Healthcare        94


In [19]:
#Print the fully assembled SQL text to see what is actually sent to the database
print("{}".format(query4))

SELECT Sector, Positive FROM (SELECT Sector, COUNT(Return) as Positive FROM Returns WHERE Return>0 GROUP BY Sector) WHERE Sector IN (SELECT Sector FROM Returns GROUP BY Sector HAVING AVG(Return) > 0)


In [20]:
#BETWEEN restricts a column to a range of values, with both end points included
#The bounds are written in the same 'YYYY-MM-DD HH:MM:SS' form in which the Date values come back from the table
query = (
    "SELECT Date, AVG(Return) as Return "
    "FROM Returns "
    "WHERE Date BETWEEN '2019-01-01 00:00:00' AND '2019-01-03 00:00:00' "
    "GROUP BY Date"
)
df = pd.read_sql(sql=query, con=conn)
print(df)
#Two empty lines after the table
print("\n")

                  Date    Return
0  2019-01-01 00:00:00  0.000606
1  2019-01-02 00:00:00  0.001528
2  2019-01-03 00:00:00  0.000098


