In [9]:
import pandas as pd
import sqlite3

## 1. Put the database in the data subfolder in the src directory.
## 2. Create a connection to the database using the sqlite3 library.

In [10]:
db_path = "../data/checking-logs.sqlite"
# Open through a SQLite URI in read-only mode. A plain sqlite3.connect(db_path)
# silently creates a new, empty database when the path is wrong, and the
# mistake only surfaces later as "no such table: pageviews". With mode=ro a
# missing file raises sqlite3.OperationalError right here, and the queries in
# this notebook (PRAGMA / SELECT only) cannot modify the log database.
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)

## 3. Get the schema of the pageviews table using pd.io.sql.read_sql() and the query "PRAGMA table_info(pageviews);".

In [11]:
# PRAGMA table_info yields one row per column of the named table; load it
# into a DataFrame so the notebook renders it as a table.
table_info_query = "PRAGMA table_info(pageviews);"
schema_pageviews = pd.io.sql.read_sql(sql=table_info_query, con=conn)
schema_pageviews

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,uid,TEXT,0,,0
2,2,datetime,TIMESTAMP,0,,0


## 4. Get only the first ten rows of the pageviews table to see what the table looks like.

In [12]:
# Peek at the raw table: LIMIT keeps the preview to ten rows, and the bare
# last expression lets the notebook render the resulting DataFrame.
preview_query = "SELECT * FROM pageviews LIMIT 10"
pd.io.sql.read_sql(sql=preview_query, con=conn)

Unnamed: 0,index,uid,datetime
0,0,admin_1,2020-04-17 12:01:08.463179
1,1,admin_1,2020-04-17 12:01:23.743946
2,2,admin_3,2020-04-17 12:17:39.287778
3,3,admin_3,2020-04-17 12:17:40.001768
4,4,admin_1,2020-04-17 12:27:30.646665
5,5,admin_1,2020-04-17 12:35:44.884757
6,6,admin_1,2020-04-17 12:35:52.735016
7,7,admin_3,2020-04-17 12:36:21.401412
8,8,admin_3,2020-04-17 12:36:22.023355
9,9,admin_1,2020-04-17 13:55:19.129243


## 5. Get the subtable using one query where:
- Only "uid" and "datetime" are used.
- Only user data (user_*), not admin data, is used.
- It is sorted by "uid" in ascending order.
- The index column is "datetime".



In [13]:
# Select only user rows (uid starting with the literal prefix "user_"),
# keeping just uid and datetime, ordered by uid.
# In a LIKE pattern "_" is itself a wildcard for any single character, so the
# underscore is escaped via the ESCAPE clause; otherwise a uid such as
# "userX1" would also pass the filter.
user_views_query = (
    "SELECT uid, datetime "
    "FROM pageviews "
    "WHERE uid LIKE 'user!_%' ESCAPE '!' "
    "ORDER BY uid ASC;"
)
# parse_dates turns the "datetime" column into datetime64 values on load.
pageviews = pd.io.sql.read_sql(user_views_query, conn, parse_dates=['datetime'])

- "datetime" is converted to a DatetimeIndex.


In [14]:
# Promote the "datetime" column to the index. Reassigning (instead of
# inplace=True) avoids mutating the frame behind the back of earlier cells,
# and the guard makes the cell safe to re-run: once "datetime" is already the
# index, a second set_index call would raise KeyError.
if pageviews.index.name != "datetime":
    pageviews = pageviews.set_index("datetime")

- The name of the dataframe is pageviews.

In [15]:
# Coerce the index to datetime values so the frame is indexed by a
# DatetimeIndex, then display the resulting table.
datetime_index = pd.to_datetime(pageviews.index)
pageviews.index = datetime_index
pageviews

Unnamed: 0_level_0,uid
datetime,Unnamed: 1_level_1
2020-04-26 21:53:59.624136,user_1
2020-04-26 22:06:19.478143,user_1
2020-04-26 22:12:09.614497,user_1
2020-04-30 19:29:01.831635,user_1
2020-05-05 20:26:32.894852,user_1
...,...
2020-04-29 16:51:21.877630,user_30
2020-05-09 20:30:47.034282,user_30
2020-05-22 11:30:18.368990,user_5
2020-05-21 16:28:28.217529,user_9


## 6. Close the connection to the database.

In [16]:
# Release the SQLite connection; the query result was already loaded into the
# `pageviews` DataFrame, so no further database access is needed.
conn.close()
# Print a summary of the result: index type and range, column dtypes and
# non-null counts.
pageviews.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 987 entries, 2020-04-26 21:53:59.624136 to 2020-05-21 16:36:40.915488
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   uid     987 non-null    object
dtypes: object(1)
memory usage: 15.4+ KB
