# Exercise *03*: Aggregations

## Create a connection to the database using the library *sqlite3*:

In [1]:
import sqlite3

from sqlite3 import Connection

In [2]:
db_path: str = "../../data/checking_logs.sqlite"

In [3]:
conn: Connection = sqlite3.connect(db_path)

## Get the schema of the table *test*:

In [4]:
import pandas as pd

from pandas import DataFrame

In [5]:
sql_query: str = "PRAGMA table_info(test);"

In [6]:
schema: DataFrame = pd.io.sql.read_sql(
    sql_query,
    conn
)

In [7]:
# schema

## Get only the first *10* rows of the table *test* to check what the table looks like:

In [8]:
sql_query: str = """
                 SELECT
                     *
                 FROM
                     test
                 LIMIT
                     10;
                 """

In [9]:
rows: DataFrame = pd.io.sql.read_sql(
    sql_query,
    conn
)

In [10]:
# rows

## Find among all the users the minimum value of the delta between the first commit of the user and the deadline of the corresponding lab using only one query:

* Do this by joining the table with the table *deadlines*:
* The difference should be displayed in hours:
* Do not take the lab *’project1’* into account, it has longer deadlines and will be an outlier:
* The value should be stored in the dataframe `df_min` with the corresponding *uid*.

In [11]:
sql_query: str = """
                 SELECT
                     test.uid AS uid,
                     MIN((unixepoch(test.first_commit_ts) - deadlines.deadlines) / 3600) AS min_delt
                 FROM
                     test
                 INNER JOIN
                     deadlines ON test.labname = deadlines.labs
                 WHERE
                     test.labname != 'project1';
                 """

In [12]:
df_min: DataFrame = pd.io.sql.read_sql(
    sql_query,
    conn
)

In [13]:
# df_min

## Do the same thing, but for the maximum, using only one query, the dataframe name is `df_max`:

In [14]:
sql_query: str = """
                 SELECT
                     test.uid AS uid,
                     MAX((unixepoch(test.first_commit_ts) - deadlines.deadlines) / 3600) AS max_delt
                 FROM
                     test
                 INNER JOIN
                     deadlines ON test.labname = deadlines.labs
                 WHERE
                     test.labname != 'project1';
                 """

In [15]:
df_max: DataFrame = pd.io.sql.read_sql(
    sql_query,
    conn
)

In [16]:
# df_max

## Do the same thing but for the average, using only one query, this time your dataframe should not include the *uid* column, and the dataframe name is `df_avg`:

In [17]:
sql_query: str = """
                 SELECT
                     AVG((unixepoch(test.first_commit_ts) - deadlines.deadlines) / 3600) AS avg_delt
                 FROM
                     test
                 INNER JOIN
                     deadlines ON test.labname = deadlines.labs
                 WHERE
                     test.labname != 'project1';
                 """

In [18]:
df_avg: DataFrame = pd.io.sql.read_sql(
    sql_query,
    conn
)

In [19]:
# df_avg

## We want to test the hypothesis that the users who visited the *newsfeed* just a few times have the lower delta between the first commit and the deadline. To do this, you need to calculate the correlation coefficient between the number of pageviews and the difference:

* Using only one query, create a table with the columns: *uid*, *avg_diff*, *pageviews*:
* *uid* is the uids that exist in the test:
* *avg_diff* is the average delta between the first commit and the lab deadline per user:
* *pageviews* is the number of *Newsfeed* visits per user:
* Do not take the lab *’project1’* into account:
* Store it to the dataframe `views_diff`:
* Use the *Pandas* method `corr()` to calculate the correlation coefficient between the number of *pageviews* and the *difference*.

In [20]:
sql_query: str = """
                 SELECT
                     test.uid AS uid,
                     AVG((unixepoch(test.first_commit_ts) - deadlines.deadlines) / 3600) AS avg_diff,
                     COUNT(pageviews.datetime) AS pageviews
                 FROM
                     test
                 INNER JOIN
                     deadlines ON test.labname = deadlines.labs
                 LEFT OUTER JOIN
                     pageviews ON test.uid = pageviews.uid
                 WHERE
                     test.labname != 'project1'
                 GROUP BY
                     test.uid;
                 """

In [21]:
views_diff: DataFrame = pd.io.sql.read_sql(
    sql_query,
    conn
)

In [22]:
float(round(
    views_diff["pageviews"].corr(views_diff["avg_diff"]),
    4
))

-0.1858

In [23]:
# views_diff

# Close the connection:

In [24]:
conn.close()