In [54]:
import pandas as pd
import sqlite3

## 1. Create a connection to the database using the sqlite3 library.

In [55]:
db_path = "../data/checking-logs.sqlite"
conn = sqlite3.connect(db_path)

## 2. Get the schema of the test table.


In [56]:
test = pd.read_sql_query("PRAGMA table_info(test);", conn)
test 

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,uid,TEXT,0,,0
1,1,labname,TEXT,0,,0
2,2,first_commit_ts,TIMESTAMP,0,,0
3,3,first_view_ts,TIMESTAMP,0,,0


## 3. Get only the first ten rows of the test table to see what it looks like.

In [57]:
test = pd.read_sql_query("SELECT * FROM test LIMIT 10;", conn)
test

Unnamed: 0,uid,labname,first_commit_ts,first_view_ts
0,user_1,laba04,2020-04-26 17:06:18.462708,2020-04-26 21:53:59.624136
1,user_1,laba04s,2020-04-26 17:12:11.843671,2020-04-26 21:53:59.624136
2,user_1,laba05,2020-05-02 19:15:18.540185,2020-04-26 21:53:59.624136
3,user_1,laba06,2020-05-17 16:26:35.268534,2020-04-26 21:53:59.624136
4,user_1,laba06s,2020-05-20 12:23:37.289724,2020-04-26 21:53:59.624136
5,user_1,project1,2020-05-14 20:56:08.898880,2020-04-26 21:53:59.624136
6,user_10,laba04,2020-04-25 08:24:52.696624,2020-04-18 12:19:50.182714
7,user_10,laba04s,2020-04-25 08:37:54.604222,2020-04-18 12:19:50.182714
8,user_10,laba05,2020-05-01 19:27:26.063245,2020-04-18 12:19:50.182714
9,user_10,laba06,2020-05-19 11:39:28.885637,2020-04-18 12:19:50.182714


In [58]:
test = pd.read_sql_query("SELECT * FROM deadlines LIMIT 10;", conn)
test

Unnamed: 0,index,labs,deadlines
0,0,laba04,1587945599
1,1,laba04s,1587945599
2,2,laba05,1588550399
3,4,laba06,1590364799
4,5,laba06s,1590364799
5,3,project1,1589673599


## 4. Find the minimum value of the delta between the first commit and the deadline of the corresponding lab for all users using only one query.
- Do this by joining the table with the deadlines table.
- The difference should be displayed in hours.
- Do not take lab project1 into account; it has longer deadlines and will be an outlier.
- The value should be stored in the dataframe df_min with the corresponding uid.


In [59]:
df_min = pd.read_sql_query("""
    SELECT t.uid, MIN((CAST(strftime('%s',t.first_commit_ts) AS INTEGER ) - d.deadlines ) / 3600) as min_diff
    FROM test t
    JOIN deadlines d ON t.labname = d.labs
    WHERE t.labname != 'project1'
    GROUP BY t.uid
    ORDER BY min_diff
    LIMIT 1
""", conn)

df_min

Unnamed: 0,uid,min_diff
0,user_30,-202


## 5. Do the same thing for the maximum, but use only one query. The dataframe name is df_max.

In [60]:
df_max = pd.read_sql_query("""
    SELECT t.uid, MAX((CAST(strftime('%s',t.first_commit_ts) AS INTEGER ) - d.deadlines ) / 3600) as max_diff
    FROM test t
    JOIN deadlines d ON t.labname = d.labs
    WHERE t.labname != 'project1'
    GROUP BY t.uid
    ORDER BY max_diff DESC
    LIMIT 1
""", conn)

df_max

Unnamed: 0,uid,max_diff
0,user_25,-2


## 6. Do the same thing, but for the average. Use only one query. This time, your dataframe should not include the uid column. The dataframe name is df_avg.

In [61]:
df_avg = pd.read_sql_query("""
    SELECT AVG((CAST(strftime('%s',t.first_commit_ts) AS INTEGER ) - d.deadlines ) / 3600) as avg_diff
    FROM test t
    JOIN deadlines d ON t.labname = d.labs
    WHERE t.labname != 'project1'
    LIMIT 1
""", conn)

df_avg

Unnamed: 0,avg_diff
0,-89.125


## 7. We want to test the hypothesis that users who visited the newsfeed just a few times have a lower delta between the first commit and the deadline. To do this, calculate the correlation coefficient between the number of pageviews and the difference.
- Using only one query, create a table with the following columns: "uid", "avg_diff", and "pageviews".
- "uid" is the uids that exist in the test.
- "avg_diff" is the average delta between the first commit and the lab deadline per user.
- "pageviews" is the number of Newsfeed visits per user.
- Do not take the lab project1 into account.
- Store it in the dataframe views_diff.
- Use the Pandas corr() method to calculate the correlation coefficient between the number of pageviews and the difference.

In [62]:
views_diff = pd.read_sql_query("""
    SELECT 
        t.uid,
        AVG((d.deadlines - strftime('%s', t.first_commit_ts)) / 3600.0) AS avg_diff,
       COUNT(p.uid) AS pageviews
    FROM test t
    JOIN deadlines d ON t.labname = d.labs
    LEFT JOIN pageviews p ON t.uid = p.uid
    WHERE t.labname != 'project1'
    GROUP BY t.uid
""", conn)
views_diff = views_diff[['pageviews', 'avg_diff']]
views_diff

Unnamed: 0,pageviews,avg_diff
0,140,65.119778
1,445,75.242444
2,429,159.568796
3,235,62.207667
4,9,6.368148
5,64,99.440417
6,40,96.111181
7,895,93.474944
8,745,86.793833
9,1585,105.738222


In [63]:
correlation = views_diff.corr()
correlation

Unnamed: 0,pageviews,avg_diff
pageviews,1.0,0.185042
avg_diff,0.185042,1.0


In [64]:
conn.close()