# Piazza Data Collection & Analysis
Zehua Li
April 2017

This notebook examines the timing distributions of question and answer posts on Piazza.

In [None]:
# python setup

% matplotlib inline

from piazza_api.rpc import PiazzaRPC
from datetime import datetime
import time

import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import scipy.stats as stats

## Piazza connection using API

# -DON'T FORGET TO FILL THESE IN-

In [None]:
#p = PiazzaRPC(classcode) # course
#p.user_login(username, password) # login

## Functions

In [None]:
def to_time(t): # convert to unix timestamp (in seconds)
    """Convert a UTC timestamp string to Unix epoch seconds.

    Parameters
    ----------
    t : str
        Timestamp formatted as ``YYYY-MM-DDTHH:MM:SSZ``.

    Returns
    -------
    float
        Seconds elapsed since 1970-01-01T00:00:00Z.

    Raises
    ------
    ValueError
        If *t* does not match the expected format.
    """
    fmt = '%Y-%m-%dT%H:%M:%SZ'
    # The trailing 'Z' marks the value as UTC. time.mktime() would interpret the
    # parsed fields as *local* time, shifting every result by the machine's UTC
    # offset and distorting differences that straddle a DST change, so the
    # offset from the epoch is computed directly in UTC instead.
    return (datetime.strptime(t, fmt) - datetime(1970, 1, 1)).total_seconds()

In [None]:
def to_hour(t):
    """Return the time of day of timestamp *t* as seconds since midnight.

    *t* must be a string formatted as ``YYYY-MM-DDTHH:MM:SSZ``; the date part
    is parsed but does not contribute to the result. The value is a float in
    the range [0, 86400).
    """
    stamp = datetime.strptime(t, '%Y-%m-%dT%H:%M:%SZ')
    from_hours = stamp.hour * 3600.
    from_minutes = stamp.minute * 60.
    return from_hours + from_minutes + stamp.second

In [None]:
def diff_time(t1, t2):
    """Return the absolute gap between *t1* and *t2*, truncated to an int."""
    gap = t1 - t2
    if gap < 0:
        gap = -gap  # order of the arguments does not matter
    return int(gap)

In [None]:
def get_timing(cid,tag = None,only_ct = True):
    """Fetch the creation (and optionally answer) timestamps of one question.

    Parameters
    ----------
    cid
        Post id handed to ``p.content_get``.
    tag
        Optional tag name, or iterable of tag names. When given, the post is
        kept only if it carries at least one of them.
    only_ct
        When True (the default) only the creation time is looked up and both
        answer slots stay at -1.

    Returns
    -------
    list or None
        ``[c_time, s_time, i_time]`` as produced by ``to_time``; ``s_time`` and
        ``i_time`` are -1 when no student / instructor answer was found (or
        when ``only_ct`` is True). ``None`` if the post is not a question, does
        not match ``tag``, or could not be fetched or parsed.
    """
    try:
        post = p.content_get(cid)
        if post["type"] != "question":
            return None # if not a question, ignore
        if tag is not None:
            # A bare string would otherwise be split into single characters.
            wanted = {tag} if isinstance(tag, str) else set(tag)
            # Reject the post when it shares NO tag with the request. (The
            # previous condition was negated and dropped exactly the posts
            # that did match.)
            if wanted.isdisjoint(post["tags"]):
                return None # if does not meet the requested tag, ignore
        c_time = to_time(post['created']) # time thread is created
        s_time = -1 # time thread is resolved by student, -1 if never
        i_time = -1 # time thread is resolved by instructor, -1 if never
        if not only_ct:
            for child in post["children"]:
                if child["type"] == "s_answer":
                    s_time = to_time(child['created'])
                elif child["type"] == "i_answer":
                    i_time = to_time(child['created'])
        return [c_time,s_time,i_time]
    except Exception:
        # Best effort: a missing or malformed post is skipped. Unlike a bare
        # ``except:``, this still lets Ctrl-C interrupt a long crawl.
        return None

In [None]:
def get_daily_timing(cid,tag = None):
    """Fetch the time of day at which one question post was created.

    Parameters
    ----------
    cid
        Post id handed to ``p.content_get``.
    tag
        Optional tag name, or iterable of tag names. When given, the post is
        kept only if it carries at least one of them.

    Returns
    -------
    list or None
        ``[c_time]`` where ``c_time`` is the ``to_hour`` value of the post's
        creation timestamp (seconds since midnight). ``None`` if the post is
        not a question, does not match ``tag``, or could not be fetched or
        parsed.
    """
    try:
        # Fetch once; the previous version issued up to three requests and then
        # referenced the undefined names ``only_ct`` and ``post``, so the
        # resulting NameError was swallowed and every call returned None.
        post = p.content_get(cid)
        if post["type"] != "question":
            return None # if not a question, ignore
        if tag is not None:
            wanted = {tag} if isinstance(tag, str) else set(tag)
            if wanted.isdisjoint(post["tags"]):
                return None # if does not meet the requested tag, ignore
        return [to_hour(post['created'])]
    except Exception:
        # Best effort: a missing or malformed post is skipped. Unlike a bare
        # ``except:``, this still lets Ctrl-C interrupt a long crawl.
        return None

In [None]:
def get_latency(cid,tag = None):
    """Fetch how long one question waited for student / instructor answers.

    Parameters
    ----------
    cid
        Post id handed to ``p.content_get``.
    tag
        Optional tag name, or iterable of tag names. When given, the post is
        kept only if it carries at least one of them.

    Returns
    -------
    list or None
        ``[c_time, s_time, i_time]``: ``c_time`` is the ``to_time`` value of
        the creation timestamp; ``s_time`` / ``i_time`` are the whole seconds
        (via ``diff_time``) between creation and the student / instructor
        answer, or -1 when there is none. If several children share a type,
        the last one listed wins. ``None`` if the post is not a question, does
        not match ``tag``, or could not be fetched or parsed.
    """
    try:
        # One request per post; the previous version fetched it up to four times.
        post = p.content_get(cid)
        if post["type"] != "question":
            return None # if not a question, ignore
        if tag is not None:
            wanted = {tag} if isinstance(tag, str) else set(tag)
            if wanted.isdisjoint(post["tags"]):
                return None # if does not meet the requested tag, ignore
        c_time = to_time(post['created']) # time thread is created
        s_time = -1 # latency of the student answer, -1 if never
        i_time = -1 # latency of the instructor answer, -1 if never
        for child in post["children"]:
            if child["type"] == "s_answer":
                s_time = diff_time(to_time(child['created']),c_time)
            elif child["type"] == "i_answer":
                i_time = diff_time(to_time(child['created']),c_time)
        return [c_time,s_time,i_time]
    except Exception:
        # Best effort: a missing or malformed post is skipped. Unlike a bare
        # ``except:``, this still lets Ctrl-C interrupt a long crawl.
        return None

## Post types

In [None]:
# Tag names grouped by kind of coursework; each list is passed as the `tag`
# filter when collecting post timings.
quiz = ["quiz%d" % n for n in range(1, 6)]
test = ["midterm_exam%d" % n for n in (1, 2)]
mp = [
    "pointers_gone_wild",
    "vector",
    "text_editor",
    "shell",
    "malloc",
    "password_cracker",
    "parallel_make",
    "mapreduce",
    "mp",
    "networking_nightmare",
]
lab = [
    "know_your_tools",
    "extreme_edge_cases",
    "utilities_unleased",
    "mini_valgrind",
    "ideal_indirection",
    "mad_mad_access_pattern",
    "finding_filesystems",
    "chatroom",
    "scheduler",
    "super_linux_kernel",
    "lab",
    "teaching_threads",
    "splendid_synchronization",
    "deadlocked_diners",
    "pied_piper",
]

## Get Stats of Quiz Posts

In [None]:
[idx_start,idx_end,tag] = [0,3300,quiz]

t_c_arr = []
t_last = 0

start_time = time.time()

for i in range(idx_start,idx_end):
    t_cur = get_timing(i,tag)
    if (t_cur != None):
        if (t_last == 0):
            t_last = t_cur[0]
        t_c_arr.append(t_cur[0]-t_last)
        t_last = t_cur[0]

print("--- %s seconds ---\n" % (time.time() - start_time))

In [None]:
arr = [x/3600. for x in t_c_arr]
print(*arr, sep=' ')

In [None]:
sum(arr)

In [None]:
fig, ax = plt.subplots(figsize=(11,8))
sns.distplot(t_c_arr, kde=False, rug=True, ax=ax, fit=stats.expon, label = "new post")
ax.set_xlabel('Time (seconds)', fontsize=14)
ax.set_ylabel('Number of post', fontsize=14)
ax.set_title('Kernel density estimation', fontsize=14)
plt.legend();

## Get Stats of Test Posts

In [None]:
[idx_start,idx_end,tag] = [0,3300,test]

t_c_arr = []
t_last = 0

start_time = time.time()

for i in range(idx_start,idx_end):
    t_cur = get_timing(i,tag)
    if (t_cur != None):
        if (t_last == 0):
            t_last = t_cur[0]
        t_c_arr.append(t_cur[0]-t_last)
        t_last = t_cur[0]

print("--- %s seconds ---\n" % (time.time() - start_time))

In [None]:
test_arr = [x/3600. for x in t_c_arr]
print(*test_arr, sep=' ')

## Get Stats of Lab Posts

In [None]:
[idx_start,idx_end,tag] = [0,3300,lab]

t_c_arr = []
t_last = 0

start_time = time.time()

for i in range(idx_start,idx_end):
    t_cur = get_timing(i,tag)
    if (t_cur != None):
        if (t_last == 0):
            t_last = t_cur[0]
        t_c_arr.append(t_cur[0]-t_last)
        t_last = t_cur[0]

print("--- %s seconds ---\n" % (time.time() - start_time))

In [None]:
lab_arr = [x/3600. for x in t_c_arr]
print(*lab_arr, sep=' ')

## Get Stats of MP Posts

In [None]:
[idx_start,idx_end,tag] = [0,3300,mp]

t_c_arr = []
t_last = 0

start_time = time.time()

for i in range(idx_start,idx_end):
    t_cur = get_timing(i,tag)
    if (t_cur != None):
        if (t_last == 0):
            t_last = t_cur[0]
        t_c_arr.append(t_cur[0]-t_last)
        t_last = t_cur[0]

print("--- %s seconds ---\n" % (time.time() - start_time))

In [None]:
mp_arr = [x/3600. for x in t_c_arr]
print(*mp_arr, sep=' ')

## Get Stats of All Posts

In [None]:
[idx_start,idx_end,tag] = [0,3300,None]

t_c_arr = []
t_last = 0

start_time = time.time()

for i in range(idx_start,idx_end):
    t_cur = get_timing(i,tag)
    if (t_cur != None):
        if (t_last == 0):
            t_last = t_cur[0]
        t_c_arr.append(t_cur[0]-t_last)
        t_last = t_cur[0]

print("--- %s seconds ---\n" % (time.time() - start_time))

In [None]:
arr = [x/3600. for x in t_c_arr]
print(*arr, sep=' ')

In [None]:
sum(arr)

In [None]:
len(arr)

## Instructor Answers

In [None]:
[idx_start,idx_end,tag] = [1,3200,None]

t_i_arr = []

start_time = time.time()

for i in range(idx_start,idx_end):
    t_cur = get_latency(i,tag)
    if (t_cur != None):
        if (t_cur[2] != -1):
            t_i_arr.append(t_cur[2])

print("--- %s seconds ---\n" % (time.time() - start_time))

In [None]:
fig, ax = plt.subplots(figsize=(11,8))
sns.distplot(sorted(t_i_arr)[0:-100], kde=False, rug=True, ax=ax, fit=stats.lognorm, label = "Instructor answers")
ax.set_xlim([-10,50000])
ax.set_xlabel('Time (seconds)', fontsize=14)
ax.set_ylabel('Density of answers', fontsize=14)
ax.set_title('Rug plot with histo and lognormal fit for answer arrival time', fontsize=14)
plt.legend();

In [None]:
print(sorted(t_i_arr))

## Fit, Validate Distribution, then create random variables

In [None]:
shape, loc, scale = stats.lognorm.fit(t_i_arr)
print(shape, loc, scale)

In [None]:
stats.kstest(t_i_arr,'lognorm',args=(shape, loc, scale))

In [None]:
len(t_i_arr)

In [None]:
1.36/np.sqrt(1444)

In [None]:
l_ti = [np.log(x) for x in t_i_arr]

In [None]:
fig, ax = plt.subplots(figsize=(11,8))
sns.distplot(sorted(l_ti)[0:-100], kde=True, rug=True, ax=ax, fit=stats.norm, label = "Instructor answers")
#ax.set_xlim([-10,1000])
ax.set_xlabel('Time', fontsize=14)
ax.set_ylabel('Density', fontsize=14)
ax.set_title('Kernel density estimation and normal fit for log(answer arrival time)', fontsize=14)
plt.legend();

In [None]:
i_a_rv = stats.lognorm.rvs(shape, loc, scale, 2500, random_state = 12345)

In [None]:
rv = [x/3600. for x in i_a_rv]
print(*rv, sep=' ')

## Daily distribution (post)

In [None]:
# Scan post ids in [idx_start, idx_end) for the time of day of each question.
idx_start, idx_end = 1, 400

t_h_arr = []  # creation time of day per question, as returned by get_daily_timing
t_last = 0

start_time = time.time()

for i in range(idx_start, idx_end):
    t_cur = get_daily_timing(i)
    if t_cur is None:
        continue  # not a question, or post unavailable
    t_h_arr.append(t_cur[0])

# wall-clock duration of the crawl
print("--- %s seconds ---\n" % (time.time() - start_time))

## Percentage of posts answered by students (and not by an instructor)

In [None]:
def get_percent_answered_by_student_and_not_inst(cid,tag = None):
    """Tell whether one question has a student answer but no instructor answer.

    Parameters
    ----------
    cid
        Post id handed to ``p.content_get``.
    tag
        Accepted for signature parity with the other collectors; currently
        not used for filtering.

    Returns
    -------
    int or None
        1 if the post has an ``s_answer`` child and no ``i_answer`` child,
        0 otherwise. ``None`` if the post is not a question or could not be
        fetched or parsed.
    """
    try:
        post = p.content_get(cid)
        if post["type"] != "question":
            return None # if not a question, ignore
        # The creation time is not needed for this indicator, so it is no
        # longer parsed.
        kinds = {child["type"] for child in post["children"]}
        return int("s_answer" in kinds and "i_answer" not in kinds)
    except Exception:
        # Best effort: a missing or malformed post is skipped. Unlike a bare
        # ``except:``, this still lets Ctrl-C interrupt a long crawl.
        return None

In [None]:
[idx_start,idx_end,tag] = [1,3200,None]

b_s_arr = []

start_time = time.time()

for i in range(idx_start,idx_end):
    t_cur = get_percent_answered_by_student_and_not_inst(i,tag)
    if (t_cur != None):
        if (t_cur != []):
            b_s_arr.append(t_cur)

print("--- %s seconds ---\n" % (time.time() - start_time))

In [None]:
sum(b_s_arr)/len(b_s_arr)

## Student Answers

In [None]:
[idx_start,idx_end,tag] = [1,3200,None]

t_i_arr = []

start_time = time.time()

for i in range(idx_start,idx_end):
    t_cur = get_latency(i,tag)
    if (t_cur != None):
        if (t_cur[1] != -1):
            t_i_arr.append(t_cur[1])

print("--- %s seconds ---\n" % (time.time() - start_time))

In [None]:
fig, ax = plt.subplots(figsize=(11,8))
sns.distplot(sorted(t_i_arr)[0:-100], kde=False, rug=True, ax=ax, fit=stats.lognorm, label = "Student answers")
#ax.set_xlim([-10,1000])
ax.set_xlabel('Time (seconds)', fontsize=14)
ax.set_ylabel('Density of answers', fontsize=14)
ax.set_title('Rug plot with histo and lognormal fit for answer arrival time', fontsize=14)
plt.legend();

## Fit, Validate Distribution, then create random variables

In [None]:
shape, loc, scale = stats.lognorm.fit(t_i_arr)
print(shape, loc, scale)

In [None]:
stats.kstest(t_i_arr,'lognorm',args=(shape, loc, scale))

In [None]:
len(t_i_arr)

In [None]:
1.36/np.sqrt(1102)

In [None]:
s_a_rv = stats.lognorm.rvs(shape, loc, scale, 2500, random_state = 12345)

In [None]:
rv = [x/3600. for x in i_a_rv]
print(*rv, sep=' ')

In [None]:
shape, loc, scale = stats.lognorm.fit(rv)
print(shape, loc, scale)