In [1]:
import pandas as pd
from hashlib import sha1

In [2]:
import os
from glob import glob

In [3]:
def compute_hash(email):
    return sha1(email.lower().encode('utf-8')).hexdigest()

In [4]:
# Name of the data folder, which sits one level above the working directory.
DATA_DIR = 'data'

# Build the folder paths with os.path.join so the separator matches the host
# OS (hard-coded backslashes only resolve on Windows). The empty last
# component keeps a trailing separator, because other cells build file paths
# by appending a file name with `+`.
RAW_DATA = os.path.join('..', DATA_DIR, 'raw', '')
STATS_DATA = os.path.join('..', DATA_DIR, 'statistics', '')

In [5]:
# One cleaned frame per input file is collected here; other cells append to
# the same list before everything is concatenated.
dfs = []

# Form question headers -> short column names. Both capitalisations of the
# e-mail header are listed so either variant ends up as `email`.
mapping = {
    'Email address': 'email',
    'Email Address': 'email',
    'How much time (in hours) did you spend on watching lectures and reading?': 'time_lectures',
    'How much time (in hours) did you spend on homework?': 'time_homework',
}

# sorted() makes the processing order (and so the row order of the combined
# output) reproducible; glob() on its own returns matches in arbitrary order.
for f in sorted(glob(RAW_DATA + 'homework-*.csv')):
    # Skip files whose name contains 'result'. Only the file name is tested,
    # so a 'result' substring in a parent folder cannot exclude every file.
    if 'result' in os.path.basename(f):
        continue

    # .copy() gives an independent frame, so the column assignments below do
    # not operate on a slice of the frame returned by read_csv/rename.
    df = pd.read_csv(f).rename(columns=mapping)[['email', 'time_lectures', 'time_homework']].copy()

    # Tag every row with the source file name minus its 4-character extension.
    df['what'] = os.path.basename(f[:-4])

    # A missing e-mail would reach compute_hash as NaN and fail on .lower(),
    # so rows without an e-mail are dropped before hashing.
    df = df.dropna(subset=['email'])

    # Normalise, keep only the last row per e-mail, then replace the address
    # with the hash of the salted value.
    df['email'] = df['email'].str.lower().str.strip()
    df = df.drop_duplicates(subset=['email'], keep='last').reset_index(drop=True)
    df['email'] = (df['email'] + '_salt').apply(compute_hash)

    dfs.append(df)

In [6]:
# Form question headers -> short column names for the project submission files.
mapping = {
    'Email address': 'email',
    'How much time (in hours) did you spend working on this project?': 'time_homework',
}

# NOTE: this cell appends to the existing `dfs` list without clearing it, so
# running it more than once adds the same frames again.
# sorted() makes the processing order reproducible; glob() on its own returns
# matches in arbitrary order.
for f in sorted(glob(RAW_DATA + 'project-*-submissions.csv')):
    # .copy() gives an independent frame, so the column assignments below do
    # not operate on a slice of the frame returned by read_csv/rename.
    df = pd.read_csv(f).rename(columns=mapping)[['email', 'time_homework']].copy()

    # Tag every row with the source file name minus its 4-character extension.
    df['what'] = os.path.basename(f[:-4])

    # A missing e-mail would reach compute_hash as NaN and fail on .lower(),
    # so rows without an e-mail are dropped before hashing.
    df = df.dropna(subset=['email'])

    # Normalise, keep only the last row per e-mail, then replace the address
    # with the hash of the salted value.
    df['email'] = df['email'].str.lower().str.strip()
    df = df.drop_duplicates(subset=['email'], keep='last').reset_index(drop=True)
    df['email'] = (df['email'] + '_salt').apply(compute_hash)

    dfs.append(df)

In [7]:
# Form question headers -> short column names for the project evaluation files.
# The evaluation time is stored under `time_homework`, the same column name
# the other cells use for their time value.
mapping = {
    'Email address': 'email',
    'How much time (in hours) did you spend evaluating this project?': 'time_homework',
}

# NOTE: this cell appends to the existing `dfs` list without clearing it, so
# running it more than once adds the same frames again.
# sorted() makes the processing order reproducible; glob() on its own returns
# matches in arbitrary order.
for f in sorted(glob(RAW_DATA + 'project-*-eval.csv')):
    # .copy() gives an independent frame, so the column assignments below do
    # not operate on a slice of the frame returned by read_csv/rename.
    df = pd.read_csv(f).rename(columns=mapping)[['email', 'time_homework']].copy()

    # Tag every row with the source file name minus its 4-character extension.
    df['what'] = os.path.basename(f[:-4])

    # A missing e-mail would reach compute_hash as NaN and fail on .lower(),
    # so rows without an e-mail are dropped before hashing.
    df = df.dropna(subset=['email'])

    # Normalise, keep only the last row per e-mail, then replace the address
    # with the hash of the salted value.
    df['email'] = df['email'].str.lower().str.strip()
    df = df.drop_duplicates(subset=['email'], keep='last').reset_index(drop=True)
    df['email'] = (df['email'] + '_salt').apply(compute_hash)

    dfs.append(df)

In [8]:
# Stack the per-file frames into a single table. Each per-file frame carries
# its own 0-based index, so ignore_index=True is used to give the combined
# frame one continuous index instead of repeated labels.
df = pd.concat(dfs, ignore_index=True)

In [9]:
# Create the output folder, including any missing parent folders. With
# exist_ok=True this is a no-op when the folder already exists, so the cell
# can be re-run safely and needs no separate existence check.
os.makedirs(STATS_DATA, exist_ok=True)

In [10]:
# Write the combined table to time_spent.csv in the statistics folder,
# leaving the DataFrame index out of the file.
df.to_csv(
    path_or_buf=STATS_DATA + 'time_spent.csv',
    index=False,
)

In [11]:
# Shell escape: list the statistics folder to confirm the output file is there.
# NOTE(review): the backslash-separated path and `ls` depend on the host shell — confirm before running elsewhere.
!ls ..\data\statistics

time_spent.csv
