In [20]:
import os
from hashlib import sha1
from pathlib import Path

import pandas as pd

In [21]:
def compute_hash(email):
    """Return the hexadecimal SHA-1 digest of `email`.

    The string is lower-cased and UTF-8 encoded before hashing, so inputs
    that differ only in letter case produce the same digest.
    """
    normalized = email.lower()
    digest = sha1(normalized.encode('utf-8'))
    return digest.hexdigest()

In [67]:
# Survey-form headers -> short canonical column names (applied with
# DataFrame.rename before any column is selected).
mapping = {
    # Three different header wordings all map to 'email'.
    **dict.fromkeys(
        [
            'Email Address',
            'Email address',
            'Your email (the same you used for signing up)',
        ],
        'email',
    ),
    'How much time (in hours) did you spend on watching lectures and reading?': 'time_lectures',
    'How much time (in hours) did you spend on homework?': 'time_homework',
    'How much time (in hours) did you spend working on this project?': 'time_project',
    'How much time (in hours) did you spend evaluating this project?': 'time_evaluate',
    'Hash for the project you evaluate': 'hash_other',
}

In [68]:
# Course editions to process; each name is used as a sub-folder of `prefix`
# (input) and of `data_out` (output).
courses = [
    "de-zoomcamp-2022",
    "mlops-zoomcamp-2022",
    "ml-zoomcamp-2023",
    "de-zoomcamp-2023",
    "mlops-zoomcamp-2023",
    "ml-zoomcamp-2022",
]

In [69]:
# Root of the checkout that holds the raw per-course survey exports
# (<prefix>/<course>/data/raw/*.csv).  The location is machine-specific, so
# it can be overridden with the ZOOMCAMP_SCORING_DIR environment variable;
# when the variable is not set the original hard-coded path is used.
prefix = Path(os.environ.get("ZOOMCAMP_SCORING_DIR", "C:/Users/alexe/git/zoomcamp-scoring/"))

# Anonymised CSVs are written below ./data/<course>/, relative to the
# notebook's working directory.
data_out = Path('.') / 'data'

In [85]:
# Canonical (post-rename) column names that are kept next to the email.
# A file containing none of them is skipped by the processing loop.
columns = {
    "time_lectures",
    "time_homework",
    "time_project",
    "time_evaluate",
    "hash_other",
}

In [86]:
# For every course, read each raw survey CSV, keep only the email plus the
# known metric columns, de-duplicate the answers, replace the email with a
# salted SHA-1 digest and write the result to data_out/<course>/<file name>.
for course in courses:
    data_folder = prefix / course / 'data' / 'raw'
    # glob() yields paths in arbitrary order; sort so files are processed
    # (and the "no data" messages printed) in the same order on every run.
    files = sorted(data_folder.glob('*.csv'))

    course_data_out = data_out / course
    course_data_out.mkdir(parents=True, exist_ok=True)

    for f in files:
        df = pd.read_csv(f)

        # Normalise the form headers to the short canonical names.
        df = df.rename(columns=mapping)

        # Keep the wanted columns in the order they appear in the file.
        # Iterating a set intersection instead would make the column order
        # of the written CSV depend on string hash randomisation, i.e. it
        # could differ from one kernel session to the next.
        existing_columns = [c for c in df.columns.unique() if c in columns]

        if len(existing_columns) == 0:
            print(f'no data for {f}. columns: {df.columns}')
            continue

        df = (
            df[["email"] + existing_columns]
            # A missing email is read as NaN, which cannot be hashed
            # (compute_hash would raise on a float), so such rows are dropped.
            .dropna(subset=["email"])
            # Normalise before de-duplicating so "A@b.c " and "a@b.c"
            # count as the same respondent.
            .assign(email=lambda d: d["email"].str.lower().str.strip())
        )

        if 'eval' in f.name:
            # Files with 'eval' in the name: keep the last answer per
            # (email, evaluated project) pair, then drop the project hash
            # from the output.
            df = df.drop_duplicates(subset=['email', 'hash_other'], keep='last')
            df = df.drop(columns=['hash_other'])
        else:
            # All other files: keep only the last answer per email.
            df = df.drop_duplicates(subset=['email'], keep='last')

        df = df.reset_index(drop=True)

        # Replace the email with the digest of "<email>_salt".
        df['email'] = (df['email'] + '_salt').apply(compute_hash)

        f_out = course_data_out / f.name
        df.to_csv(f_out, index=False)

no data for C:\Users\alexe\git\zoomcamp-scoring\mlops-zoomcamp-2022\data\raw\raw-leaderboard.csv. columns: Index(['Timestamp', 'email', 'Your name', 'LinkedIn', 'Github',
       'Link to your project', 'Other links',
       'Anything else you'd like to add to your record on the leaderboard?'],
      dtype='object')
no data for C:\Users\alexe\git\zoomcamp-scoring\de-zoomcamp-2023\data\raw\raw-leaderboard.csv. columns: Index(['Timestamp', 'email', 'Your name', 'LinkedIn', 'Github',
       'Link to your project', 'Other links',
       'Anything else you'd like to add to your record on the leaderboard?'],
      dtype='object')
no data for C:\Users\alexe\git\zoomcamp-scoring\ml-zoomcamp-2022\data\raw\competition.csv. columns: Index(['Timestamp', 'email', 'Team name on the leaderboard', 'Score',
       'Learning in public links',
       'I confirm that I filled in the form correctly'],
      dtype='object')
no data for C:\Users\alexe\git\zoomcamp-scoring\ml-zoomcamp-2022\data\raw\raw-leaderb