## Dataset creation

In [1]:
def read_names(filename1, filename2):
    """Read first names and surnames from two UTF-8 text files.

    Every line of each file becomes one entry.  All carriage-return and
    line-feed characters are removed from each entry, then both lists are
    shuffled in place.

    Args:
        filename1: Path of the file containing the first names.
        filename2: Path of the file containing the surnames.

    Returns:
        A ``(names, surnames)`` tuple of shuffled lists of strings.
    """
    # The context manager guarantees both files are closed, even when
    # reading fails.  codecs.open is kept so decoding and line splitting
    # behave exactly as before.
    with codecs.open(filename1, 'r', 'utf-8') as file1, \
            codecs.open(filename2, 'r', 'utf-8') as file2:
        names = [
            line.replace("\r", "").replace("\n", "")
            for line in file1.readlines()
        ]
        surnames = [
            line.replace("\r", "").replace("\n", "")
            for line in file2.readlines()
        ]
    # Shuffle names first, then surnames: the order determines how the
    # seeded random stream is consumed.
    shuffle(names)
    shuffle(surnames)
    return names, surnames

In [2]:
def create_students(filename1, filename2, students=20000):
    """Create data/students.csv with randomly generated student e-mails.

    Does nothing when data/students.csv already exists.

    Args:
        filename1: Path of the first-names file, passed to ``read_names``.
        filename2: Path of the surnames file, passed to ``read_names``.
        students: Number of e-mail addresses to generate.
    """
    output_path = "data/students.csv"
    if exists(output_path):
        return
    names, surnames = read_names(filename1, filename2)
    generated = []
    for _ in range(students):
        # "<name>.<surname>", lower-cased, without apostrophes or spaces
        local_part = choice(names) + '.' + choice(surnames)
        local_part = local_part.lower().replace("'", "").replace(" ", "")
        generated.append(local_part + '@studium.unict.it')
    # single-column table, written without the index column
    pd.DataFrame({"E-mail": generated}).to_csv(output_path, index=False)

In [3]:
def create_mapping_table():
    """Create data/mapping_table.csv with the pairs (E-mail, Pseudonym).

    Reads data/students.csv and adds a "Pseudonym" column holding the
    SHA-256 hex digest of the value in the first column of each row.
    Does nothing when data/mapping_table.csv already exists.
    """
    output_path = "data/mapping_table.csv"
    if exists(output_path):
        return
    table = pd.read_csv("data/students.csv")
    # the e-mail address is the first column of the students file
    table["Pseudonym"] = [
        hashlib.sha256(email.encode()).hexdigest()
        for email in table.iloc[:, 0]
    ]
    table.to_csv(output_path, index=False)

In [4]:
def create_is(students=50):
    """Create data/is_projects_results.csv for a random group of students.

    Picks ``students`` random rows from data/mapping_table.csv, drops the
    "E-mail" column and adds a "Passed" column (0 = not passed, 1 = passed)
    drawn with probabilities 0.35 and 0.65 respectively.
    Does nothing when the output file already exists.

    Args:
        students: Number of students to include.
    """
    output_path = "data/is_projects_results.csv"
    if exists(output_path):
        return
    mapping = pd.read_csv("data/mapping_table.csv")
    # sample first, then draw the outcomes
    selected = mapping.sample(n=students)
    results = selected.drop(columns="E-mail")
    # 0 = not passed ; 1 = passed
    outcomes = choice([0, 1], size=students, p=[0.35, 0.65])
    results = results.assign(Passed=outcomes)
    results.to_csv(output_path, index=False)

In [5]:
def create_ersu():
    """Create data/ersu_results.csv, the ERSU ranking list.

    Takes every row of data/mapping_table.csv, drops the "E-mail" column
    and adds a "Holder" column (0 = not holder, 1 = holder) drawn with
    probabilities 0.25 and 0.75 respectively.
    Does nothing when the output file already exists.
    """
    output_path = "data/ersu_results.csv"
    if exists(output_path):
        return
    ranking = pd.read_csv("data/mapping_table.csv").drop(columns="E-mail")
    # 0 = not holder ; 1 = holder
    holder_flags = choice([0, 1], size=len(ranking), p=[0.25, 0.75])
    ranking = ranking.assign(Holder=holder_flags)
    ranking.to_csv(output_path, index=False)

In [6]:
def create_almalaurea():
    """Create data/almalaurea_report.csv, the AlmaLaurea dataset.

    Takes every row of data/mapping_table.csv, drops the "E-mail" column
    and adds an "Off-site" column (0 = not off-site, 1 = off-site) drawn
    with probabilities 0.6 and 0.4 respectively.
    Does nothing when the output file already exists.
    """
    output_path = "data/almalaurea_report.csv"
    if exists(output_path):
        return
    report = pd.read_csv("data/mapping_table.csv").drop(columns="E-mail")
    # 0 = not off-site ; 1 = off-site
    off_site_flags = choice([0, 1], size=len(report), p=[0.6, 0.4])
    # the column name contains a hyphen, so it is passed as a dict key
    report = report.assign(**{"Off-site": off_site_flags})
    report.to_csv(output_path, index=False)

In [7]:
def create_msoutlook():
    """Create data/msoutlook_report.csv, the MS Outlook dataset.

    Takes every row of data/mapping_table.csv, drops the "E-mail" column
    and adds a "Sex" column holding 'M' or 'F', each drawn with
    probability 0.5.
    Does nothing when the output file already exists.
    """
    output_path = "data/msoutlook_report.csv"
    if exists(output_path):
        return
    report = pd.read_csv("data/mapping_table.csv").drop(columns="E-mail")
    sex_values = choice(['M', 'F'], size=len(report), p=[0.5, 0.5])
    report = report.assign(Sex=sex_values)
    report.to_csv(output_path, index=False)

In [8]:
import pandas as pd
import hashlib
from numpy.random import choice, seed, shuffle
from os.path import exists
import codecs

# Seed NumPy's global random generator so that the shuffles and random
# choices made by the dataset functions are repeatable across runs.
seed(11)

# Create the students dataset (data/students.csv) from the first-name and
# surname lists. create_students returns immediately when that file already
# exists, so delete it first to regenerate the dataset.
create_students('data/nomi_italiani.txt', 'data/cognomi_italiani.txt')

In [9]:
# Create the mapping table (E-mail, Pseudonym). It must be built first:
# every dataset below is derived from data/mapping_table.csv.
# Each call below is a no-op when its output file already exists.
create_mapping_table()
# Create the IS students dataset with their project results
create_is()
# Create the ERSU dataset
create_ersu()
# Create the AlmaLaurea dataset
create_almalaurea()
# Create the MS Outlook dataset
create_msoutlook()