In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import dateparser

In [2]:
# Read the raw survey export once and keep it untouched; all cleaning below is
# done on an independent (deep) copy so the original answers stay available.
odi_data = pd.read_csv(filepath_or_buffer='ODI-2018.csv')
odi_clean = odi_data.copy(deep=True)


In [3]:
# Peek at the first five raw responses to see how free-form the answers are.
odi_data.head(n=5)


Unnamed: 0,Timestamp,What programme are you in?,Have you taken a course on machine learning?,Have you taken a course on information retrieval?,Have you taken a course on statistics?,Have you taken a course on databases?,What is your gender?,Chocolate makes you.....,When is your birthday (date)?,Number of neighbors sitting around you?,Did you stand up?,"You can get £100 if you win a local DM competition, or we don’t hold any competitions and I give everyone some money (not the same amount!). How much do you think you would get then?",Give a random number,Time you went to be Yesterday,What makes a good day for you (1)?,What makes a good day for you (2)?
0,4/5/2018 11:22:56,Duisenberg Quantitative Risk Management,no,0,mu,nee,male,neither,10/12/1994,49000,no,0.05,7,1,Productive,Good sleep
1,4/5/2018 11:23:04,Computer Science,no,0,unknown,ja,male,I have no idea what you are talking about,06-08-1993,100,yes,The formmer,394749,1 a m,got a 8.5,got 2 8.5s
2,4/5/2018 11:23:06,Business Analytics,yes,1,sigma,ja,male,neither,25 december 92,5,no,Not enough,6,23,Food,Sport
3,4/5/2018 11:23:50,BA,yes,1,mu,ja,male,I have no idea what you are talking about,01-02-1995,2,no,0,8,0.3,-,-
4,4/5/2018 11:23:59,Master Computer Science: Big Data Engineering,no,0,sigma,ja,male,I have no idea what you are talking about,09.01.1994,6,no,0,8,0:00,sleep,beer


In [4]:
# Show the raw column labels (the full survey questions); the cleaning cells
# below address these columns by position rather than by these long names.
odi_data.columns


Index([u'Timestamp', u'What programme are you in?',
       u'Have you taken a course on machine learning?',
       u'Have you taken a course on information retrieval?',
       u'Have you taken a course on statistics?',
       u'Have you taken a course on databases?', u'What is your gender?',
       u'Chocolate makes you.....', u'When is your birthday (date)?',
       u'Number of neighbors sitting around you?', u'Did you stand up?',
       u'You can get £100 if you win a local DM competition, or we don’t hold any competitions and I give everyone some money (not the same amount!). How much do you think you would get then? ',
       u'Give a random number', u'Time you went to be Yesterday',
       u'What makes a good day for you (1)?',
       u'What makes a good day for you (2)?'],
      dtype='object')

In [5]:
# (number of responses, number of questions) in the raw export.
odi_data.shape


(217, 16)

In [6]:
# Boolean mask of missing cells in the raw data (True where an answer is absent).
odi_data.isna()


Unnamed: 0,Timestamp,What programme are you in?,Have you taken a course on machine learning?,Have you taken a course on information retrieval?,Have you taken a course on statistics?,Have you taken a course on databases?,What is your gender?,Chocolate makes you.....,When is your birthday (date)?,Number of neighbors sitting around you?,Did you stand up?,"You can get £100 if you win a local DM competition, or we don’t hold any competitions and I give everyone some money (not the same amount!). How much do you think you would get then?",Give a random number,Time you went to be Yesterday,What makes a good day for you (1)?,What makes a good day for you (2)?
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [8]:
# Text fragments that identify a study programme in the free-text answers,
# grouped by the canonical label they are mapped to. The order of the groups and
# of the fragments inside each group is the insertion order of programme_dict.
_programme_fragments = [
    ('Duisenberg Honor Programme', ['Duisenberg', 'DHP']),
    ('SBI', ['Science, Business']),
    ('CS', ['Computer Science', 'CS', 'Big', 'cs']),
    ('BA', ['Business Analytics', 'Business analytics', 'business analytics', 'BA']),
    ('Drug Discovery and Safety', ['Drug']),
    ('CLS', ['Computational', 'Como', 'CLS', 'CSL', 'Cls']),
    ('EOR', ['trics', 'EOR', 'OR']),
    ('Bioinformatics', ['Bioinfor', 'bioinformatics']),
    ('AI', ['AI', 'Artificial', 'A. I.', 'Ai']),
    ('Exchange', ['Exchange']),
    ('QRM', ['QRM', 'Quantitative']),
    ('Human Movement Science', ['Human Movement']),
    ('PhD student', ['PhD']),
    ('unknown', ['Data Mining', '21-']),
]

# fragment -> canonical programme label
programme_dict = {
    fragment: label
    for label, fragments in _programme_fragments
    for fragment in fragments
}


In [9]:
# Introduce unique classes for the different programmes.
#
# Each raw answer is matched exactly once, against its original text, and the
# longest matching fragment wins. Applying the fragments one after another to the
# already-replaced column made the result depend on dict iteration order: in
# insertion order 'cs' is tried before 'Business Analytics', 'trics' and
# 'Bioinfor', so "Business Analytics", "Econometrics" and "Bioinformatics" would
# all end up labelled 'CS'.
# NOTE(review): with longest-match, an answer containing both 'Duisenberg' and
# 'Quantitative' is labelled 'QRM' -- confirm this is the intended class.
_fragments_longest_first = sorted(programme_dict, key=len, reverse=True)


def _canonical_programme(answer):
    """Return the canonical label for one raw programme answer.

    The answer is returned unchanged when it is missing (NaN/None) or when no
    fragment from programme_dict occurs in it.
    """
    if pd.isnull(answer):
        return answer
    for fragment in _fragments_longest_first:
        if fragment in answer:
            return programme_dict[fragment]
    return answer


odi_clean.iloc[:, 1] = odi_clean.iloc[:, 1].map(_canonical_programme)



In [10]:
# Chocolate: collapse every answer that contains 'no idea' into the single
# category 'ignorant'; all other answers are kept verbatim.
odi_clean.iloc[:, 7] = [
    'ignorant' if 'no idea' in answer else answer
    for answer in odi_clean.iloc[:, 7]
]



In [11]:
# Neighbours: keep only plausible counts.
# Text answers, counts above 8 and negative counts are all treated as missing
# (represented by -1). The lower bound matters: without it a negative answer
# would be kept as if it were a valid count, and a literal -1 could not be told
# apart from the missing marker.
neighbours = pd.to_numeric(odi_clean.iloc[:, 9], errors='coerce')
# between() is inclusive on both ends and False for NaN, so unparseable answers
# fall through to -1; astype(int) truncates fractional answers like int() did.
odi_clean.iloc[:, 9] = neighbours.where(neighbours.between(0, 8), -1).astype(int)



In [12]:
# Random number: text answers and numbers greater than 10 count as missing
# values (represented by -1); everything else is truncated to an integer.
def _random_or_missing(value):
    """Return int(value) for finite values <= 10, otherwise the marker -1."""
    if np.isfinite(value) and value <= 10:
        return int(value)
    return -1


random_numeric = pd.to_numeric(odi_clean.iloc[:, 12], errors='coerce')
odi_clean.iloc[:, 12] = random_numeric.apply(_random_or_missing)


In [14]:
# Split the free-text birthday and bedtime answers into numeric parts
# (year/month/day and hour/minute). Anything that cannot be parsed is stored
# as -1.

# Year the survey was taken (the Timestamp values shown above are from 2018).
# A parsed birth year at or after this year cannot be a real birth year.
# NOTE(review): dateparser appears to fill in a missing year from today's date,
# so comparing against a fixed `== 2018` only worked while the notebook was run
# in 2018 -- confirm against the dateparser documentation.
SURVEY_YEAR = 2018


def _parse_date(answer):
    """Return the datetime dateparser extracts from `answer`, or None.

    None is returned both for missing answers (NaN/None), which are not passed
    to dateparser at all, and for text dateparser cannot interpret.
    """
    if pd.isnull(answer):
        return None
    # A fresh parser per answer, as in the original cell.
    # NOTE(review): a shared parser instance would be faster but might carry
    # language-detection state from one answer to the next -- verify first.
    # Subscript access is used instead of .get() so the result works whether it
    # is a plain dict or a dict-like result object.
    return dateparser.date.DateDataParser().get_date_data(answer)['date_obj']


birthdays = odi_clean.iloc[:, 8]
bedtimes = odi_data.iloc[:, 13]

years, months, days = [], [], []
for birthday in birthdays:
    born = _parse_date(birthday)
    if born is None:
        years.append(-1)
        months.append(-1)
        days.append(-1)
        continue
    # Month and day are kept even when the year is rejected.
    years.append(born.year if born.year < SURVEY_YEAR else -1)
    months.append(born.month)
    days.append(born.day)

# Iterate over the values directly: no reliance on the row labels being 0..n-1.
hours, minutes = [], []
for bedtime in bedtimes:
    went_to_bed = _parse_date(bedtime)
    if went_to_bed is None:
        hours.append(-1)
        minutes.append(-1)
    else:
        hours.append(went_to_bed.hour)
        minutes.append(went_to_bed.minute)

odi_clean['Year'] = years
odi_clean['Month'] = months
odi_clean['Days'] = days
odi_clean['Hours'] = hours
odi_clean['Minutes'] = minutes

# Recode the categorical answers as integers, with -1 for 'unknown'.
# Keys are column positions in odi_clean; values map raw answer -> code.
_recodings = {
    2: {'yes': 1, 'no': 0, 'unknown': -1},
    3: {'unknown': -1},
    4: {'sigma': 0, 'mu': 1, 'unknown': -1},
    5: {'ja': 1, 'nee': 0, 'unknown': -1},
    10: {'yes': 1, 'no': 0, 'unknown': -1},
}
for position, codes in _recodings.items():
    # Assign the result back instead of calling replace(..., inplace=True) on
    # odi_clean.iloc[:, position]: that selection is a temporary Series, and with
    # pandas Copy-on-Write an in-place change to it never reaches odi_clean.
    odi_clean.iloc[:, position] = odi_clean.iloc[:, position].replace(codes)

In [15]:
# Money: strip currency words/symbols, accept ',' as decimal separator, and
# store the amount as an integer (-1 when no number can be recovered).

# Regular expressions removed from the answer, applied in this order. The last
# one is a raw string so that '\)' is an escaped parenthesis for the regex
# engine rather than an invalid Python string escape.
values = [' pond', ' pound', ' euros', 'euro', 'c', r'€ ?:\)']

# astype(str) lets the string methods run on every cell, including missing or
# numeric ones (a missing cell becomes 'nan', which to_numeric turns into NaN).
money = odi_clean.iloc[:, 11].astype(str).str.lstrip('£$').str.rstrip('£')
for val in values:
    # regex=True is stated explicitly: pandas 2.0 changed the default to
    # literal matching, which silently disabled the '€ ?:\)' pattern.
    money = money.str.replace(val, '', regex=True)
money = money.str.replace(',', '.', regex=False)
money = pd.to_numeric(money, errors='coerce')
# int() truncates toward zero (e.g. 0.05 -> 0); NaN and infinities become -1.
odi_clean.iloc[:, 11] = money.apply(lambda x: int(x) if np.isfinite(x) else -1)

In [16]:
# Replace the long survey questions by short column names (the last five are the
# derived date/time parts), then drop the two raw free-text columns.
odi_clean.columns = [
    'timestamp', 'programme', 'ML', 'IR', 'stat', 'DB', 'gender', 'chocolate',
    'BD', 'neighbours', 'stand', 'money', 'random', 'bed_time',
    'good(1)', 'good(2)',
    'y_birth', 'm_birth', 'd_birth', 'h_bed', 'm_bed',
]
odi_clean.drop(['BD', 'bed_time'], axis=1, inplace=True)

In [18]:
# Inspect the last five cleaned rows as a sanity check of the recoding.
odi_clean.tail(n=5)

Unnamed: 0,timestamp,programme,ML,IR,stat,DB,gender,chocolate,neighbours,stand,money,random,good(1),good(2),y_birth,m_birth,d_birth,h_bed,m_bed
212,4/5/2018 11:38:01,CS,0,0,1,0,male,fat,3,0,2,2,Freedom,Money,1991,11,22,2,0
213,4/5/2018 12:10:39,AI,0,0,-1,-1,female,neither,7,-1,0,-1,Sun,Activities,1993,5,20,0,0
214,4/5/2018 13:15:17,AI,1,1,1,0,male,fat,8,0,11,2,The foresight of a night out with friends,The taste of that first sip of cold beer on a ...,1992,7,30,4,0
215,4/5/2018 15:09:53,CS,0,1,1,0,male,neither,5,0,-1,10,Good company,Well rested and structured,1991,4,12,1,30
216,4/5/2018 18:04:54,BA,1,1,1,1,male,fat,3,0,80,6,Sleep,Relax,1994,7,10,1,0


In [20]:
# Persist the cleaned data. The row index is written too (index=True is the
# default), so it appears as the first, unnamed column of the output file.
odi_clean.to_csv(path_or_buf='ODI-2018_clean.csv', index=True)