In [4]:
import pandas as pd
import numpy as np
import dateparser
from matplotlib import pyplot as plt

In [5]:
# Load the raw survey answers from the CSV file into odi_data.
odi_data = pd.read_csv('ODI-2018.csv')
# Work on an independent copy so the cleaning steps that write to odi_clean
# leave the raw frame in odi_data unchanged.
odi_clean = odi_data.copy()

In [6]:
# Preview the first rows of the raw data.
odi_data.head()

Unnamed: 0,Timestamp,What programme are you in?,Have you taken a course on machine learning?,Have you taken a course on information retrieval?,Have you taken a course on statistics?,Have you taken a course on databases?,What is your gender?,Chocolate makes you.....,When is your birthday (date)?,Number of neighbors sitting around you?,Did you stand up?,"You can get £100 if you win a local DM competition, or we don’t hold any competitions and I give everyone some money (not the same amount!). How much do you think you would get then?",Give a random number,Time you went to be Yesterday,What makes a good day for you (1)?,What makes a good day for you (2)?
0,4/5/2018 11:22:56,Duisenberg Quantitative Risk Management,no,0,mu,nee,male,neither,10/12/1994,49000,no,0.05,7,1,Productive,Good sleep
1,4/5/2018 11:23:04,Computer Science,no,0,unknown,ja,male,I have no idea what you are talking about,06-08-1993,100,yes,The formmer,394749,1 a m,got a 8.5,got 2 8.5s
2,4/5/2018 11:23:06,Business Analytics,yes,1,sigma,ja,male,neither,25 december 92,5,no,Not enough,6,23,Food,Sport
3,4/5/2018 11:23:50,BA,yes,1,mu,ja,male,I have no idea what you are talking about,01-02-1995,2,no,0,8,0.3,-,-
4,4/5/2018 11:23:59,Master Computer Science: Big Data Engineering,no,0,sigma,ja,male,I have no idea what you are talking about,09.01.1994,6,no,0,8,0:00,sleep,beer


In [7]:
# List the column labels (the survey questions) of the raw data.
odi_data.columns

Index(['Timestamp', 'What programme are you in?',
       'Have you taken a course on machine learning?',
       'Have you taken a course on information retrieval?',
       'Have you taken a course on statistics?',
       'Have you taken a course on databases?', 'What is your gender?',
       'Chocolate makes you.....', 'When is your birthday (date)?',
       'Number of neighbors sitting around you?', 'Did you stand up?',
       'You can get £100 if you win a local DM competition, or we don’t hold any competitions and I give everyone some money (not the same amount!). How much do you think you would get then? ',
       'Give a random number', 'Time you went to be Yesterday',
       'What makes a good day for you (1)?',
       'What makes a good day for you (2)?'],
      dtype='object')

In [8]:
# (number of rows, number of columns) of the raw data.
odi_data.shape

(217, 16)

In [9]:
# Boolean frame of the same shape as the raw data: True where a value is missing.
odi_data.isnull()

Unnamed: 0,Timestamp,What programme are you in?,Have you taken a course on machine learning?,Have you taken a course on information retrieval?,Have you taken a course on statistics?,Have you taken a course on databases?,What is your gender?,Chocolate makes you.....,When is your birthday (date)?,Number of neighbors sitting around you?,Did you stand up?,"You can get £100 if you win a local DM competition, or we don’t hold any competitions and I give everyone some money (not the same amount!). How much do you think you would get then?",Give a random number,Time you went to be Yesterday,What makes a good day for you (1)?,What makes a good day for you (2)?
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [10]:
# Remove the first response from the working copy.
# The frame is rebuilt from the raw data instead of being dropped in place, so
# re-running this cell always yields "raw data minus its first row" rather than
# removing one more respondent on every execution. Re-running it also discards
# any cleaning already applied to odi_clean, so the cells below must be re-run.
odi_clean = odi_data.drop(odi_data.index[0])

In [11]:
# Per column: True if the working copy has at least one missing value in it.
odi_clean.isnull().any()

Timestamp                                                                                                                                                                                  False
What programme are you in?                                                                                                                                                                 False
Have you taken a course on machine learning?                                                                                                                                               False
Have you taken a course on information retrieval?                                                                                                                                          False
Have you taken a course on statistics?                                                                                                                                                     False
Have you taken a course on database

In [12]:
# Data type pandas assigned to each column of the raw data.
odi_data.dtypes

Timestamp                                                                                                                                                                                  object
What programme are you in?                                                                                                                                                                 object
Have you taken a course on machine learning?                                                                                                                                               object
Have you taken a course on information retrieval?                                                                                                                                          object
Have you taken a course on statistics?                                                                                                                                                     object
Have you taken a course on dat

In [13]:
# Canonical programme labels, each followed by the (case-sensitive) text
# fragments that identify it in a free-text answer. The groups and the
# fragments inside each group are listed in the order in which they must end
# up in programme_dict.
_PROGRAMME_PATTERNS = (
    ('Duisenberg Honor Programme', ('Duisenberg', 'DHP')),
    ('SBI', ('Science, Business',)),
    ('CS', ('Computer Science', 'CS', 'Big', 'cs')),
    ('BA', ('Business Analytics', 'Business analytics', 'business analytics', 'BA')),
    ('Drug Discovery and Safety', ('Drug',)),
    ('CLS', ('Computational', 'Como', 'CLS', 'CSL', 'Cls')),
    ('EOR', ('trics', 'EOR', 'OR')),
    ('Bioinformatics', ('Bioinfor', 'bioinformatics')),
    ('AI', ('AI', 'Artificial', 'A. I.', 'Ai')),
    ('Exchange', ('Exchange',)),
    ('QRM', ('QRM', 'Quantitative')),
    ('Human Movement Science', ('Human Movement',)),
    ('PhD student', ('PhD',)),
    ('unknown', ('Data Mining', '21-')),
)

# Flat lookup {text fragment: canonical label}; insertion order follows the
# listing above.
programme_dict = {
    fragment: label
    for label, fragments in _PROGRAMME_PATTERNS
    for fragment in fragments
}

In [14]:
# Collapse the free-text programme answers (second column) into unique classes.
def _normalise_programme(answer):
    """Return the class label for one programme answer.

    The fragments of programme_dict are tried in dictionary order. Whenever a
    fragment occurs in the current text, the text is replaced by that
    fragment's label, and the remaining fragments are then tested against the
    replaced text. An answer that matches no fragment is returned unchanged.
    """
    current = answer
    for fragment, label in programme_dict.items():
        if fragment in current:
            current = label
    return current


odi_clean.iloc[:,1] = odi_clean.iloc[:,1].apply(_normalise_programme)

In [15]:
# chocolate (eighth column)
def _chocolate_class(answer):
    """Shorten any answer containing 'no idea' to 'ignorant'; keep all others."""
    if 'no idea' in answer:
        return 'ignorant'
    return answer


odi_clean.iloc[:,7] = odi_clean.iloc[:,7].apply(_chocolate_class)

In [16]:
# neighbors
# Answers that are not numbers, are negative, or are greater than 8 are treated
# as missing values (represented by -1).
def _clean_neighbor_count(count):
    """Return the count as an int if it is a finite number in [0, 8], else -1.

    Negative counts are rejected as well: a respondent cannot have fewer than
    zero neighbours, and letting them through would mix real answers with the
    -1 missing-value marker.
    """
    if np.isfinite(count) and 0 <= count <= 8:
        return int(count)
    return -1


neighbors_col = odi_clean.columns[9]
# errors='coerce' turns every answer that is not a number into NaN.
neighbors_numeric = pd.to_numeric(odi_clean.iloc[:,9], errors='coerce')
# Assign by column label so the column is replaced as a whole and takes the
# integer dtype of the cleaned values. Writing through odi_clean.iloc[:, 9] = ...
# sets the values in place on recent pandas versions and leaves the column
# with its original object dtype.
odi_clean[neighbors_col] = neighbors_numeric.apply(_clean_neighbor_count)
odi_clean.iloc[:,9]

1     -1
2      5
3      2
4      6
5      4
6      5
7      5
8      2
9      7
10     8
11     6
12     8
13     6
14     2
15     1
16     1
17     3
18     2
19     1
20     7
21     2
22     1
23     5
24     8
25     8
26     2
27     2
28     5
29     1
30     7
      ..
187    0
188    3
189    6
190    3
191    3
192    5
193    1
194    4
195    5
196    4
197   -1
198    7
199    7
200    3
201    1
202    1
203    1
204    5
205    2
206   -1
207    7
208    1
209    1
210    6
211    2
212    3
213    7
214    8
215    5
216    3
Name: Number of neighbors sitting around you?, Length: 216, dtype: int64

In [17]:
# random number
# Answers that are not numbers or are greater than 10 are treated as missing
# values (represented by -1).
def _clean_random_number(number):
    """Return the number as an int if it is finite and at most 10, else -1.

    NOTE(review): negative answers are kept as they are (there is no lower
    bound), so an answer of -1 cannot be told apart from the missing-value
    marker - confirm whether a lower bound is wanted.
    """
    if np.isfinite(number) and number <= 10:
        return int(number)
    return -1


random_number_col = odi_clean.columns[12]
# errors='coerce' turns every answer that is not a number into NaN.
random_numeric = pd.to_numeric(odi_clean.iloc[:,12], errors='coerce')
# Assign by column label so the column is replaced as a whole and takes the
# integer dtype of the cleaned values. Writing through odi_clean.iloc[:, 12] = ...
# sets the values in place on recent pandas versions and leaves the column
# with its original object dtype.
odi_clean[random_number_col] = random_numeric.apply(_clean_random_number)
odi_clean.iloc[:,12]

1      -1
2       6
3       8
4       8
5      -1
6      -1
7       3
8      -1
9       5
10      8
11     -1
12     -1
13     -1
14     -1
15      7
16      7
17      6
18     -1
19      7
20     -1
21      2
22      2
23      6
24      7
25      4
26     -1
27      9
28      4
29      3
30     -1
       ..
187     3
188     4
189     9
190    10
191     1
192     9
193     2
194    -1
195    -1
196     9
197     8
198     7
199     3
200    -1
201     4
202    -1
203     7
204     6
205     4
206     7
207     7
208     3
209     5
210     9
211    -1
212     2
213    -1
214     2
215    10
216     6
Name: Give a random number, Length: 216, dtype: int64

In [18]:
# Split the free-text birthday and bedtime answers into numeric parts and
# encode the yes/no style course questions as integers. -1 marks a value that
# is missing or could not be parsed.
SURVEY_YEAR = 2018
MISSING = -1

birthdays = odi_clean.iloc[:,8]
# Read the bedtimes from odi_clean too. Taking them from odi_data and looking
# them up by loop position paired every birthday with the bedtime of the
# previous row, because odi_clean no longer contains the first row of odi_data.
bedtimes = odi_clean.iloc[:,13]


def _parse_answer(text):
    """Return dateparser's get_date_data() result for one free-text answer.

    NOTE(review): a new DateDataParser is created for every answer, as before.
    A shared instance may remember previously detected locales and parse
    ambiguous dates differently - confirm before hoisting it out.
    """
    return dateparser.date.DateDataParser().get_date_data(text)


bdays_formatted = [_parse_answer(bday) for bday in birthdays]
bedtimes_formatted = [_parse_answer(bedtime) for bedtime in bedtimes]

years = []
months = []
days = []
hours = []
minutes = []
for bday, bedtime in zip(bdays_formatted, bedtimes_formatted):
    birth_date = bday.get('date_obj')
    if birth_date is None:
        years.append(MISSING)
        months.append(MISSING)
        days.append(MISSING)
    else:
        # Nobody answering in SURVEY_YEAR was born in that year or later. Such
        # a year presumably comes from the parser filling in a year the
        # respondent left out (TODO confirm), so only month and day are kept.
        # Using >= instead of == keeps the check valid when the notebook is
        # re-run in a later year.
        years.append(MISSING if birth_date.year >= SURVEY_YEAR else birth_date.year)
        months.append(birth_date.month)
        days.append(birth_date.day)

    bed_time = bedtime.get('date_obj')
    if bed_time is None:
        hours.append(MISSING)
        minutes.append(MISSING)
    else:
        hours.append(bed_time.hour)
        minutes.append(bed_time.minute)

odi_clean['Year'] = years
odi_clean['Month'] = months
odi_clean['Days'] = days
odi_clean['Hours'] = hours
odi_clean['Minutes'] = minutes

# Integer codes for the course questions, keyed by column position.
# NOTE(review): column 3 only has 'unknown' recoded; its other answers are
# left exactly as read from the CSV.
answer_codes = {
    2: {'yes': 1, 'no': 0, 'unknown': -1},
    3: {'unknown': -1},
    4: {'sigma': 0, 'mu': 1, 'unknown': -1},
    5: {'ja': 1, 'nee': 0, 'unknown': -1},
}
for position, codes in answer_codes.items():
    column = odi_clean.columns[position]
    # Assign the result back to the frame. odi_clean.iloc[:, k] hands out a
    # separate Series, so replace(..., inplace=True) on it is not guaranteed
    # to change odi_clean itself.
    odi_clean[column] = odi_clean[column].replace(codes)