In [1]:
# Make every top-level expression in a cell display its value (not only the last one),
# so that intermediate results are shown in the output as well
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Auto-Cleaning Dirty Data: the Data Encoding Bot

[2IMM00] Seminar Data Mining
<br>
Angelo Majoor - 1030843
<br>
A.R.Majoor@student.tue.nl

Supervisor: dr. ir. J. (Joaquin) Vanschoren

Eindhoven University of Technology
<br>
Department of Mathematics and Computer Science
<br>
Data Mining Research Group

In [40]:
# Import all relevant libraries
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

import openml as oml
# SECURITY NOTE(review): this API key is stored in plain text, so everyone who can read the
# notebook can use it. Consider revoking it and loading the key from an environment variable
# or the local OpenML configuration file instead of hard-coding it here.
oml.config.apikey = '2d4efc0fbf4c75a890be14297c5ec1e4'
import sklearn

### The three different steps

As stated in the report, the process of creating the data encoding bot consists of three different steps:

 - Auto-detecting data types per feature (column)
 - Auto-detecting numeric, ordinal, categorical (integer) features
 - Auto-selecting encoding techniques for all features

## Set-up

In [3]:
# Retrieve the OpenML100 benchmark suite and gather the ids of all its tasks
benchmark_suite = oml.study.get_study('OpenML100','tasks')
list_task_ids = list(benchmark_suite.tasks)

# Echo every task id so the contents of the suite are visible in the cell output
for task_id in list_task_ids:
    print(task_id)

# Download task 3 and peek at its first five samples (feature row, then label)
task = oml.tasks.get_task(3)
X, y = task.get_X_and_y()
for i in range(5):
    print(X[i])
    print(y[i])

3
6
11
12
14
15
16
18
20
21
22
23
24
28
29
31
32
36
37
41
43
45
49
53
58
219
2074
2079
3021
3022
3481
3485
3492
3493
3494
3510
3512
3543
3549
3560
3561
3567
3573
3889
3891
3896
3899
3902
3903
3904
3913
3917
3918
3946
3948
3954
7592
9914
9946
9950
9952
9954
9955
9956
9957
9960
9964
9967
9968
9970
9971
9976
9977
9978
9979
9980
9981
9983
9985
9986
10093
10101
14964
14965
14966
14967
14968
14969
14970
34536
34537
34538
34539
125920
125921
125922
125923
146195
146606
146607
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.
  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  0.  0.  0.]
0
[ 1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.
  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  0.  0.  0.]
0
[ 1.  1.  1.  1.  0.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.
  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  0.  0.  0.]
0
[ 1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  0.
  1.  

## Step 1: Auto-Detecting Data Types

In [160]:
# Read both raw training sets from the working directory
raw_energy = pd.read_csv("energy_train_copy.csv")
raw_weather = pd.read_csv("weather_train_copy.csv")

# Show the data types pandas assigned to the energy columns while parsing
raw_energy.dtypes

#--------------------------------------------------

#raw_data = pd.read_csv("ENTER_YOUR_CSV_FILE_NAME_HERE")

#--------------------------------------------------

def _looksLikeDate(text):
    """Return True if `text` has '-' or '/' where DD-MM-YYYY or YYYY-MM-DD put their separators."""
    separators = ("-", "/")
    # Slices (instead of plain indexing) yield '' for short strings, which is never a separator
    dayFirst = text[2:3] in separators and text[5:6] in separators
    yearFirst = text[4:5] in separators and text[7:8] in separators
    return dayFirst or yearFirst


def _classifyValue(value):
    """Return the vote ("date", "float64", "int64" or "string") cast by one non-missing value."""
    if isinstance(value, str):
        text = value.strip()
        if _looksLikeDate(text):
            return "date"
        # Try the strictest numeric parse first: every integer string is also a valid float string
        try:
            int(text)
            return "int64"
        except ValueError:
            pass
        try:
            float(text)
            return "float64"
        except ValueError:
            return "string"

    # Non-string objects (e.g. numbers left in a mixed-type column)
    if isinstance(value, float):
        return "float64"
    try:
        int(value)
    except (TypeError, ValueError, OverflowError):
        return "string"
    return "float64" if "." in str(value) else "int64"


def autoInferObjects(raw_data_feature):
    """Infer the data type of a single feature that pandas could not type itself.

    Up to 100 randomly chosen non-missing values each vote for one of the types
    date, float64, int64 or string, and the type with the most votes wins (ties are
    resolved in that order). A feature with exactly two distinct non-missing values
    is reported as boolean without voting.

    Input:  one feature as a pandas Series (a one-column DataFrame or a plain
            sequence is accepted as well).
    Output: the inferred data type as a string: "bool", "date", "float64",
            "int64" or "string". A feature without any non-missing value is
            reported as "string".

    Side effect: prints the vote counts for the four voted types.
    """
    types = ["date","float64","int64","string"] #Data types
    weights = [0,0,0,0] # Weights corresponding to the data types
    numberOfIndices = 100 # Number of different values to check in a feature

    if isinstance(raw_data_feature, pd.DataFrame):
        feature = raw_data_feature.iloc[:, 0]
    elif isinstance(raw_data_feature, pd.Series):
        feature = raw_data_feature
    else:
        feature = pd.Series(raw_data_feature, dtype=object)

    # Missing values carry no type information, so they must not vote
    feature = feature.dropna()
    featureLength = len(feature) #Number of usable rows in the feature

    # Positions (not index labels) of the values that get a vote
    randomIndices = random.sample(range(featureLength), min(numberOfIndices, featureLength))

    # If the feature only contains two different unique values, then infer it as boolean
    if feature.nunique() == 2:
        dataType = "bool"
    else:
        for i in randomIndices:
            # .iloc is positional, so this also works when the index is not 0..n-1
            weights[types.index(_classifyValue(feature.iloc[i]))] += 1

        if any(weights):
            dataType = types[weights.index(max(weights))]
        else:
            # Nothing could vote (empty or all-missing feature): use the most generic type
            dataType = "string"

    print ("Date: {}, Float64: {}, Int64: {}, String: {}".format(weights[0],weights[1],weights[2],weights[3]))
    return dataType


def autoDetectDataTypes(raw_data):
    """Infer a data type label for every feature (column) of a raw data set.

    Columns that pandas already typed are mapped directly: boolean columns to
    "bool", integer columns of any width to "int64", floating point columns to
    "float64" and datetime columns to "date". Every other column (object/string
    columns in particular) is handed to autoInferObjects for value-based inference.

    Input:  pandas DataFrame, e.g. created directly from the raw data with pd.read_csv.
    Output: list of data type labels, one per column, in column order.
    """
    result = []

    for column in raw_data:
        dtype = raw_data.dtypes[column]
        if pd.api.types.is_bool_dtype(dtype):
            result.append("bool")
        elif pd.api.types.is_integer_dtype(dtype):
            # Integer features are refined further in step 2
            result.append("int64")
        elif pd.api.types.is_float_dtype(dtype):
            # Floating point features need no special treatment
            result.append("float64")
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            result.append("date")
        else:
            # Auto-infer in step 1
            result.append(autoInferObjects(raw_data[column]))

    return result

# Run the detection on the energy data; the returned list holds one type label per column
autoDetectDataTypes(raw_energy)

#s = raw_energy['solar_production']
#s

#autoInferObjects(s)

  interactivity=interactivity, compiler=compiler, result=result)


date                       object
time                       object
seconds                     int64
total_consumption         float64
total_production          float64
solar_production           object
total_solar_production    float64
dtype: object

Date: 100, Float64: 0, Int64: 0, String: 0
Date: 0, Float64: 0, Int64: 0, String: 100
Date: 0, Float64: 0, Int64: 76, String: 24


['date', 'string', 'int64', 'float64', 'float64', 'int64', 'float64']