# Import Pandas and Rename the Columns 

In [None]:
import pandas as pd

# The TSV file has no header row: the labels the old code renamed
# ('heart-c', 'GradientBoostingClassifier', a parameter string and a score)
# are in fact the first data record. Supplying the column names up front
# keeps that record in the data instead of consuming it as a header.
benchmark_data = pd.read_csv('sklearn-benchmark-data.tsv.gz', sep='\t',
                             names=['Dataset_Name', 'Method_Name',
                                    'Parameters', 'Test_Score'])

# Preview the first rows (must be the last expression of the cell to display).
benchmark_data.head()

#benchmark_data

# Preliminary Analysis

In [None]:
# Show summary statistics for the benchmark table as a first sanity check.
benchmark_data.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Box plot of Test_Score grouped by the raw Parameters string.
# The trailing semicolon suppresses the textual repr of the returned axes; it
# has to sit on the same line as the call, because a ';' on a line of its own
# is not a valid Python statement.
benchmark_data.boxplot('Test_Score', by='Parameters');

# List all the methods so as to divide the dataset into multiple dataframes


In [None]:
# Collect the distinct dataset names and method names as plain Python lists.
names_list, methods_list = (
    benchmark_data[column].unique().tolist()
    for column in ('Dataset_Name', 'Method_Name')
)


In [None]:
# Number of distinct methods found in the Method_Name column.
len(methods_list)

In [None]:
# Build one sub-frame per method: the dictionary maps each method name to the
# rows of benchmark_data whose Method_Name equals that name.
methods_list = benchmark_data['Method_Name'].unique().tolist()
MethodWiseData = {
    name: benchmark_data.loc[benchmark_data['Method_Name'] == name]
    for name in methods_list
}
    

# Make a folder to save the file

In [116]:
# Create the output folder (if needed) and save one method's rows to it.
import os

# exist_ok=True makes this a single, race-free call that is safe to re-run.
os.makedirs('HPCC_Benchmark_Results', exist_ok=True)

# NOTE: to_pickle writes a pickle, not tab-separated text, even though the
# file name ends in ".tsv.gz"; it therefore has to be read with read_pickle.
MethodWiseData['LogisticRegression'].to_pickle('HPCC_Benchmark_Results/LogisticRegression.tsv.gz')


In [117]:
# Read the saved file back into a DataFrame.
# QUESTION: Why does it not work with csv.read
# ANSWER: the file was written with DataFrame.to_pickle, so despite its
# ".tsv.gz" name it contains a pickle rather than tab-separated text; a CSV
# reader cannot parse it and read_pickle has to be used instead.
Method_Type = pd.read_pickle('HPCC_Benchmark_Results/LogisticRegression.tsv.gz')

Note: The cells in this notebook process a single method (`LogisticRegression`). To run the code for another method, change the method name in the section above accordingly. In the cells below, the number of parameter columns and the parameter names must also be adjusted to match that method, so that the data cleaning can be carried out for every method.

# Split the Parameters column (every method has a different number of parameters)

In [118]:
# Break each comma-separated parameter string into its pieces and load the
# pieces into a two-column frame (one column per parameter of this method).
param_tokens = Method_Type['Parameters'].str.split(',').tolist()
Param_Split = pd.DataFrame(param_tokens, columns=['Param1', 'Param2'])
#Param_Split

In [119]:
# Drop the raw Parameters column from the method frame. The axis is passed by
# keyword because the positional form drop(label, 1) is rejected by current
# pandas releases.
Method_Type1 = Method_Type.drop('Parameters', axis=1)

# Take the row labels of the split-parameter frame. Index.get_values() has
# been removed from pandas; .values returns the same array of labels.
index = Param_Split.index.values

# Give the method frame the same row labels so the two frames align row by
# row when they are concatenated.
Method_Type2 = Method_Type1.set_index(index)

# Append the split parameter columns to obtain the desired wide format.
result = pd.concat([Method_Type2, Param_Split], axis=1)
#result

In [120]:
# Turn each "name=value" parameter column into a column that holds only the
# value and is named after the parameter it stores.
#
# This cell is written for a method with two parameters (C and penalty). For a
# method with more parameters, add one data_splitN per extra ParamN column,
# drop its 'Param_Name' column, and include it in the drop list and the
# concat call below.
data_split1 = pd.DataFrame(result.Param1.str.split('=').tolist(),
                           columns=['Param_Name', 'C'])
data_split2 = pd.DataFrame(result.Param2.str.split('=').tolist(),
                           columns=['Param_Name', 'penalty'])

# Keep only the value part of every split. The axis is passed by keyword
# because the positional form drop(label, 1) is rejected by current pandas.
data_split1 = data_split1.drop('Param_Name', axis=1)
data_split2 = data_split2.drop('Param_Name', axis=1)

# Remove the unsplit "name=value" columns from the method frame.
method_data1 = result.drop(['Param1', 'Param2'], axis=1)

# Align the method frame with the row labels of the value frames before
# concatenating. Index.get_values() has been removed from pandas; .values
# returns the same array of labels.
idx = data_split1.index.values
method_data2 = method_data1.set_index(idx)

# Final layout: the remaining columns of `result` followed by one column per
# parameter.
cleaned_data = pd.concat([method_data2, data_split1, data_split2], axis=1)

# The parameter values come out of str.split as strings. Cast the numeric
# ones before doing arithmetic on them, e.g.:
#     cleaned_data['C'] = cleaned_data['C'].astype(float)

# Save the cleaned data for this method, with its parameters organized in columns


In [121]:
import os

# Create the output folder if it does not exist yet; exist_ok=True makes this
# a single, race-free call that is safe to re-run.
os.makedirs('Cleaned_Method_Wise_Data', exist_ok=True)

# NOTE: to_pickle writes a pickle, not tab-separated text, even though the
# file name ends in ".tsv.gz"; it therefore has to be read with read_pickle.
cleaned_data.to_pickle('Cleaned_Method_Wise_Data/LogisticRegression_cleaned.tsv.gz')

In [122]:
# Read the cleaned data back. The file was written with to_pickle, so it is
# loaded with read_pickle despite its ".tsv.gz" name.
Cleaned_Data = pd.read_pickle('Cleaned_Method_Wise_Data/LogisticRegression_cleaned.tsv.gz')
# Display the frame as the cell output.
# NOTE(review): consider Cleaned_Data.head() to keep the output short.
Cleaned_Data

Unnamed: 0,Dataset_Name,Method_Name,Test_Score,C,penalty
0,labor,LogisticRegression,0.733333,0.1,l1
1,labor,LogisticRegression,0.800000,0.1,l1
2,labor,LogisticRegression,0.800000,0.1,l1
3,labor,LogisticRegression,0.866667,0.1,l1
4,labor,LogisticRegression,0.733333,0.1,l1
5,labor,LogisticRegression,0.800000,0.1,l1
6,labor,LogisticRegression,0.866667,0.1,l1
7,labor,LogisticRegression,0.800000,0.1,l1
8,labor,LogisticRegression,0.800000,0.1,l1
9,labor,LogisticRegression,1.000000,0.1,l1
