# Mushroom to TopKWY

This notebook converts the Mushroom dataset, as provided by the UCI Machine Learning Repository, together with the train/test split used in E2, into the input format expected by the original TopKWY implementation.

In [165]:
import pandas as pd

In [166]:
# Load the raw data file. It has no header row (header=None), so pandas
# labels the columns with integer positions starting at 0.
df = pd.read_csv('../data/agaricus-lepiota.data', header=None)
# Bare expression: displays the frame as the cell output.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [167]:
# Human-readable attribute names, indexed by the integer column label of the
# raw data frame. Entry 0 names the class-label column; entries 1..22 name
# the feature columns and are used to build the item descriptions.
col_names = [
    "poisonous",
    "cap-shape",
    "cap-surface",
    "cap-color",
    "bruises",
    "odor",
    "gill-attachment",
    "gill-spacing",
    "gill-size",
    "gill-color",
    "stalk-shape",
    "stalk-root",
    "stalk-surface-above-ring",
    "stalk-surface-below-ring",
    "stalk-color-above-ring",
    "stalk-color-below-ring",
    "veil-type",
    "veil-color",
    "ring-number",
    "ring-type",
    "spore-print-color",
    "population",
    "habitat",
]

In [168]:
# Column 0 holds the class label: keep it as its own Series and leave only
# the feature columns (labels 1..22) in the frame.
cls = df[0]
df = df.drop(columns=0)

In [169]:
# Sanity check: distinct raw values of the first feature column, listed in
# order of first appearance.
df[1].unique()

array(['x', 'b', 's', 'f', 'k', 'c'], dtype=object)

In [170]:
# Binarise the class labels: rows whose label equals `target` become 1,
# every other row becomes 0.
# NOTE: this rebinds `cls` in place, so re-running the cell on the already
# binarised Series would yield all zeros.
target = "e"
cls = cls.eq(target).astype("int64")

In [171]:
# Re-encode every raw string value as a positive integer item id.
# Ids are handed out column by column, in order of first appearance within
# each column, and numbering continues across columns, so each
# (column, value) pair receives its own distinct id.
diff_values = 0        # running count of ids assigned so far
categories = {}        # column label -> {raw value: item id}
translation_dict = {}  # item id -> "<attribute name> = <raw value>"
for col in df.columns:
    # First id of this column follows directly after the last id handed out.
    mapping = {
        raw: item_id
        for item_id, raw in enumerate(df[col].unique(), start=diff_values + 1)
    }
    for raw, item_id in mapping.items():
        translation_dict[item_id] = f"{col_names[col]} = {raw}"
    categories[col] = mapping
    # Swap the raw category labels for their integer ids.
    df[col] = df[col].astype('category').cat.rename_categories(mapping)
    diff_values += len(mapping)
# Bare expression: displays the encoded frame as the cell output.
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,13,14,15,16,17,18,19,20,21,22
0,1,7,11,21,23,32,34,36,38,50,...,61,65,74,83,84,88,91,96,105,111
1,1,7,12,21,24,32,34,37,38,50,...,61,65,74,83,84,88,91,97,106,112
2,2,7,13,21,25,32,34,37,39,50,...,61,65,74,83,84,88,91,97,106,113
3,1,8,13,21,23,32,34,36,39,50,...,61,65,74,83,84,88,91,96,105,111
4,1,7,14,22,26,32,35,37,38,51,...,61,65,74,83,84,88,92,97,107,112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,5,7,11,22,26,33,34,37,48,50,...,61,71,81,83,86,88,91,104,110,117
8120,1,7,11,22,26,33,34,37,48,50,...,61,71,81,83,85,88,91,104,108,117
8121,4,7,11,22,26,33,34,37,39,50,...,61,71,81,83,86,88,91,104,110,117
8122,5,8,11,22,29,32,34,36,46,51,...,64,65,74,83,84,88,92,100,108,117


In [145]:
# Display the running id counter, i.e. how many integer item ids were
# assigned by the encoding loop.
diff_values

117

In [146]:
# Write the full dataset: one 0/1 label per line, and one space-separated
# row of item ids per line. Index and header are omitted so the files
# contain data rows only.
cls.to_csv("./mushroom.labels", header=False, index=False)
df.to_csv("./mushroom.trans", sep=" ", header=False, index=False)

In [147]:
# Load the train/test split. These files are read with a header row, so the
# column labels arrive as strings; cast them to int so they can be used as
# keys into `categories` and match the integer labels of the full frame.
df_train = pd.read_csv("../data/agaricus-lepiota-train.csv")
df_test = pd.read_csv("../data/agaricus-lepiota-test.csv")
for split_frame in (df_train, df_test):
    split_frame.columns = split_frame.columns.astype(int)

In [148]:
# Separate the class column (label 0) from each split and binarise it the
# same way as the full dataset: 1 where the label equals `target`, else 0.
train_cls = df_train[0].eq(target).astype("int64")
test_cls = df_test[0].eq(target).astype("int64")
# Keep only the feature columns in the frames.
df_train = df_train.drop(columns=0)
df_test = df_test.drop(columns=0)

In [149]:
# Apply the value -> item id mapping built from the full dataset to both
# splits, so the same raw value gets the same id everywhere.
# NOTE(review): per the pandas docs, rename_categories with a dict leaves
# categories that are missing from the mapping unchanged, so a raw value
# unseen in the full dataset would stay a string here - confirm this cannot
# happen for these splits.
for col in df_train.columns:
    mapping = categories[col]
    for frame in (df_train, df_test):
        frame[col] = frame[col].astype('category').cat.rename_categories(mapping)

In [152]:
# Write each split in the same layout as the full dataset: space-separated
# item ids per row in *.trans, one 0/1 label per line in *.labels, with no
# index or header.
df_train.to_csv("./mushroom.train.trans", sep=" ", header=False, index=False)
train_cls.to_csv("./mushroom.train.labels", header=False, index=False)
df_test.to_csv("./mushroom.test.trans", sep=" ", header=False, index=False)
test_cls.to_csv("./mushroom.test.labels", header=False, index=False)

In [155]:
# Write one five-line spec file per split:
#   1. name of the transactions file
#   2. diff_values + 1 (presumably an exclusive upper bound on the item ids;
#      TODO confirm against the TopKWY spec reader)
#   3. number of feature columns
#   4. number of rows
#   5. name of the labels file
with open("./mushroom.train.spec", "w") as f:
    f.writelines([
        "mushroom.train.trans\n",
        f"{diff_values+1}\n",
        f"{df_train.shape[1]}\n",
        f"{df_train.shape[0]}\n",
        "mushroom.train.labels\n",
    ])

with open("./mushroom.test.spec", "w") as f:
    f.writelines([
        "mushroom.test.trans\n",
        f"{diff_values+1}\n",
        f"{df_test.shape[1]}\n",
        f"{df_test.shape[0]}\n",
        "mushroom.test.labels\n",
    ])

In [174]:
# Persist the item id -> "<attribute name> = <raw value>" lookup as the
# textual form of the Python dict, so ids in results can be translated back.
with open("./translation_dict.txt", "w") as f:
    f.write(repr(translation_dict))