In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
df_train = pd.read_csv('train.csv', index_col='id')
df_test = pd.read_csv('test.csv', index_col='id')

In [3]:
import warnings

warnings.filterwarnings('ignore')

In [4]:
df_train.head()

Unnamed: 0_level_0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,e,8.8,f,s,u,f,a,c,w,4.51,...,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,...,,y,o,,,t,z,,d,w
2,e,6.94,f,s,b,f,x,c,w,6.85,...,,s,n,,,f,f,,l,w
3,e,3.88,f,y,g,f,s,,g,4.16,...,,,w,,,f,f,,d,u
4,e,5.85,x,l,w,f,d,,w,3.37,...,,,w,,,f,f,,g,a


In [5]:
print(df_train.isnull().sum())

#percent of missing cells
(df_train.isnull().sum().sum()/(np.prod(df_train.shape)))*100

class                         0
cap-diameter                  4
cap-shape                    40
cap-surface              671023
cap-color                    12
does-bruise-or-bleed          8
gill-attachment          523936
gill-spacing            1258435
gill-color                   57
stem-height                   0
stem-width                    0
stem-root               2757023
stem-surface            1980861
stem-color                   38
veil-type               2957493
veil-color              2740947
has-ring                     24
ring-type                128880
spore-print-color       2849682
habitat                      45
season                        0
dtype: int64


24.24307256288571

In [6]:
def cleaning(df):
    threshold = 101
    cat_feat = ["cap-shape", "cap-surface", "cap-color", "does-bruise-or-bleed", "gill-attachment",
                "gill-spacing", "gill-color", "stem-root", "stem-surface", "stem-color", 'veil-type', 
                'veil-color', "has-ring", "ring-type", "spore-print-color", "habitat", "season"]
    
    for i in cat_feat:
        if df[i].dtype.name == 'category':
            if 'missing' not in df[i].cat.categories:
                df[i] = df[i].cat.add_categories('missing')
            if 'noise' not in df[i].cat.categories:
                df[i] = df[i].cat.add_categories('noise')
        else:
            df[i] = df[i].astype('category')
            df[i] = df[i].cat.add_categories(['missing', 'noise'])
        
        df[i] = df[i].fillna('missing')
        
        count = df[i].value_counts(dropna=False)
        less_freq = count[count < threshold].index
        
        df[i] = df[i].apply(lambda x: 'noise' if x in less_freq else x)
    
    return df

In [7]:
#hotencoding

import csv
import math
#math.isnan
"""Data colums that need hot encoding
id,class,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-root,stem-surface,
stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
"""
train_DataFrame = pd.read_csv('train.csv', index_col= 'id')
train_DataFrame = cleaning(train_DataFrame)
#these should be all the categorical columns that need to be encoded, add any additional ones I missed if you find any
encode_columns = train_DataFrame[['class', 'cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment',
                                  'gill-spacing','gill-color','stem-root','stem-surface', 'stem-color','veil-type', 
                                  'veil-color','has-ring','ring-type','spore-print-color','habitat','season']]


#rest of code in next code cell


In [8]:
encode_columns.isnull().sum()
encode_columns.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3116945 entries, 0 to 3116944
Data columns (total 18 columns):
 #   Column                Dtype 
---  ------                ----- 
 0   class                 object
 1   cap-shape             object
 2   cap-surface           object
 3   cap-color             object
 4   does-bruise-or-bleed  object
 5   gill-attachment       object
 6   gill-spacing          object
 7   gill-color            object
 8   stem-root             object
 9   stem-surface          object
 10  stem-color            object
 11  veil-type             object
 12  veil-color            object
 13  has-ring              object
 14  ring-type             object
 15  spore-print-color     object
 16  habitat               object
 17  season                object
dtypes: object(18)
memory usage: 451.8+ MB


In [9]:
train_DataFrame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3116945 entries, 0 to 3116944
Data columns (total 21 columns):
 #   Column                Dtype  
---  ------                -----  
 0   class                 object 
 1   cap-diameter          float64
 2   cap-shape             object 
 3   cap-surface           object 
 4   cap-color             object 
 5   does-bruise-or-bleed  object 
 6   gill-attachment       object 
 7   gill-spacing          object 
 8   gill-color            object 
 9   stem-height           float64
 10  stem-width            float64
 11  stem-root             object 
 12  stem-surface          object 
 13  stem-color            object 
 14  veil-type             object 
 15  veil-color            object 
 16  has-ring              object 
 17  ring-type             object 
 18  spore-print-color     object 
 19  habitat               object 
 20  season                object 
dtypes: float64(3), object(18)
memory usage: 523.2+ MB


from sklearn.preprocessing import MultiLabelBinarizer

#one hot encoded

#dictionary has column key mapped to a dictionary that contains a "classes" key mapped to the different classes in the respective column
#and it has and "encoded" key mapped to the encoding of the column 
encoded_Cols = {}
print(encode_columns.columns.tolist())
for column in encode_columns.columns.tolist():
    print(column)
    mlb = MultiLabelBinarizer()
    encode = mlb.fit_transform(encode_columns[column])
    encoded_Cols[column] = {'classes': mlb.classes_,
                              'encoded' : encode}

In [10]:
#multi hot encoded

columns_multiEncoded = {}
for column in encode_columns.columns.tolist():
    map = {}
    categoricalVals = encode_columns[column].unique()
    for index in range(len(categoricalVals)):
        map[categoricalVals[index]] = index
    
    print(map)
    columns_multiEncoded[column] = map



{'e': 0, 'p': 1}
{'f': 0, 'x': 1, 'p': 2, 'b': 3, 'o': 4, 'c': 5, 's': 6, 'noise': 7}
{'s': 0, 'h': 1, 'y': 2, 'l': 3, 't': 4, 'e': 5, 'g': 6, 'missing': 7, 'd': 8, 'i': 9, 'w': 10, 'k': 11, 'noise': 12}
{'u': 0, 'o': 1, 'b': 2, 'g': 3, 'w': 4, 'n': 5, 'e': 6, 'y': 7, 'r': 8, 'p': 9, 'k': 10, 'l': 11, 'noise': 12}
{'f': 0, 't': 1, 'noise': 2}
{'a': 0, 'x': 1, 's': 2, 'd': 3, 'e': 4, 'missing': 5, 'f': 6, 'p': 7, 'noise': 8}
{'c': 0, 'missing': 1, 'd': 2, 'f': 3, 'noise': 4}
{'w': 0, 'n': 1, 'g': 2, 'k': 3, 'y': 4, 'f': 5, 'p': 6, 'o': 7, 'b': 8, 'u': 9, 'e': 10, 'r': 11, 'noise': 12}
{'missing': 0, 'b': 1, 'c': 2, 'r': 3, 's': 4, 'f': 5, 'noise': 6}
{'missing': 0, 'y': 1, 's': 2, 't': 3, 'g': 4, 'h': 5, 'k': 6, 'i': 7, 'f': 8, 'noise': 9}
{'w': 0, 'o': 1, 'n': 2, 'y': 3, 'e': 4, 'u': 5, 'p': 6, 'f': 7, 'g': 8, 'r': 9, 'k': 10, 'l': 11, 'b': 12, 'noise': 13}
{'missing': 0, 'u': 1, 'noise': 2}
{'missing': 0, 'n': 1, 'w': 2, 'k': 3, 'y': 4, 'e': 5, 'u': 6, 'noise': 7}
{'f': 0, 't': 1, 'no

In [11]:
X_train = cleaning(df_train.drop(df_train.columns[0], axis=1)) #multi-hot encode 
Y_train = df_train[df_train.columns[0]]



X_test = cleaning(df_test.drop(df_test.columns[0], axis=1)) #multi-hot encode 
Y_test = df_test[df_test.columns[0]]


Y_train[Y_train == 'e'] = 0

Y_train[Y_train == 'p'] = 1

Y_test[Y_test == 'e'] = 0

Y_test[Y_test == 'p'] = 1


# x is input y is output 
#test is for the final eval train is for the training of the model

In [16]:
 #multi-hot encode X test and X train
 
#  Apply the mapping to each column to replace characters with their corresponding integer
def encode_column(column, mapping):
    return column.apply(lambda entry: mapping[entry] if entry in mapping else np.nan)


# Apply encoding to each column except the classification column

X_train = X_train.apply(lambda col: encode_column(col, columns_multiEncoded[col.name]) 
                           if col.name in columns_multiEncoded else col)

X_test = X_test.apply(lambda col: encode_column(col, columns_multiEncoded[col.name]) 
                           if col.name in columns_multiEncoded else col)


In [15]:
print(X_train)

         cap-diameter  cap-shape  cap-surface  cap-color  \
id                                                         
0                8.80          0            0          0   
1                4.51          1            1          1   
2                6.94          0            0          2   
3                3.88          0            2          3   
4                5.85          1            3          4   
...               ...        ...          ...        ...   
3116940          9.29          0            7          5   
3116941         10.88          6            7          4   
3116942          7.82          1            5          6   
3116943          9.45          2            9          5   
3116944          3.20          1            0          3   

         does-bruise-or-bleed  gill-attachment  gill-spacing  gill-color  \
id                                                                         
0                           0                0             0       

In [13]:
def train_using_entropy(X, y):
    # Decision tree with entropy
    dft_entropy = DecisionTreeClassifier(
        criterion="entropy", random_state=100,
        max_depth=3, min_samples_leaf=5)

    # Performing training
    dft_entropy.fit(X, y)
    return dft_entropy

In [14]:
y_pred = train_using_entropy(X_train, Y_train).predict(X_test)
print("Accuracy : ", accuracy_score(Y_test, y_pred)*100)

ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
import xgboost as xgb
# define data_dmatrix
#https://gist.github.com/pb111/cc341409081dffa5e9eaf60d79562a03
data_dmatrix = xgb.DMatrix(data=X_train,label=Y_train)

In [None]:
# import XGBClassifier
from xgboost import XGBClassifier


# declare parameters
params = {
            'objective':'binary:logistic',
            'max_depth': 4,
            'alpha': 10,
            'learning_rate': 1.0,
            'n_estimators':100
        }
            
            
            
# instantiate the classifier 
xgb_clf = XGBClassifier(**params)



# fit the classifier to the training data
xgb_clf.fit(X_train, Y_train)

In [None]:
# make predictions on test data
y_pred2 = xgb_clf.predict(X_test)

In [None]:

print('XGBoost model accuracy score: {0:0.4f}'. format(accuracy_score(Y_test, y_pred)))