In [63]:
import pandas as pd
import optuna
from catboost import CatBoostClassifier
import lightgbm as lgbm
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training set, the test set and the sample-submission file from ./data.
train, test, submission = map(
    pd.read_csv,
    ("data/train.csv", "data/test.csv", "data/sample_submission.csv"),
)

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Split the training frame into features (X) and target (y), and drop the id
# column from the test features.
# The set_index call is guarded so this cell can be re-run: once "id" has become
# the index it is no longer a column, and an unconditional set_index("id")
# would raise KeyError on the second execution.
if "id" in train.columns:
    train = train.set_index("id")
y = train['class']
X = train.drop(columns='class')
X_test = test.drop(columns='id')


In [42]:
# Preview the first five rows of the test features.
X_test.head(n=5)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,8.64,x,,n,t,,,w,11.13,17.12,b,,w,u,w,t,g,,d,a
1,6.9,o,t,o,f,,c,y,1.27,10.75,,,n,,,f,f,,d,a
2,2.0,b,g,n,f,,c,n,6.18,3.14,,,n,,,f,f,,d,s
3,3.47,x,t,n,f,s,c,n,4.98,8.51,,,w,,n,t,z,,d,u
4,6.17,x,h,y,f,p,,y,6.73,13.7,,,y,,y,t,,,d,u


In [53]:
# Names of the object-dtype feature columns in X.
categorical_columns = X.select_dtypes(include='object').columns


In [54]:
# Accepted category codes per categorical feature (taken from EDA and a Kaggle
# notebook). Every code is a single letter, so each feature's codes are written
# as one string and expanded into a list of one-letter strings below.
_category_codes = (
    ('season', 'auws'),
    ('cap-shape', 'xfsbo'),
    ('cap-surface', 'tsyhg'),
    ('cap-color', 'nywge'),
    ('does-bruise-or-bleed', 'ft'),
    ('gill-attachment', 'adxes'),
    ('gill-spacing', 'cdf'),
    ('gill-color', 'wnypg'),
    ('stem-root', 'bsrcf'),
    ('stem-surface', 'syitgkhf'),
    ('stem-color', 'wnygoeupkrlb'),
    ('veil-type', 'uw'),
    ('veil-color', 'wynuke'),
    ('has-ring', 'ft'),
    ('ring-type', 'fezlrpgm'),
    ('spore-print-color', 'kpwnrug'),
    ('habitat', 'dglmhwpu'),
)
category_mappings = {feature: list(codes) for feature, codes in _category_codes}

In [59]:
# Cleans one categorical column.
def clean_category(column, df, valid_category, threshold):
    """Normalise the values of one categorical column in place.

    Every value is cast to ``str`` and then mapped, in this order:

    1. strings made only of digits (ignoring ``.``) become ``'Other'``;
    2. strings matching an entry of ``valid_category`` case-insensitively
       become that entry (the first matching entry wins);
    3. strings whose relative frequency in ``df[column]`` is below
       ``threshold`` become ``'Other'``;
    4. anything else is kept unchanged.

    Parameters
    ----------
    column : str
        Name of the column to clean.
    df : pandas.DataFrame
        Frame holding the column; the column is overwritten on this object.
    valid_category : iterable of str
        Category values that are always kept, in their canonical spelling.
    threshold : float
        Minimum relative frequency (0-1) for a value outside
        ``valid_category`` to be kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column`` replaced.
    """
    # NOTE(review): after the str cast, missing entries are expected to appear
    # as a literal string such as 'nan' and to go through rules 1-4 like any
    # other value -- confirm for the pandas version in use.
    values = df[column].astype(str)
    counts = values.value_counts(normalize=True)

    # Lower-case -> canonical spelling, built once instead of once per row.
    # setdefault keeps the first entry when two valid values differ only by case.
    canonical = {}
    for cat in valid_category:
        canonical.setdefault(cat.lower(), cat)

    def map_category(value):
        if value.replace('.', '').isdigit():
            return 'Other'
        lowered = value.lower()
        if lowered in canonical:
            return canonical[lowered]
        if counts.get(value, 0) < threshold:
            return 'Other'
        return value

    # The mapping depends only on the value itself, so it is resolved once per
    # distinct value and then broadcast to the rows with Series.map.
    replacements = {value: map_category(value) for value in counts.index}
    df[column] = values.map(replacements)

    return df
    

In [60]:
# Rarity cut-off: 0.1% of ~3 million rows is roughly 3,000 occurrences.
# The test features go through the same cleaning as the training features.
# NOTE(review): clean_category derives frequencies from the frame it is given,
# so the rare-value cut is evaluated separately on X and on X_test.
rare_share = 0.001
for column in category_mappings:
    allowed = category_mappings[column]
    X = clean_category(column, X, allowed, rare_share)
    X_test = clean_category(column, X_test, allowed, rare_share)

In [61]:
# Categorical feature names, in the order they appear in category_mappings.
cat_feats = [*category_mappings]

cat_feats

['season',
 'cap-shape',
 'cap-surface',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-attachment',
 'gill-spacing',
 'gill-color',
 'stem-root',
 'stem-surface',
 'stem-color',
 'veil-type',
 'veil-color',
 'has-ring',
 'ring-type',
 'spore-print-color',
 'habitat']

In [None]:
# CatBoost with Optuna-tuned hyper-parameters, scored with stratified 5-fold CV.
stratkfold = StratifiedKFold(n_splits = 5, random_state = 0, shuffle = True)

catboost_params = {
    'iterations': 3979,
    'depth': 14,
    'learning_rate': 0.04794429552561989,
    'random_strength': 5,
    'bagging_temperature': 0.6839643118243162,
    'od_type': 'IncToDec',
    'l2_leaf_reg': 8.112600124584425,
    'border_count': 247,
    'eval_metric': 'MCC',
    'random_seed': 0,
}

# cat_features is declared on the estimator itself (not only passed to fit) so
# that copies made by scikit-learn cloning -- e.g. inside VotingClassifier --
# still know which columns are categorical.
cat_model = CatBoostClassifier(**catboost_params, cat_features=cat_feats)

cat_fold_scores = []
# The index arrays get their own names: calling them train/test would overwrite
# the train/test DataFrames loaded at the top of the notebook.
# NOTE(review): the fold used for early stopping is also the fold being scored,
# so these scores may be optimistic.
for fold, (train_idx, valid_idx) in enumerate(stratkfold.split(X, y), start=1):
    x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    cat_model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], early_stopping_rounds=300, verbose=100)
    predictions = cat_model.predict(x_valid)

    # A bare expression inside a loop body is never displayed by the notebook,
    # so the score is stored and printed explicitly.
    fold_score = mcc(y_valid, predictions)
    cat_fold_scores.append(fold_score)
    print(f"CatBoost fold {fold}: MCC = {fold_score:.5f}")

print(f"CatBoost mean MCC = {np.mean(cat_fold_scores):.5f}")


In [None]:

# LightGBM, hyper-parameters tuned on a Kaggle P100 (search bottlenecked by the 12-hour limit).

# LightGBM wants pandas 'category' dtype for the categorical columns.
for column in category_mappings:
    X[column] = X[column].astype('category')
    X_test[column] = X_test[column].astype('category')

lgbm_params = {
    'n_estimators': 6063,
    'learning_rate': 0.12359835539214457,
    'num_leaves': 2480,
    'max_depth': 12,
    'min_data_in_leaf': 8500,
    'lambda_l1': 15,
    'lambda_l2': 80,
    'min_gain_to_split': 0.3736139231328375,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'feature_fraction': 0.4,
}
lgbm_model = lgbm.LGBMClassifier(objective = "binary", **lgbm_params, random_state = 0, verbosity = -1)

lgbm_fold_scores = []
# The index arrays get their own names: calling them train/test would overwrite
# the train/test DataFrames loaded at the top of the notebook.
for fold, (train_idx, valid_idx) in enumerate(stratkfold.split(X, y), start=1):
    x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    # NOTE(review): 'mcc' does not appear among LightGBM's documented built-in
    # metric names -- confirm which metric early stopping actually monitors.
    lgbm_model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        callbacks=[
            lgbm.early_stopping(stopping_rounds=300),
            lgbm.log_evaluation(period=0),
        ],
        eval_metric = 'mcc',
    )
    predictions = lgbm_model.predict(x_valid)

    # A bare expression inside a loop body is never displayed by the notebook,
    # so the score is stored and printed explicitly.
    fold_score = mcc(y_valid, predictions)
    lgbm_fold_scores.append(fold_score)
    print(f"LightGBM fold {fold}: MCC = {fold_score:.5f}")

print(f"LightGBM mean MCC = {np.mean(lgbm_fold_scores):.5f}")



In [None]:
# Voting ensemble: soft voting combines the predicted class probabilities of both models.

# VotingClassifier fits clones of the estimators through a plain fit(X, y), so
# fit-time arguments used in earlier cells are not carried over. The categorical
# columns therefore have to be declared on the CatBoost estimator itself;
# without this CatBoost cannot be fitted on the string/category columns.
cat_model.set_params(cat_features=cat_feats)

estimators = [
    ('catboost', cat_model),
    ('lightgbm', lgbm_model)
]

voter = VotingClassifier(estimators=estimators, voting='soft')
voter.fit(X,y)

preds = voter.predict(X_test)





In [None]:
# Put the ensemble predictions into the submission frame, save it as CSV and
# show the first rows.
submission = submission.assign(**{'class': preds})
submission.to_csv("submission.csv", index=False)
submission.head(n=5)