In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from keras import layers
from feature_engine.encoding import OrdinalEncoder
import joblib




In [2]:
def ModelBuilder(n_features: int = 18):
    """Build the fully-connected binary classifier with three additive skip connections.

    Args:
        n_features: Number of input features per sample. Defaults to 18, the
            width the original code hard-coded.

    Returns:
        An uncompiled ``keras.Model`` mapping ``(batch, n_features)`` inputs to
        a single sigmoid output per sample.

    Note:
        Layers are created in exactly the same order as before, so the layer
        stack itself is unchanged by this rewrite.
    """
    # `(18)` is just the integer 18; the shape must be a real one-element tuple.
    inputs = keras.Input(shape = (n_features,))
    X1 = layers.Dense(18,activation = "relu")(inputs)
    X2 = layers.Dense(36,activation = "relu")(X1)
    X3 = layers.Dense(18,activation = "relu")(X2)
    # Skip connection: X3 and X1 are both 18 wide, so they can be summed.
    X4 = layers.Dense(28,activation = "relu")(X3 + X1)
    X5 = layers.Dense(20,activation = "relu")(X4)
    X6 = layers.Dense(20,activation = "relu")(X5)
    X6 = layers.Dropout(0.3)(X6)
    # Skip connection over the dropout branch (both operands are 20 wide).
    X6 = layers.Dense(16,activation = "relu")(X5 + X6)
    X7 = layers.Dense(10,activation = "relu")(X6)
    X8 = layers.Dense(10,activation = "relu")(X7)
    X8 = layers.Dropout(0.3)(X8)
    # Skip connection over the second dropout branch (both operands are 10 wide).
    X9 = layers.Dense(5,activation = "relu")(X8 + X7)
    output = layers.Dense(1,activation = "sigmoid")(X9)
    model = keras.Model(inputs = inputs, outputs = output)
    return model

In [3]:
class CustomLearningRateScheduler(tf.keras.callbacks.Callback):
    """Scale the optimizer learning rate while ``val_loss`` sits inside a loss band.

    ``loss_limits`` is an ordered list of ``(upper, lower)`` pairs. Only the
    first pair is active at any time:

    * if ``lower < val_loss < upper`` the learning rate is multiplied by
      ``lr_reduction_factor`` and capped at ``max_lr``;
    * if ``val_loss < lower`` and more pairs remain, the first pair is
      discarded so the next one becomes active.

    Args:
        loss_limits: List of ``(upper, lower)`` validation-loss bounds.
        lr_reduction_factor: Multiplier applied to the current learning rate.
            Despite the name, nothing restricts it to values below 1; the
            result is only bounded from above by ``max_lr``.
        max_lr: Upper bound for the new learning rate.
            NOTE(review): the default of 1e-7 clamps the rate to 1e-7 the first
            time a band matches -- confirm that default is intended.
    """

    def __init__(self, loss_limits:list, lr_reduction_factor, max_lr=1e-7):
        super(CustomLearningRateScheduler, self).__init__()
        # Copy the list: bands are popped during training and the caller's
        # list must not be consumed as a side effect.
        self.loss_limits = list(loss_limits)
        self.lr_reduction_factor = lr_reduction_factor
        self.max_lr = max_lr

    def on_epoch_end(self, epoch, logs=None):
        """Adjust the learning rate, or advance to the next band, from ``val_loss``."""
        # `logs` may be None and `val_loss` is absent when no validation data
        # is supplied; in both cases there is nothing to act on.
        current_loss = (logs or {}).get('val_loss')
        if current_loss is None or len(self.loss_limits) == 0:
            return

        active_band = self.loss_limits[0]
        upper, lower = active_band[0], active_band[1]

        if current_loss < upper and current_loss > lower:
            current_lr = float(tf.keras.backend.get_value(self.model.optimizer.lr))
            new_lr = min(current_lr * self.lr_reduction_factor, self.max_lr)
            tf.keras.backend.set_value(self.model.optimizer.lr, new_lr)
            print(f"lr : {new_lr}")
        elif current_loss < lower and len(self.loss_limits)!=1:
            # The loss dropped below the active band: move on to the next one.
            # The last band is never removed.
            self.loss_limits.pop(0)

In [4]:
# Two (upper, lower) val_loss bands. While val_loss is inside the active band
# the callback multiplies the learning rate by 1.03, capped at 3e-3.
lr_custom_callback = CustomLearningRateScheduler(
    loss_limits = [(0.09,0.08),(0.07,0.06)],
    lr_reduction_factor = 1.03,
    max_lr = 3e-3
)

In [5]:
# Built-in plateau scheduler: monitors the training loss, multiplies the
# learning rate by 0.85 after 3 epochs without improvement, floor at 0.8e-3.
lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor = 'loss',
    factor = 0.85,
    patience = 3,
    min_lr = 0.8e-3
)

In [6]:
# Build the network; its trained weights are restored in the next cell.
model = ModelBuilder()




In [7]:
# Restore the trained weights from the checkpoint. The call returns a
# load-status object, which is what the cell displays as its output.
model.load_weights("MASHROOM_CLASSIFICATION/cp.ckpt")




<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x232911ccc90>

In [8]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 18)]                 0         []                            
                                                                                                  
 dense (Dense)               (None, 18)                   342       ['input_1[0][0]']             
                                                                                                  
 dense_1 (Dense)             (None, 36)                   684       ['dense[0][0]']               
                                                                                                  
 dense_2 (Dense)             (None, 18)                   666       ['dense_1[0][0]']             
                                                                                              

Now let's build the preprocessing pipeline.

In [50]:
class ModelPipeLine():
    """Single-use preprocessing pipeline: raw feature/label frames -> model tensors.

    ``transform`` runs four stages in order: drop/fill/bucket columns, impute
    and encode the categorical columns, min-max scale the numeric columns and
    label-encode the target.

    Notes:
        * ``X_data`` and ``y_data`` are modified in place (columns are dropped,
          values overwritten). Pass copies if the originals are still needed.
        * ``transform`` is meant to be called once per instance: the first call
          drops columns that a second call would try to drop again.
        * The placeholder category is spelled ``'unkown'`` on purpose; it is a
          runtime value shared with the ``cats`` lists, so it must not be
          "corrected" here.

    Args:
        X_data: Feature DataFrame containing at least the columns referenced in
            the methods below.
        y_data: DataFrame with an ``id`` column plus the label column(s).
        imputer_model: Object whose ``predict`` fills missing ``cap-surface``
            values from the remaining columns.
        minmax_scaler: Object whose ``transform`` scales ``cap-diameter``,
            ``stem-height`` and ``stem-width``.
        ordinal_encoder: Object whose ``transform`` takes and returns a
            DataFrame.
        cats: Mapping ``column -> allowed values``; anything else in that
            column is replaced by ``'unkown'``.
        frequencies: Mapping ``column -> lookup`` handed to ``Series.map`` for
            frequency encoding.
        verbose: When True (default) print per-column null counts after each
            stage, as the original implementation always did.
    """

    def __init__(self, X_data, y_data, imputer_model, minmax_scaler, ordinal_encoder, cats:dict, frequencies:dict, verbose:bool = True):
        self.X = X_data
        self.y = y_data
        self.imputer = imputer_model
        self.minmax_scaler = minmax_scaler
        self.ordinal_encoder = ordinal_encoder
        self.cats = cats
        self.frequencies = frequencies
        self.verbose = verbose

    def _report_nulls(self, stage):
        """Print the null count of every column of ``self.X``, prefixed by ``stage``."""
        if self.verbose:
            print(stage, self.X.isna().sum())

    def __drops(self):
        """Drop unused columns, mode-fill sparse nulls and bucket unlisted categories."""
        #drop useless features
        self.X.drop(columns = ['veil-type','veil-color','id'], inplace = True)
        self.y.drop(columns = ['id'],inplace = True)

        #Null values: fill each listed column with its most frequent value
        columns = ['gill-color','cap-diameter','cap-shape','cap-color','does-bruise-or-bleed','stem-color','has-ring','habitat','ring-type','stem-height']
        fill_values = {}
        for i in columns:
            counts = self.X[i].value_counts()
            # An all-null column has no most frequent value; leave it untouched
            # instead of failing on an empty index.
            if len(counts) > 0:
                fill_values[i] = counts.index[0]
        self.X.fillna(fill_values, inplace = True)
        self._report_nulls("few nulls:")

        self.categorical_vars = [
             var for var in self.X.select_dtypes(
             include="O").columns.to_list() if var !="target"
             ]

        #Dropping rare categories: anything outside the allowed list becomes 'unkown'
        for i in self.cats.keys():
            self.X.loc[~self.X[i].isin(self.cats[i]),i] = 'unkown'

        self._report_nulls("cats:")

    def __ImputingNullValues(self):
        """Fill remaining nulls and turn every categorical column into numbers.

        Statement order matters here: the snapshot ``self.X2`` is taken before
        frequency encoding, and the blanket ``fillna(0)`` runs after
        ``cap-surface`` is frequency-mapped but before ``gill-attachment`` is.
        """
        self.X['stem-root'] = self.X['stem-root'].fillna('unkown')
        self.X['spore-print-color'] = self.X['spore-print-color'].fillna('unkown')
        self.X['stem-surface'] = self.X['stem-surface'].fillna('unkown')
        self.X['gill-spacing'] = self.X['gill-spacing'].fillna('unkown')

        self._report_nulls("fiilna:")

        #Freqency Encoding of every categorical column except the two handled below
        Encode1  = self.categorical_vars.copy()
        Encode1.remove('cap-surface')
        Encode1.remove('gill-attachment')

        # Snapshot of the frame before frequency encoding (returned by transform).
        self.X2 = self.X.copy()
        for i in Encode1:
            frequency = self.frequencies[i]
            self.X[i] = self.X[i].map(frequency)

        self._report_nulls("frequency:")

        # Replace any 'unkown' placeholder that is still present with 0.
        # The previous check `'unkown' in self.X[j]` tested the index labels
        # rather than the values, so it never matched; `isin` tests the values
        # and is safe on columns that are already numeric.
        for j in self.categorical_vars:
            placeholder_rows = self.X[j].isin(['unkown'])
            if placeholder_rows.any():
               self.X.loc[placeholder_rows, j] = 0

        #Imputing with Randomforest model

        self.X = self.ordinal_encoder.transform(self.X)
        self._report_nulls("ordinal_encoder:")

        x = self.X.drop('cap-surface', axis=1)
        y = self.X['cap-surface'].copy()

        X_missing = x[y.isna()]
        if len(X_missing)>0:
           self.X.loc[y.isna(),'cap-surface'] = self.imputer.predict(X_missing)
        self._report_nulls("imputer:")

        #Imputing and encoding what remained
        frequency = self.frequencies['cap-surface']
        self.X['cap-surface'] = self.X['cap-surface'].map(frequency)

        self.X = self.X.fillna(0)

        # NOTE(review): values missing from this lookup become NaN and are not
        # filled afterwards -- confirm the lookup covers every encoded value.
        frequency = self.frequencies['gill-attachment']
        self.X['gill-attachment'] = self.X['gill-attachment'].map(frequency)

    def __ScalingNumericalVars(self):
        """Scale the three numeric columns with the supplied scaler."""
        self.X[['cap-diameter', 'stem-height', 'stem-width']] = self.minmax_scaler.transform(self.X[['cap-diameter', 'stem-height', 'stem-width']])

    def __LabelEncoding(self):
        """Encode the labels in ``self.y``: ``"p" -> 1`` and ``"e" -> 0``.

        Values other than "p"/"e" are left as they are. The mapping is applied
        column by column instead of ``DataFrame.replace`` so that the result
        does not depend on pandas' deprecated silent downcasting.
        """
        labels = {"p":1 ,"e":0}
        for col in self.y.columns:
            if pd.api.types.is_numeric_dtype(self.y[col]):
                continue
            self.y[col] = self.y[col].map(lambda value: labels.get(value, value))

    def transform(self):
        """Run every stage and return the model-ready data.

        Returns:
            Tuple ``(X, y, X_frame, X_snapshot)`` where ``X`` and ``y`` are
            float64 tensors built from the processed frames, ``X_frame`` is the
            processed feature DataFrame and ``X_snapshot`` is the copy taken
            just before frequency encoding.
        """
        self.__drops()
        self.__ImputingNullValues()
        self.__ScalingNumericalVars()
        self.__LabelEncoding()
        X = tf.convert_to_tensor(self.X.values)
        y = tf.convert_to_tensor(self.y.values)
        y = tf.cast(y,dtype=tf.float64)
        X = tf.cast(X,dtype=tf.float64)
        return X,y,self.X,self.X2


        

In [10]:
import pickle

# NOTE: unpickling can execute arbitrary code -- only load artifacts that come
# from a trusted source.
# The context manager closes the file handle, which the bare open() call never did.
with open("imputer.pkl", 'rb') as imputer_file:
    imputer = pickle.load(imputer_file)

In [11]:
# Scaler artifact; ModelPipeLine calls its transform() on the three numeric columns.
MINMAX_SCALER = joblib.load('minmax_scaler.pkl')

In [12]:
# NOTE(review): this rebinds the name `OrdinalEncoder`, which the import cell
# bound to the feature_engine class. From here on the name refers to the loaded
# encoder object, which is what gets passed to ModelPipeLine.
OrdinalEncoder = joblib.load('ordinal_encoder.pkl')

In [13]:
# Per-column lookup tables; ModelPipeLine indexes this by column name and hands
# each entry to Series.map for frequency encoding.
frequencies = joblib.load('frequencies')

In [14]:
# Raw test features.
Xtest = pd.read_csv('test/test.csv')
# Sample-submission frame; its `class` column is overwritten with the model's
# predictions further down and the frame is then written to result.csv.
ytest = pd.read_csv('sample_submission/sample_submission.csv')

In [44]:
# Allowed values per categorical column. ModelPipeLine replaces every value that
# is not listed for its column with the placeholder 'unkown'. np.nan is listed
# for 'cap-surface' and 'gill-attachment', so nulls in those two columns are
# kept as nulls rather than turned into the placeholder.
categories = {
    'cap-shape': ['x', 'o', 'b', 'f', 's', 'p', 'c'],
    'cap-surface': [
        np.nan, 'g', 'k', 's', 'y', 'e', 'i', 'd', 'w', 'h', 't', 'l',
    ],
    'cap-color': [
        'w', 'e', 'n', 'u', 'y', 'l', 'g', 'r', 'p', 'o', 'k', 'b',
    ],
    'does-bruise-or-bleed': ['f', 't'],
    'gill-attachment': ['x', 'a', np.nan, 's', 'd', 'e', 'p', 'f'],
    'gill-spacing': ['c', 'unkown', 'f', 'd'],
    'gill-color': [
        'p', 'n', 'o', 'w', 'y', 'f', 'g', 'r', 'k', 'b', 'u', 'e',
    ],
    'stem-root': ['unkown', 's', 'c', 'b', 'r', 'f'],
    'stem-surface': ['unkown', 'h', 'y', 's', 't', 'i', 'g', 'k', 'f'],
    'stem-color': [
        'p', 'y', 'w', 'n', 'g', 'r', 'e', 'o', 'u', 'l', 'k', 'b', 'f',
    ],
    'has-ring': ['f', 't'],
    'ring-type': ['f', 'r', 'l', 'e', 'z', 'p', 'g', 'm'],
    'spore-print-color': ['unkown', 'p', 'k', 'w', 'n', 'g', 'r', 'u'],
    'habitat': ['l', 'd', 'm', 'g', 'h', 'w', 'p', 'u'],
    'season': ['a', 's', 'u', 'w'],
}

In [51]:
# Copies are passed because the pipeline modifies the frames it is given in
# place (it drops columns and overwrites values).
PipeLine = ModelPipeLine(
    X_data= Xtest.copy(),
    y_data= ytest.copy(),
    imputer_model= imputer,
    minmax_scaler= MINMAX_SCALER,
    ordinal_encoder= OrdinalEncoder,
    cats= categories,
    frequencies= frequencies
)

In [52]:
# Run the full preprocessing chain. Returns the feature/label tensors, the
# processed feature frame (px) and the snapshot taken before frequency encoding (px2).
X_test, y_test,px,px2 =  PipeLine.transform()

few nulls: cap-diameter                  0
cap-shape                     0
cap-surface              446904
cap-color                     0
does-bruise-or-bleed          0
gill-attachment          349821
gill-spacing             839595
gill-color                    0
stem-height                   0
stem-width                    0
stem-root               1838012
stem-surface            1321488
stem-color                    0
has-ring                      0
ring-type                     0
spore-print-color       1899617
habitat                       0
season                        0
dtype: int64
cats: cap-diameter                 0
cap-shape                    0
cap-surface             446904
cap-color                    0
does-bruise-or-bleed         0
gill-attachment         349821
gill-spacing                 0
gill-color                   0
stem-height                  0
stem-width                   0
stem-root                    0
stem-surface                 0
stem-color            



ordinal_encoder: cap-diameter                 0
cap-shape                    0
cap-surface             447295
cap-color                    0
does-bruise-or-bleed         0
gill-attachment         350143
gill-spacing                 0
gill-color                   0
stem-height                  0
stem-width                   0
stem-root                    0
stem-surface                 0
stem-color                   0
has-ring                     0
ring-type                    0
spore-print-color            0
habitat                      0
season                       0
dtype: int64
imputer: cap-diameter                 0
cap-shape                    0
cap-surface                  0
cap-color                    0
does-bruise-or-bleed         0
gill-attachment         350143
gill-spacing                 0
gill-color                   0
stem-height                  0
stem-width                   0
stem-root                    0
stem-surface                 0
stem-color                   0


  self.y.replace(labels, inplace=True)


In [53]:
# Sanity check: per-column null counts of the processed features.
px.isna().sum()

cap-diameter            0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-height             0
stem-width              0
stem-root               0
stem-surface            0
stem-color              0
has-ring                0
ring-type               0
spore-print-color       0
habitat                 0
season                  0
dtype: int64

In [59]:
# NOTE: `X` holds the model's sigmoid outputs here (one value per row), not features.
X = model.predict(X_test)



In [99]:
# Threshold the sigmoid outputs at 0.5. The model emits one value per row in a
# 2-D array, so ravel() flattens it and a plain 1-D vector is assigned to the
# column instead of relying on pandas to squeeze it.
ytest['class'] = (X > 0.5).astype(int).ravel()

In [100]:
# Map the 0/1 predictions back to label letters (inverse of the {"p":1, "e":0}
# encoding used in ModelPipeLine), then display the frame.
ytest['class'] = ytest['class'].replace({1:'p',0:'e'})
ytest

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e
...,...,...
2077959,5194904,p
2077960,5194905,p
2077961,5194906,p
2077962,5194907,e


In [102]:
# Write the predictions to disk with a header row and without the index column.
ytest.to_csv("result.csv",header=True,index=False)