In [132]:
# libraries import
from keras.models import Sequential
from keras import layers
from keras.models import Model
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# file import
import data_cleaner as dc
import model_helper as mh

class BestModel:
    """Bag-of-words text classifier: CountVectorizer features fed to a small Keras network.

    Constructing an instance runs the whole pipeline: the item data is loaded
    and cleaned through ``data_cleaner``, split into train/validation/test
    sets, vectorized, and a one-hidden-layer network is trained and evaluated.

    Attributes:
        df: The cleaned item dataframe returned by ``data_cleaner``.
        vectorizer: The ``CountVectorizer`` fitted on the training text only.
        model: The trained Keras ``Sequential`` model.
        train_accuracy: Accuracy reported by ``model.evaluate`` on the training split.
        test_accuracy: Accuracy reported by ``model.evaluate`` on the test split.
    """

    def __init__(self, neuron=330, min_df = 0, epochs=1, batch_size=10):
        """Load the data, fit the vectorizer, then train and evaluate the network.

        Args:
            neuron: Number of units in the hidden ``Dense`` layer.
            min_df: ``min_df`` option forwarded to ``CountVectorizer``.
            epochs: Number of training epochs (defaults to the previously
                hard-coded value of 1).
            batch_size: Training batch size (defaults to the previously
                hard-coded value of 10).
        """
        self.df = dc.clean_item_data(0)
        self.df = dc.cleanup_categoryid(self.df)

        # NOTE(review): 0.65 / 0.15 / 0.20 are presumably the train / validation /
        # test fractions, in the order the splits are returned -- confirm in data_cleaner.
        _X_train, _X_valid, _X_test, Y_train, Y_valid, Y_test = dc.data_split(self.df, 0.65, 0.15, 0.20)

        # An integer min_df of 0 keeps exactly the same vocabulary as 1 (every
        # fitted term occurs in at least one document), but some scikit-learn
        # releases only accept integer values >= 1, so normalise it here.
        if isinstance(min_df, int) and min_df == 0:
            min_df = 1
        self.vectorizer = CountVectorizer(encoding='latin1', min_df = min_df) # Allow different options (min_df, encoding)

        # convert pandas dataframes to list of strings
        x_train_list = self._first_column_as_list(_X_train)
        x_test_list = self._first_column_as_list(_X_test)
        x_valid_list = self._first_column_as_list(_X_valid)

        # Fit on the training text only, so the vocabulary does not depend on
        # the validation or test documents.
        self.vectorizer.fit(x_train_list)
        X_train = self.vectorizer.transform(x_train_list)
        X_test = self.vectorizer.transform(x_test_list)
        X_valid = self.vectorizer.transform(x_valid_list)

        # Neural Network
        input_dim = X_train.shape[1] # Number of features
        print('X train shape: ' + str(input_dim))
        output_dim = self.df['categoryId'].nunique()  # one softmax unit per distinct categoryId
        model = Sequential()
        model.add(layers.Dense(neuron, input_dim=input_dim, activation='relu', use_bias=False))
        # NOTE(review): two consecutive Dropout(0.6) layers -- possibly a
        # duplicated line. Kept as-is so the trained model is unchanged.
        model.add(layers.Dropout(rate=0.6))
        model.add(layers.Dropout(rate=0.6))
        model.add(layers.Dense(output_dim, activation='softmax'))
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        model.fit(X_train, Y_train,
                  epochs=epochs,
                  verbose=1,
                  validation_data=(X_valid, Y_valid),
                  batch_size=batch_size)

        # evaluate() returns [loss, accuracy] for the single compiled metric;
        # only the accuracy is kept.
        _, self.train_accuracy = model.evaluate(X_train, Y_train, verbose=False)
        _, self.test_accuracy = model.evaluate(X_test, Y_test, verbose=False)
        self.model = model

    @staticmethod
    def _first_column_as_list(frame):
        """Return the first column of ``frame`` as a plain Python list.

        Replaces a row-by-row ``iterrows()`` loop that read ``row[0]``.
        NOTE(review): assumes the text lives in the first column of each split
        returned by ``data_split`` -- confirm in data_cleaner.
        """
        return frame.iloc[:, 0].tolist()

    def get_accuracy(self):
        """Return ``(train_accuracy, test_accuracy)``, each rounded to 4 decimal places."""
        return (round(self.train_accuracy, 4), round(self.test_accuracy, 4))

    def get_category(self,s):
        """Predict the class index for a single text string.

        Args:
            s: The text to classify.

        Returns:
            A NumPy array holding one predicted class index (argmax over the
            softmax output).
        """
        s_arr = np.array([s])
        vector = self.vectorizer.transform(s_arr)
        # Sequential.predict_classes() is no longer available in current
        # Keras/TensorFlow releases; argmax over the predicted probabilities
        # is the equivalent for a multi-unit softmax output.
        probabilities = self.model.predict(vector)
        return np.argmax(probabilities, axis=-1)

In [133]:
# Build the classifier with its default arguments. Construction runs the whole
# pipeline (load and clean data, fit the vectorizer, train and evaluate the network).
bm = BestModel()



X train shape: 8912
Train on 7066 samples, validate on 1663 samples
Epoch 1/1


In [134]:
# Shows (train accuracy, test accuracy), each rounded to 4 decimal places.
bm.get_accuracy()

(0.8452, 0.779)

In [135]:
# Predicted class index for a sample description, returned as a one-element array.
bm.get_category('lamp light battery')

array([37])