In [161]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import datetime
import math
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

LOADING DATA

In [162]:
# Read the CPU and GPU product CSVs (paths are relative to the notebook's directory).
df_cpu, df_gpu = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../model_data/cpuProductData.csv',
        '../model_data/gpuProductData.csv',
    )
)

FUNCTIONS TO BE UTILISED THROUGHOUT

In [163]:
def clean_data(data, modelsDir):
    """Normalise product rows in place and map model names to known-model ids.

    The text columns ``model``, ``brand``, ``type`` and ``availability`` are
    upper-cased. Every product whose ``model`` text contains a model name
    listed in the CSV at ``modelsDir`` gets its ``model`` replaced by that
    entry's row index in the CSV. Rows holding an empty string or a missing
    value in any column are then dropped.

    Args:
        data: DataFrame with string columns ``model``, ``brand``, ``type`` and
            ``availability``. It is modified in place.
        modelsDir: Path of a CSV file with a ``model`` column of known model
            names.

    Returns:
        The same ``data`` object, so the function can be used either for its
        side effect or for its return value.
    """
    # Loading file with models
    models = pd.read_csv(modelsDir)
    known_models = [
        (model_id, str(name))
        for model_id, name in models['model'].str.upper().items()
    ]

    # Upper-casing the text columns for consistent matching
    for column in ('model', 'brand', 'type', 'availability'):
        data[column] = data[column].str.upper()

    def _match(name):
        # Missing model names cannot be searched; leave them for dropna below.
        if not isinstance(name, str):
            return name
        matched = name
        for model_id, known_name in known_models:
            if known_name in name:
                # Keep scanning: when several known names occur in the text,
                # the last one listed in the CSV wins.
                matched = model_id
        return matched

    # dtype=object keeps integer ids and any unmatched names in one column.
    data['model'] = pd.Series(
        [_match(name) for name in data['model']],
        index=data.index,
        dtype=object,
    )

    # Removing rows with empty or missing values. This is done in place so the
    # cleanup reaches callers that ignore the return value.
    # NOTE(review): products that matched no known model keep their upper-cased
    # name and are NOT removed here - confirm whether they should be dropped.
    data.replace('', np.nan, inplace=True)
    data.dropna(inplace=True)

    return data


In [164]:
def encode_data(data):
    """Convert the product frame into numeric features for the model.

    * ``date`` (read as a ``YYYYMMDD`` string) is replaced by the columns
      ``year``, ``month``, ``quarter``, ``week``, ``day_year``, ``day_month``
      and ``day_week``, stored as floats.
    * ``brand``, ``model`` and ``type`` are label-encoded and then one-hot
      encoded into ``<column>_<code>`` columns appended at the end.
    * ``availability`` is replaced by an integer code per distinct upper-cased
      label, assigned in sorted label order.

    Args:
        data: DataFrame with columns ``date``, ``brand``, ``model``, ``type``
            and ``availability``. The input frame itself is not modified.

    Returns:
        A new DataFrame holding the encoded columns.
    """
    def split_date(date):
        """Break a ``YYYYMMDD`` string into its calendar components."""
        year = int(date[:4])
        month = int(date[4:6])
        day_month = int(date[6:8])
        day = datetime.date(year, month, day_month)
        _, week, day_week = day.isocalendar()
        # Ordinal day within the year (1-366).
        day_year = day.timetuple().tm_yday
        quarter = math.ceil(month / 3)
        return year, month, quarter, week, day_year, day_month, day_week

    # Work on a copy so the caller's frame stays intact and the call can be repeated.
    data = data.copy()

    # Splitting date into its components, built in one pass instead of cell by cell
    date_columns = ['year', 'month', 'quarter', 'week', 'day_year', 'day_month', 'day_week']
    date_parts = pd.DataFrame(
        [split_date(str(raw_date)) for raw_date in data['date']],
        columns=date_columns,
        index=data.index,
        dtype='float64',
    )
    data = pd.concat([data.drop(columns='date'), date_parts], axis=1)

    # Label-encode, then one-hot encode, each categorical column in turn.
    label_encoder = LabelEncoder()
    for column in ('brand', 'model', 'type'):
        data[column] = label_encoder.fit_transform(data[column])
        dummies = pd.get_dummies(data[column], prefix=column)
        data = pd.concat([data.drop(columns=column), dummies], axis=1)

    # Sorted labels give stable codes; iterating a set of strings would change
    # order between interpreter sessions and silently swap the class ids.
    avail_upper = data['availability'].str.upper()
    for code, label in enumerate(sorted(avail_upper.dropna().unique())):
        data.loc[avail_upper == label, 'availability'] = code

    return data

In [165]:
def get_sets(data):
    """Shuffle the rows of ``data`` and split them into train/test arrays.

    ``availability`` is the target. The ``model`` column is discarded and every
    remaining column is used as a feature. Both arrays are cast to float32 and
    split with the default settings of ``train_test_split``.

    Returns:
        x_train, x_test, y_train, y_test
    """
    # Shuffle by giving every row a random position label and sorting on it
    positions = list(range(len(data)))
    random.shuffle(positions)
    shuffled = data.set_index([positions]).sort_index()

    # Target: selecting with a list of columns keeps it two-dimensional
    targets = np.array(shuffled[['availability']]).astype('float32')

    # Features: everything except the target and the model column
    feature_frame = shuffled.drop(columns=['availability', 'model'])
    features = np.array(feature_frame).astype('float32')

    return tuple(train_test_split(features, targets))

MODEL IMPLEMENTATION

In [166]:
# Normalise both product frames (clean_data modifies the frame it is given),
# then stack the CPU rows above the GPU rows under a fresh 0..n-1 index.
# NOTE(review): model ids come from each models CSV's own row index, so a CPU
# id and a GPU id can share the same number after combining - confirm this is
# acceptable for the downstream one-hot encoding.
clean_data(df_cpu, '../model_data/cpuModels.csv')
clean_data(df_gpu, '../model_data/gpuModels.csv')
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
df_prods = pd.concat([df_cpu, df_gpu], ignore_index=True)

In [167]:
# Expand the date into calendar features and encode the categorical columns.
# Rebinding df_prods means this cell expects the un-encoded frame as input.
df_prods = encode_data(df_prods)

In [168]:
# Sample 80% of the rows for training (fixed seed); the remaining rows form the test set.
train_set = df_prods.sample(frac=0.8, random_state=5)
test_set = df_prods.drop(index=train_set.index)

# Separate the target column from the feature columns of each subset.
x_train, y_train = train_set.drop(columns='availability'), train_set['availability'].copy()
x_test, y_test = test_set.drop(columns='availability'), test_set['availability'].copy()

In [172]:
# Convert the pandas objects to plain arrays; the targets become column
# vectors because MinMaxScaler works on 2-D input.
x_train = np.array(x_train)
x_test = np.array(x_test)

y_train = np.reshape(np.array(y_train), (-1, 1))
y_test = np.reshape(np.array(y_test), (-1, 1))

scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

# Learn the scaling ranges from the training data only and reuse them for the
# test data. Refitting on the test set would scale the two splits with
# different mappings and leak test statistics into preprocessing.
x_train = scaler_x.fit_transform(x_train)
x_test = scaler_x.transform(x_test)

# NOTE(review): the targets are class ids fed to a sparse categorical loss;
# min-max scaling only leaves them valid while the ids are exactly 0 and 1.
y_train = scaler_y.fit_transform(y_train)
y_test = scaler_y.transform(y_test)


In [177]:
# Number of values in one feature row; the hidden-layer widths are derived from it.
data_length = len(x_train[0])

# Feed-forward classifier assembled layer by layer.
model = keras.Sequential()
model.add(layers.Dense(data_length + 1, input_dim=data_length, kernel_initializer='normal', activation='relu'))
model.add(layers.Dense(data_length * 8, activation='relu'))
model.add(layers.Dense(data_length * 3, activation='relu'))
model.add(layers.Dropout(0.2))
# Two-unit softmax output layer.
model.add(layers.Dense(2, activation='softmax'))

In [180]:
# Configure training: SGD with Nesterov momentum, a sparse categorical
# cross-entropy loss (integer class ids as targets) and accuracy as the
# reported metric.
model.compile(
    optimizer=tf.optimizers.SGD(learning_rate=0.005, momentum=0.9, nesterov=True),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Print the layer-by-layer overview of the compiled model.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 158)               24964     
_________________________________________________________________
dense_9 (Dense)              (None, 1256)              199704    
_________________________________________________________________
dense_10 (Dense)             (None, 471)               592047    
_________________________________________________________________
dropout_2 (Dropout)          (None, 471)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 944       
Total params: 817,659
Trainable params: 817,659
Non-trainable params: 0
_________________________________________________________________


In [181]:
# Train silently (verbose=0); 20% of the training samples are held out for validation.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=150,
    epochs=500,
    validation_split=0.2,
    shuffle=True,
    verbose=0,
)

# One row per epoch: the recorded metrics plus the epoch number.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.head()

Unnamed: 0,loss,accuracy,val_loss,val_accuracy,epoch
0,0.623642,0.721916,0.583189,0.723729,0
1,0.579074,0.722764,0.575301,0.723729,1
2,0.571301,0.722764,0.567537,0.723729,2
3,0.563792,0.722764,0.55968,0.723729,3
4,0.554679,0.722764,0.552003,0.723729,4
