In [146]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import datetime
import math
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

LOADING DATA

In [147]:
df_cpu = pd.read_csv('../model_data/cpuProductData.csv')
df_gpu = pd.read_csv('../model_data/gpuProductData.csv')

FUNCTIONS TO BE UTILISED THROUGHOUT

In [148]:
def clean_data(data, modelsDir):

    # Loading file with models
    models = pd.read_csv(modelsDir)
    models['model'] = models['model'].str.upper()

    # Renaming product model for consistency
    data['model'] = data['model'].str.upper()
    data['brand'] = data['brand'].str.upper()
    data['type'] = data['type'].str.upper()
    data['availability'] = data['availability'].str.upper()

    for dt in data.itertuples():
        for model in models.itertuples():
            if dt.model.find(str(model.model)) != -1:
                data.at[dt.Index, 'model'] = model.Index
                continue

    # Removing models which dont havent been renamed
    data = data.replace('', np.nan)
    data = data.dropna()


In [149]:
def encode_data(data):
    def split_date(date):
        year = int(date[:4])
        month = int(date[4:6])
        day_month = int(date[6:8])
        week = datetime.date(year, month, day_month).isocalendar()[1]
        day_week = datetime.date(year, month, day_month).isocalendar()[2]
        day_year = day_week * week
        quarter = math.ceil(float(month)/3)
        return year, month, quarter, week, day_year, day_month, day_week
    
    # Splitting data into diff components
    
    for dt in data.itertuples():
        year, month, quarter, week, day_year, day_month, day_week = split_date(str(dt.date))
        data.at[dt.Index, 'year'] = year
        data.at[dt.Index, 'month'] = month
        data.at[dt.Index, 'quarter'] = quarter
        data.at[dt.Index, 'week'] = week
        data.at[dt.Index, 'day_year'] = day_year
        data.at[dt.Index, 'day_month'] = day_month
        data.at[dt.Index, 'day_week'] = day_week    

    del data['date']

    label_encoder = LabelEncoder()
    data['brand'] = label_encoder.fit_transform(data['brand'])
    data['model'] = label_encoder.fit_transform(data['model'])
    data['type'] = label_encoder.fit_transform(data['type'])

    enc_brand = pd.get_dummies(data.brand, prefix='brand')
    del data['brand']
    data = pd.concat([data, enc_brand], axis=1)

    enc_model = pd.get_dummies(data.model, prefix='model')
    del data['model']
    data = pd.concat([data, enc_model], axis=1)

    enc_type = pd.get_dummies(data.type, prefix='type')
    del data['type']
    data = pd.concat([data, enc_type], axis=1)

    avail = set(data['availability'].str.upper())
    avail = pd.DataFrame(avail)
    avail = avail.rename(columns={0: 'availability'})

    for av in avail.itertuples():
        data.loc[data['availability'].str.upper() == av.availability, 'availability'] = av.Index
    
    return data

In [150]:
def get_sets(data):

    # shuffling data

    index = [i for i in range(data.shape[0])]
    random.shuffle(index)
    data = data.set_index([index]).sort_index()
    
    # split between train and test
    
    Y = data[['availability']]
    Y = np.array(Y).astype('float32')
    del data['availability']
    del data['model']
    X = np.array(data).astype('float32')
    x_train, x_test, y_train, y_test = train_test_split(X,Y)

    return x_train, x_test, y_train, y_test

MODEL IMPLEMENTATION

In [151]:
clean_data(df_cpu, '../model_data/cpuModels.csv')
clean_data(df_gpu, '../model_data/gpuModels.csv')
df_prods = df_cpu.append(df_gpu, ignore_index=True)

In [152]:
df_prods = encode_data(df_prods)

In [157]:
train_set = df_prods.sample(frac=0.8, random_state=5)
test_set = df_prods.drop(train_set.index)

train_features = train_set.copy()
test_features = test_set.copy()

train_labels = train_features.pop('availability')
test_labels = test_features.pop('availability')

Unnamed: 0,price,year,month,quarter,week,day_year,day_month,day_week,brand_0,brand_1,...,model_109,model_110,model_111,model_112,model_113,model_114,model_115,model_116,type_0,type_1
857,2899,2018.0,8.0,3.0,33.0,231.0,19.0,7.0,0,0,...,0,0,0,0,0,0,0,0,1,0
1302,11999,2019.0,1.0,1.0,3.0,12.0,17.0,4.0,1,0,...,0,0,0,0,0,0,0,0,1,0
2706,12499,2021.0,5.0,2.0,19.0,133.0,16.0,7.0,1,0,...,0,0,0,0,0,0,0,0,1,0
4666,16699,2018.0,12.0,4.0,1.0,1.0,31.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,1
649,6099,2018.0,7.0,3.0,27.0,54.0,3.0,2.0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,25609,2019.0,4.0,2.0,17.0,102.0,27.0,6.0,0,0,...,0,0,0,0,0,0,0,0,1,0
6880,10499,2020.0,11.0,4.0,45.0,270.0,7.0,6.0,0,0,...,0,0,0,0,0,0,0,0,0,1
550,4549,2018.0,6.0,2.0,23.0,46.0,5.0,2.0,1,0,...,0,0,0,0,0,0,0,0,1,0
6116,12299,2019.0,8.0,3.0,32.0,224.0,11.0,7.0,0,0,...,0,0,0,0,0,0,0,0,0,1
