In [140]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import datetime
import math
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

LOADING DATA

In [141]:
# Load the raw CPU and GPU product listings.
# The paths are relative, so they resolve against the kernel's current working
# directory; both frames are cleaned by clean_data() further down.
df_cpu = pd.read_csv('../model_data/cpuProductData.csv')
df_gpu = pd.read_csv('../model_data/gpuProductData.csv')

CLEANING DATA

In [142]:
def clean_data(data, modelsDir):
    """Normalise raw product listings and encode them as numeric features.

    Steps, in order:

    1. Upper-case the ``model``, ``brand``, ``type`` and ``availability``
       text columns.
    2. Replace each ``model`` string with the canonical name from the models
       file that it contains. If several canonical names occur in the same
       string, the one listed last in the file wins. A string that matches
       none keeps its upper-cased text.
    3. Drop every row holding an empty string or a missing value in any
       column.
    4. Derive calendar features (``year``, ``month``, ``quarter``, ``week``,
       ``day_year``, ``day_month``, ``day_week``) from ``date``.
    5. Integer-encode ``brand``, ``model`` and ``availability`` (codes follow
       the sorted order of the distinct values) and append one-hot
       ``brand_<code>`` / ``model_<code>`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw listings. Must contain ``model``, ``brand``, ``type``,
        ``availability`` and ``date``; ``str(date)`` must start with
        ``YYYYMMDD``. The frame is not modified.
    modelsDir : str or path-like
        CSV file with a ``model`` column listing the canonical model names.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned, encoded data. ``type`` is left as
        upper-cased text and ``date`` is left untouched.
    """

    def split_date(date):
        """Break a ``YYYYMMDD...`` string into its calendar components."""
        year = int(date[:4])
        month = int(date[4:6])
        day_month = int(date[6:8])
        day = datetime.date(year, month, day_month)
        # ISO calendar: week number and weekday (1 = Monday ... 7 = Sunday).
        _, week, day_week = day.isocalendar()
        # True ordinal day of the year (1-366). This used to be computed as
        # day_week * week, which is not a day-of-year at all.
        day_year = day.timetuple().tm_yday
        quarter = (month - 1) // 3 + 1
        return year, month, quarter, week, day_year, day_month, day_week

    # Canonical model names, in file order.
    # NOTE(review): matching is case-sensitive against upper-cased listings,
    # so the models file is presumably upper-case already -- confirm.
    models = pd.read_csv(modelsDir)
    known_models = [str(name) for name in models['model']]

    def canonical_model(raw):
        """Return the last canonical name contained in ``raw``, else ``raw``."""
        if not isinstance(raw, str):
            # Missing value: leave it for the dropna() below instead of crashing.
            return raw
        matched = raw
        for name in known_models:
            if name in raw:
                matched = name
        return matched

    # Work on a copy so the caller's frame is never altered.
    data = data.copy()

    # Upper-case the text columns for consistency.
    for column in ('model', 'brand', 'type', 'availability'):
        data[column] = data[column].str.upper()

    data['model'] = data['model'].map(canonical_model)

    # Drop rows with an empty string or missing value in any column.
    # NOTE(review): rows whose model matched no canonical name are NOT removed
    # here; they keep their original upper-cased model text.
    data = data.replace('', np.nan).dropna()

    # Split the date into its components (stored as floats, as before).
    date_columns = ['year', 'month', 'quarter', 'week',
                    'day_year', 'day_month', 'day_week']
    date_parts = np.array(
        [split_date(str(stamp)) for stamp in data['date']], dtype='float64'
    ).reshape(-1, len(date_columns))
    data = data.assign(
        **{name: date_parts[:, pos] for pos, name in enumerate(date_columns)}
    )

    # Integer codes in sorted order of the distinct values (the same codes a
    # label encoder fitted on the column would produce), then one-hot columns.
    data['brand'] = pd.factorize(data['brand'], sort=True)[0]
    data['model'] = pd.factorize(data['model'], sort=True)[0]
    data = pd.concat(
        [
            data,
            pd.get_dummies(data['brand'], prefix='brand'),
            pd.get_dummies(data['model'], prefix='model'),
        ],
        axis=1,
    )

    # Encode availability in sorted order too. Iterating a set of strings, as
    # was done before, can yield a different label -> integer mapping on every
    # kernel restart.
    data['availability'] = pd.factorize(data['availability'], sort=True)[0]

    return data


In [143]:
def get_sets(data, random_state=None):
    """Shuffle the cleaned data and split it into train and test arrays.

    The ``availability`` column is the target. The ``model`` column is
    dropped, and every remaining column is used as a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``availability`` and ``model`` columns. The caller's
        frame is not modified.
        NOTE(review): every remaining column is cast to float32, so text
        columns (e.g. the ``type`` column that clean_data leaves as text)
        presumably have to be removed before calling -- confirm at the call
        site.
    random_state : int, optional
        Seed for both the row shuffle and the train/test split. With the
        default ``None`` the global ``random`` state and scikit-learn's
        default randomness are used, exactly as before, so results differ
        from run to run.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` as float32 NumPy arrays; the
        target arrays keep a trailing dimension of size 1.
    """
    # Shuffle rows by assigning a random permutation as the index and sorting.
    rng = random if random_state is None else random.Random(random_state)
    order = list(range(data.shape[0]))
    rng.shuffle(order)
    shuffled = data.set_index([order]).sort_index()

    # Separate the target from the features.
    Y = shuffled[['availability']].to_numpy().astype('float32')
    features = shuffled.drop(columns=['availability', 'model'])
    X = features.to_numpy().astype('float32')

    # Split between train and test (scikit-learn's default proportions).
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, random_state=random_state
    )

    return x_train, x_test, y_train, y_test

In [144]:
# Clean each raw frame against the model list of its own hardware category.
# NOTE(review): df_cpu / df_gpu are rebound to the cleaned frames, so this cell
# is not idempotent. Re-running it without reloading the CSVs feeds
# already-encoded data back into clean_data, so re-run the loading cell first.
df_cpu = clean_data(df_cpu, '../model_data/cpuModels.csv')
df_gpu = clean_data(df_gpu, '../model_data/gpuModels.csv')

Unnamed: 0,brand,model,price,availability,date,type,year,month,quarter,week,...,model_125,model_126,model_127,model_128,model_129,model_130,model_131,model_132,model_133,model_134
0,1,74,32999,0,20171231072512,CPU,2017.0,12.0,4.0,52.0,...,0,0,0,0,0,0,0,0,0,0
1,1,73,27999,0,20171231072512,CPU,2017.0,12.0,4.0,52.0,...,0,0,0,0,0,0,0,0,0,0
2,1,72,22999,0,20171231072512,CPU,2017.0,12.0,4.0,52.0,...,0,0,0,0,0,0,0,0,0,0
3,0,124,18299,0,20171231072512,CPU,2017.0,12.0,4.0,52.0,...,0,0,0,0,0,0,0,0,0,0
4,1,71,16199,0,20171231072512,CPU,2017.0,12.0,4.0,52.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2777,0,95,3299,1,20210702153249,CPU,2021.0,7.0,3.0,26.0,...,0,0,0,0,0,0,0,0,0,0
2778,0,93,2179,1,20210702153249,CPU,2021.0,7.0,3.0,26.0,...,0,0,0,0,0,0,0,0,0,0
2779,1,14,1849,1,20210702153249,CPU,2021.0,7.0,3.0,26.0,...,0,0,0,0,0,0,0,0,0,0
2780,0,6,1499,1,20210702153249,CPU,2021.0,7.0,3.0,26.0,...,0,0,0,0,0,0,0,0,0,0
