In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import datetime
import math
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

LOADING DATA

In [3]:
# Read the two product CSV files into DataFrames. The paths are relative, so
# they resolve against the kernel's current working directory.
df_cpu = pd.read_csv('../model_data/cpuProductData.csv')
df_gpu = pd.read_csv('../model_data/gpuProductData.csv')

CLEANING DATA

In [4]:
# Calendar feature columns appended by clean_data, in the order _split_date returns them.
_DATE_COLUMNS = ['year', 'month', 'quarter', 'week', 'day_year', 'day_month', 'day_week']


def _split_date(date):
    """Break a 'YYYYMMDD' string into the calendar features listed in _DATE_COLUMNS.

    Only the first eight characters are read. `week` and `day_week` are the
    ISO week number and ISO weekday (Monday == 1); `day_year` is the ordinal
    day within the calendar year (1..366).
    """
    day = datetime.date(int(date[:4]), int(date[4:6]), int(date[6:8]))
    _, iso_week, iso_weekday = day.isocalendar()
    quarter = (day.month - 1) // 3 + 1
    return (day.year, day.month, quarter, iso_week,
            day.timetuple().tm_yday, day.day, iso_weekday)


def _match_model(listing, catalog):
    """Return the catalogue name contained in `listing`, or '' when none is."""
    if not isinstance(listing, str):
        # Missing model text (NaN) cannot match anything; the row is dropped later.
        return ''
    found = ''
    for name in catalog:
        if name in listing:
            # No early exit: when several names match, the last one in catalogue order wins.
            found = name
    return found


def _sorted_codes(values):
    """Integer codes 0..k-1 for the distinct values, numbered in sorted order.

    Produces the same codes as sklearn's LabelEncoder.fit_transform, without
    a throwaway estimator object.
    """
    codes, _ = pd.factorize(values, sort=True)
    return codes


def clean_data(data, modelsDir):
    """Normalise raw product listings and encode them as numeric features.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'model', 'brand', 'availability' (strings) and
        'date' (whose string form starts with YYYYMMDD). Not modified.
    modelsDir : path or file-like
        Passed to ``pd.read_csv``; the file must have a 'model' column of
        canonical model names. Matching is a case-sensitive substring test
        against the upper-cased listing text.

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        * rows whose model matched no catalogue name, or that contain any
          empty string / NaN in any column, are removed;
        * 'model' holds the matched canonical name;
        * 'brand', 'model_name' and 'availability' hold integer codes
          assigned in sorted order of their upper-cased labels;
        * one-hot columns 'b_<code>' and 'm_<code>' are appended;
        * the float columns listed in ``_DATE_COLUMNS`` are appended.

    Side effects
    ------------
    Prints the availability label -> code mapping.
    """
    # Loading file with models
    models = pd.read_csv(modelsDir)
    catalog = [str(name) for name in models['model']]

    # Work on a copy so the caller's frame keeps its original values and columns.
    data = data.copy()

    # Renaming product model for consistency
    for col in ('model', 'brand', 'availability'):
        data[col] = data[col].str.upper()

    data['model_name'] = [_match_model(listing, catalog) for listing in data['model']]
    has_match = data['model_name'] != ''
    data['model'] = data['model'].mask(has_match, data['model_name'])

    # Removing listings that matched no known model ('' becomes NaN), along with
    # any row that has a missing value in another column.
    data = data.replace('', np.nan).dropna()

    # Splitting the date into its calendar components (kept as float columns).
    date_parts = pd.DataFrame(
        [_split_date(str(raw)) for raw in data['date']],
        columns=_DATE_COLUMNS,
        index=data.index,
        dtype=float,
    )
    for col in _DATE_COLUMNS:
        data[col] = date_parts[col]

    # Integer-encode brand and model, then add one indicator column per code.
    data['brand'] = _sorted_codes(data['brand'])
    data['model_name'] = _sorted_codes(data['model_name'])
    brand_flags = pd.get_dummies(data['brand'], prefix='b')
    model_flags = pd.get_dummies(data['model_name'], prefix='m')
    data = pd.concat([data, brand_flags, model_flags], axis=1)

    # Availability codes follow sorted label order, so they are identical on
    # every run instead of depending on set iteration order.
    avail_codes, avail_labels = pd.factorize(data['availability'], sort=True)
    print(dict(zip(avail_labels, range(len(avail_labels)))))
    data['availability'] = avail_codes

    return data
