# Import Library

In [226]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [227]:
from os import listdir
from os.path import isfile, join

In [228]:
import re

# Testing

In [12]:
# Scratch cell under the "Testing" heading: reads one CSV straight from the dataset folder.
# NOTE(review): this file name does not appear in the directory listing printed further
# down, and the cell's execution count is out of sequence with its neighbours — confirm
# that this cell still runs on a fresh kernel.
df = pd.read_csv("../Dataset/2_Korea_Bakery_Price.csv")

In [13]:
# Bare expression: renders the frame held in `df` as this cell's output.
df

Unnamed: 0,Name,price
0,angbutter,4800
1,plain bread,3500
2,jam,1500
3,ice coffe,4000
4,croissant,3500
5,ice coffe latter,4500
6,tiramisu croissant,4800
7,cacao deep,4000
8,pain au chocolat,3500
9,almond croissant,4000


# Load Data

In [229]:
# Accumulator for the loaded datasets; one DataFrame is appended per file.
listDataPretrainedModel = list()

In [230]:
def addDataPretrainedModel(path):
    """Load one dataset file and append it to ``listDataPretrainedModel``.

    The reader is chosen from the file extension: spreadsheet extensions go
    to ``pd.read_excel`` and ``.csv`` goes to ``pd.read_csv``. A genuine read
    error (corrupt workbook, missing Excel engine, bad encoding) is therefore
    raised as-is instead of being hidden behind a second parse attempt with
    the wrong reader. Files with any other extension are probed: Excel is
    tried first, then CSV.

    Parameters
    ----------
    path : str or path-like
        Location of the file to load.

    Raises
    ------
    Exception
        Whatever the selected pandas reader raises when the file cannot be
        read.
    """
    lowered = str(path).lower()
    if lowered.endswith(('.xls', '.xlsx', '.xlsm', '.xlsb')):
        df = pd.read_excel(path)
    elif lowered.endswith('.csv'):
        df = pd.read_csv(path)
    else:
        # Unknown extension: probe Excel first, then fall back to CSV.
        try:
            df = pd.read_excel(path)
        except Exception:
            df = pd.read_csv(path)
    listDataPretrainedModel.append(df)

In [231]:
# os.listdir returns entries in arbitrary order; sort them so that positional
# lookups into the loaded datasets are reproducible across runs and machines.
onlyfiles = sorted(f for f in listdir("../Dataset") if isfile(join("../Dataset", f)))

In [232]:
# Bare expression: shows the list of regular files found in the dataset folder.
onlyfiles

['1 Coffe_Sales.csv',
 '10 Europe_Bike_Store_Sales.csv',
 '11 E-commerce_Business_Transaction.csv',
 '12 Liquor_Sales.csv',
 '2 Coffee_Bean_Sales.csv',
 '3 Balaji_Fast_Food_Sales.csv',
 '4 Pizza_Sales.csv',
 '5 Coffee_Shop_Sales.xlsx',
 '6 French_Bakery_Daily_Sales.csv',
 '7 Mexico_Toy_Sales.csv',
 '8 Online_Retail_Sales.csv',
 '9 Electronic_Sales.csv']

In [233]:
# Start from an empty list so that re-running this cell does not append the
# same datasets a second time.
listDataPretrainedModel.clear()
for file in onlyfiles:
    pathFile = join('../Dataset/', file)
    addDataPretrainedModel(path=pathFile)

In [234]:
# Bare expression: shows the dataset at position 1, i.e. the second file in load order.
listDataPretrainedModel[1]

Unnamed: 0,Date,Day,Month,Year,Customer_Age,Age_Group,Customer_Gender,Country,State,Product_Category,Sub_Category,Product,Order_Quantity,Unit_Cost,Unit_Price,Profit,Cost,Revenue
0,2013-11-26,26,November,2013,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
1,2015-11-26,26,November,2015,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
2,2014-03-23,23,March,2014,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45,120,1366,1035,2401
3,2016-03-23,23,March,2016,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45,120,1188,900,2088
4,2014-05-15,15,May,2014,47,Adults (35-64),F,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45,120,238,180,418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113031,2016-04-12,12,April,2016,41,Adults (35-64),M,United Kingdom,England,Clothing,Vests,"Classic Vest, S",3,24,64,112,72,184
113032,2014-04-02,2,April,2014,18,Youth (<25),M,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,655,528,1183
113033,2016-04-02,2,April,2016,18,Youth (<25),M,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,655,528,1183
113034,2014-03-04,4,March,2014,37,Adults (35-64),F,France,Seine (Paris),Clothing,Vests,"Classic Vest, L",24,24,64,684,576,1260


# Preprocessing

In [235]:
import copy

# Deep-copy the loaded frames so that the preprocessing functions below, which
# modify the frames they receive in place, leave listDataPretrainedModel untouched.
listTest = copy.deepcopy(listDataPretrainedModel)

## Date

In [236]:
def clean_column_names(df):
    """Normalise the column labels of ``df`` in place and return ``df``.

    Each label is converted to text, lower-cased, and stripped of every
    character that is not a letter or digit (whitespace, punctuation and
    underscores are removed), e.g. ``"Unit_Price"`` -> ``"unitprice"``.

    Non-string labels (for example integer column labels) are converted with
    ``str`` first instead of raising ``AttributeError``.

    Note that distinct labels can collapse to the same cleaned name
    (``"Order Date"`` and ``"order_date"`` both become ``"orderdate"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    def clean_name(name):
        # \W covers whitespace and punctuation; "_" counts as a word
        # character, so it has to be listed explicitly.
        return re.sub(r'[\W_]', '', str(name).lower())

    df.columns = [clean_name(col) for col in df.columns]
    return df

In [237]:
# Normalise the column labels of every frame in the working copy.
listTest = list(map(clean_column_names, listTest))

In [238]:
def _parse_date_series(values):
    """Parse one column into datetimes; return ``None`` if every attempt raises."""
    text = values.astype(str)
    # A column that mixes "-" and "/" separators is unified on "/" before parsing.
    if text.str.contains('/', regex=False).any() and text.str.contains('-', regex=False).any():
        text = text.str.replace('-', '/', regex=False)

    # Tried in order: pandas' default parsing, an explicit day/month/year
    # format, and finally lenient parsing that turns bad entries into NaT.
    attempts = ({}, {'format': "%d/%m/%Y"}, {'errors': 'coerce'})
    for options in attempts:
        try:
            return pd.to_datetime(text, **options)
        except Exception:
            continue
    return None


def standardize_date_columns(df):
    """Collapse every date-like column of ``df`` into a single ``date`` column.

    A column is treated as date-like when its (string) label contains the
    substring ``'date'``. Each such column is parsed to datetimes, all of
    them are dropped from ``df``, and a new ``date`` column is appended that
    holds, per row, the first non-null parsed value taken across the date
    columns in their original order.

    ``df`` is modified in place, but only after parsing has succeeded, so a
    failure leaves the frame as it was.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardise.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        If no column label contains ``'date'``, or if none of the date
        columns could be parsed.
    """
    # Non-string labels cannot contain the substring and are skipped rather
    # than raising TypeError on the ``in`` test.
    date_columns = [col for col in df.columns if isinstance(col, str) and 'date' in col]

    if not date_columns:
        raise ValueError("Dataset tidak memiliki kolom yang mengandung kata 'date'.")

    standardized_dates = []
    for col in date_columns:
        parsed = _parse_date_series(df[col])
        if parsed is not None:
            standardized_dates.append(parsed)

    if not standardized_dates:
        # Fail with a clear message instead of letting pd.concat complain
        # about an empty list after the columns have already been dropped.
        raise ValueError(
            "None of the date columns could be parsed: " + ", ".join(date_columns)
        )

    df.drop(columns=date_columns, inplace=True)

    # Side by side, back-filling along the row pulls the first non-null
    # value of each row into the left-most column.
    df['date'] = pd.concat(standardized_dates, axis=1).bfill(axis=1).iloc[:, 0]
    return df



In [239]:
# Reduce the date-like columns of every frame to a single `date` column.
listTest = list(map(standardize_date_columns, listTest))

## Product

In [255]:
# Bare expression: shows the frame at position 4 of listTest after the steps above.
listTest[4]

Unnamed: 0,unnamed0,orderid,customerid,productid,quantity,customername,email,country,coffeetypex,roasttypex,sizex,sales,coffeetypey,roasttypey,sizey,unitprice,profit,date
0,0,QEV-37451-860,17670-51384-MA,R-M-1,2,,,,,,,,Rob,M,1.0,9.950,0.59700,2019-09-05
1,1,QEV-37451-860,17670-51384-MA,E-M-0.5,5,,,,,,,,Exc,M,0.5,8.250,0.90750,2019-09-05
2,2,FAA-43335-268,21125-22134-PX,A-L-1,1,,,,,,,,Ara,L,1.0,12.950,1.16550,2021-06-17
3,3,KAC-83089-793,23806-46781-OU,E-M-1,2,,,,,,,,Exc,M,1.0,13.750,1.51250,2021-07-15
4,4,KAC-83089-793,23806-46781-OU,R-L-2.5,2,,,,,,,,Rob,L,2.5,27.485,1.64910,2021-07-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,RLM-96511-467,43014-53743-XK,R-L-2.5,1,,,,,,,,Rob,L,2.5,27.485,1.64910,2020-01-06
996,996,AEZ-13242-456,62494-09113-RP,R-M-0.5,5,,,,,,,,Rob,M,0.5,5.970,0.35820,2022-05-04
997,997,UME-75640-698,62494-09113-RP,A-M-0.5,4,,,,,,,,Ara,M,0.5,6.750,0.60750,2019-10-11
998,998,GJC-66474-557,64965-78386-MY,A-D-1,1,,,,,,,,Ara,D,1.0,9.950,0.89550,2021-08-03
