# Package Imports and Data Reading

In [6]:
import os
import numpy as np
import pandas as pd

In [67]:
def read_csv_files_to_dataframes(relative_path: str) -> dict:
    """
    Read every CSV file in a directory into its own DataFrame.

    Args:
        relative_path (str): Directory containing the CSV files. The value is
            joined onto the current working directory, so a relative path is
            resolved against it while an absolute path is used unchanged.

    Returns:
        dict: Maps each file name (without the '.csv' extension) to the
            DataFrame read from that file. Keys are inserted in sorted
            file-name order; the dict is empty when the directory holds no
            CSV files.

    Raises:
        FileNotFoundError: If the specified path does not exist.
        NotADirectoryError: Propagated from os.listdir when the path exists
            but is not a directory.

    Example:
        dataframes = read_csv_files_to_dataframes('data_folder')
        df1 = dataframes['file1']  # Access the DataFrame for file1.csv
        df2 = dataframes['file2']  # Access the DataFrame for file2.csv
    """
    data_folder = os.path.join(os.getcwd(), relative_path)

    if not os.path.exists(data_folder):
        raise FileNotFoundError(f"The specified relative path '{relative_path}' does not exist.")

    # os.listdir yields entries in arbitrary order, so sort them to make the
    # resulting dict order reproducible across machines. Entries that merely
    # end in '.csv' but are not regular files (e.g. a sub-directory) are
    # skipped instead of being handed to pd.read_csv.
    csv_files = sorted(
        entry
        for entry in os.listdir(data_folder)
        if entry.endswith('.csv') and os.path.isfile(os.path.join(data_folder, entry))
    )

    return {
        os.path.splitext(entry)[0]: pd.read_csv(os.path.join(data_folder, entry))
        for entry in csv_files
    }

In [68]:
# Location of the raw CSV files. Set the RETAIL_DATA_DIR environment variable to
# run the notebook on another machine; the original author's local path is kept
# as the fallback so existing runs are unaffected.
# NOTE: despite the variable name, the fallback is an absolute path.
relative_path = os.environ.get(
    "RETAIL_DATA_DIR",
    "/Users/alexholzer/Desktop/retail_price_prediction/data",
)
dataframes = read_csv_files_to_dataframes(relative_path)

In [86]:
# Create a notebook-level variable for each loaded file, named after the file,
# holding an independent copy so the originals in `dataframes` stay untouched.
# globals() is used explicitly: writing through locals() only works because a
# notebook cell runs at module scope.
for file_name, frame in dataframes.items():
    globals()[file_name] = frame.copy()
    print(file_name)

file_out2
file_out


# Data Familiarization

In [105]:
# Show column names, non-null counts, dtypes and memory usage for file_out.
# The '+' in the reported memory figure presumably marks it as a lower bound
# (object columns such as Date not measured deeply) — confirm in the pandas docs.
file_out.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33356 entries, 0 to 33355
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  33356 non-null  int64  
 1   DocumentID  33356 non-null  int64  
 2   Date        33356 non-null  object 
 3   SKU         33356 non-null  int64  
 4   Price       33356 non-null  float64
 5   Discount    33356 non-null  float64
 6   Customer    33356 non-null  int64  
 7   Quantity    33356 non-null  float64
dtypes: float64(3), int64(4), object(1)
memory usage: 2.0+ MB


In [108]:
# Same summary for file_out2. memory_usage takes a boolean (or 'deep'); the
# string 'True' only worked because any non-empty string is truthy.
file_out2.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29103 entries, 0 to 29102
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  29103 non-null  int64  
 1   InvoiceID   29103 non-null  int64  
 2   Date        29103 non-null  object 
 3   ProductID   29103 non-null  int64  
 4   TotalSales  29103 non-null  float64
 5   Discount    29103 non-null  float64
 6   CustomerID  29103 non-null  int64  
 7   Quantity    29103 non-null  int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 1.8+ MB


In [111]:
# Map file_out's column names onto the schema used by file_out2 so the two
# frames can be concatenated column-for-column.
# NOTE(review): 'Price' is mapped to 'TotalSales' — confirm both columns hold
# the same quantity (unit price vs. line total) before relying on the result.
column_mapping = {
    'Unnamed: 0': 'Unnamed: 0',
    'DocumentID': 'InvoiceID',
    'Date': 'Date',
    'SKU': 'ProductID',
    'Price': 'TotalSales',
    'Discount': 'Discount',
    'Customer': 'CustomerID',
    'Quantity': 'Quantity'
}

# rename() ignores keys that are absent, so re-running this cell after the
# columns were already renamed is a no-op instead of raising KeyError.
file_out = file_out.rename(columns=column_mapping)

# Fail loudly if the frame does not end up with the expected target schema.
missing_columns = set(column_mapping.values()) - set(file_out.columns)
assert not missing_columns, f"file_out is missing expected columns: {sorted(missing_columns)}"

In [116]:
# Stack both sources and drop fully identical rows, keeping the last occurrence
# (i.e. the file_out2 copy when a row appears in both). ignore_index rebuilds a
# clean 0..n-1 index; without it the result carries duplicated labels from the
# two source frames, which makes .loc lookups ambiguous.
# NOTE(review): 'Unnamed: 0' takes part in the duplicate comparison, so rows
# that differ only in that column are not treated as duplicates — confirm this
# is intended.
combined_files = (
    pd.concat([file_out, file_out2], ignore_index=True)
    .drop_duplicates(keep='last', ignore_index=True)
)

pandas.core.frame.DataFrame

Unnamed: 0.1,Unnamed: 0,InvoiceID,Date,ProductID,TotalSales,Discount,CustomerID,Quantity
0,0,716,2019-09-23,1039,381.780000,67.372540,1,1.0
1,1,716,2019-09-23,853,593.220000,0.000340,1,1.0
2,2,716,2019-09-23,862,423.730000,-0.001190,1,1.0
3,3,716,2019-09-23,868,201.700000,35.588140,1,1.0
4,4,716,2019-09-23,2313,345.760000,61.019660,1,1.0
...,...,...,...,...,...,...,...,...
29098,29098,11092,2023-01-13,1644,6573.000000,1183.140000,269,1.0
29099,29099,11093,2023-01-13,352,5179.728814,932.351186,250,4.0
29100,29100,11094,2023-01-13,683,7741.423729,1393.456271,415,4.0
29101,29101,11095,2023-01-14,1830,3644.067797,655.932203,59,4.0
