# In [3]:  (Jupyter notebook cell marker)
import pandas  as pd
import datetime as dt
import openpyxl

# Input / output locations, relative to the current working directory.
cp_path = 'files/ceny_produktow.csv'
dims_path = 'files/dims.xlsx'
result = 'result.xlsx'

# Mapping from the Polish headers of the source CSV to the English column
# names used by every later step of this script.
cols_dict = {
    'Nazwa': 'province',
    'Rodzaje towarów': 'product_types',
    'Jednostka miary': 'currency',
    'ID grupy': 'group_id',
    'Rodzaje produktów': 'product_line',
    'Wartosc': 'value',
    'Data': 'date',
}

# The raw file is semicolon-delimited and uses '.' as the decimal mark.
# df1 keeps the untouched load; all cleaning happens on the renamed copy df.
df1 = pd.read_csv(cp_path, decimal='.', sep=';')
df = df1.copy().rename(columns=cols_dict)

# Exploratory check: distinct 'date' values in ascending order, used to
# spot entries that are implausibly old or lie in the future.
dates = sorted(df['date'].unique())
# print(dates)

# Exploratory check: distinct 'product_types' values, used to spot typos.
products = df['product_types'].unique()
# print(products)

# Repair known-bad entries in place: the rows are kept, only the offending
# cell values are overwritten.
# The comparisons are made against string literals; presumably 'date' still
# holds 'YYYY-M' strings at this point, so the ordering is lexicographic.
# NOTE(review): the lower cut-off ('1889-12') differs from the value written
# back ('1999-1') — confirm the threshold is intended.
too_old = df['date'] < '1889-12'
df.loc[too_old, 'date'] = '1999-1'

too_new = df['date'] > '2019-1'
df.loc[too_new, 'date'] = '2019-1'

# Replace one specific misspelled product name with its correct spelling.
misspelled = df['product_types'] == "jaja kóże śfierze - za 666szt."
df.loc[misspelled, 'product_types'] = "jaja kurze świeże - za 10szt."

# Exploratory check: distinct 'value' entries in ascending order, used to
# spot zeros and implausibly high prices.
values = sorted(df['value'].unique())
# print(values)

# Keep only rows whose value is neither exactly 0 nor exactly 3000 (the
# too-high value this script treats as invalid).
# .copy() makes the filtered frame independent of the original one, so the
# in-place assignments performed on df afterwards do not raise
# SettingWithCopyWarning.
df = df.loc[(df['value'] != 3000) & (df['value'] != 0)].copy()

# Convert EUR-denominated rows to PLN at a fixed exchange rate of 4.15,
# then relabel their currency as "zł". The mask is computed once, before
# either column is modified, so both assignments hit the same rows.
is_eur = df['currency'] == 'EUR'
df.loc[is_eur, 'value'] = df['value'] * 4.15
df.loc[is_eur, 'currency'] = "zł"

# Collapse product_types and product_line into a single 'product' column:
# start from product_types and overwrite with product_line wherever a
# product_line value is present.
has_line = df['product_line'].notna()
df['product'] = df['product_types']
df.loc[has_line, 'product'] = df['product_line']

# The two source columns are redundant once 'product' exists.
df = df.drop(columns=['product_line', 'product_types'])

# Remove duplicated records, keeping the first occurrence of each one.
# (keep=False would discard *every* copy of a duplicated row, so the
# underlying record would vanish from the result instead of being
# de-duplicated.)
df = df.drop_duplicates(keep='first')

# Normalisation: load the dimension tables from the dims workbook, one
# sheet per table.
d_province = pd.read_excel(dims_path, sheet_name="d_province")
d_product = pd.read_excel(dims_path, sheet_name="d_product")
d_product_group = pd.read_excel(dims_path, sheet_name="d_product_group")

# Attach province_id and product_id by name. Left joins keep every row of
# df; names with no match in a dimension table get a missing id.
df = (
    df
    .merge(d_province[['province', 'province_id']], how="left", on="province")
    .merge(d_product[['product', 'product_id']], how="left", on="product")
)

# Date formatting: parse 'date' once, derive the calendar parts from the
# parsed values, and rewrite 'date' itself as the first day of its month
# ('YYYY-MM-01'). The parsed Series is kept local, so no temporary column
# has to be added to df and dropped again.
date_dt = pd.to_datetime(df['date'])
df = df.assign(
    month=date_dt.dt.month,
    quarter=date_dt.dt.quarter,
    year=date_dt.dt.year,
    date=date_dt.dt.strftime('%Y-%m-01'),
)

# Write the cleaned table to the result workbook without the index column.
df.to_excel(result, index=False, engine='openpyxl')