In [103]:
import pandas as pd
import numpy as np
from datetime import datetime

In [104]:
# Read the raw Notino CSV export into a DataFrame
df = pd.read_csv(filepath_or_buffer="notino_raw.csv")

In [105]:
# Pull out whatever sits between the "-" and the "%" in the raw discount text
# (e.g. "20" from "-20%"); rows whose text does not match the pattern yield NaN.
df["Discount Percentage"] = (
    df["Discount"]
    .str.extract(r'-(.*?)%', expand=False)
    .str.strip()
)

In [106]:
# Keep the free text that follows the "%" sign as the discount description,
# trimmed of surrounding whitespace; rows without a "%" yield NaN.
df["Discount Description"] = (
    df["Discount"]
    .str.extract(r'%\s*(.*)', expand=False)
    .str.strip()
)

In [107]:
# The raw "Discount" text is no longer needed once its parts are extracted; remove it in place
df.drop(labels="Discount", axis="columns", inplace=True)

In [108]:
# Convert the discount percentage and the price to nullable integers.
#
# Semantics (unchanged from the previous version of this cell):
#   * a value that is missing in the input becomes 0
#   * a value that is present but cannot be parsed as a number becomes <NA>
#
# Why this is not written as `df[col].replace(np.NaN, 0, inplace=True)`:
#   * the `np.NaN` alias was removed in NumPy 2.0 (only `np.nan` remains)
#   * an in-place method called on `df[col]` is chained assignment; with
#     pandas Copy-on-Write it modifies a temporary and leaves `df` untouched
# Assigning the converted Series back to the column works on every version.
for column in ("Discount Percentage", "Price"):
    # Remember which rows were missing *before* parsing, so that parse
    # failures (coerced to NaN below) are not silently turned into 0 as well.
    was_missing = df[column].isna()
    parsed = pd.to_numeric(df[column], errors="coerce")
    df[column] = parsed.mask(was_missing, 0).astype("Int64")

In [109]:
# Absolute discount value: price times the discount percentage, divided by 100
df["Discount Amount"] = df["Price"].mul(df["Discount Percentage"]).div(100)

In [110]:
# Attach columns that hold the same value on every row. The timestamp is
# taken once, so all rows share one identical "Scraped at" value.
constant_columns = {
    "Country": "Czechia",
    "Currency": "czk",
    "Scraped at": datetime.now(),
}
for column_name, constant_value in constant_columns.items():
    df[column_name] = constant_value

In [111]:
# Display the transformed DataFrame as this cell's output
df

Unnamed: 0,Brand,Name,Description,Price,Product URL,Image,Discount Percentage,Discount Description,Discount Amount,Country,Currency,Scraped at
0,Opalescence,Whitening,bělicí zubní pasta s fluoridem,99,https://www.notino.cz/opalescence/original-for...,https://cdn.notinoimg.com/list_2k//opalescence...,0,,0.0,Czechia,czk,2024-05-23 17:54:03.358655
1,Opalescence,Whitening Sensitivity Relief,bělicí pasta pro citlivé zuby,96,https://www.notino.cz/opalescence/sensitivity-...,https://cdn.notinoimg.com/list_2k//opalescence...,0,,0.0,Czechia,czk,2024-05-23 17:54:03.358655
2,GC,Tooth Mousse,remineralizační ochranný krém pro citlivé zuby...,437,https://www.notino.cz/gc/tooth-mousse-tutti-fr...,https://cdn.notinoimg.com/list_2k//gc/28000119...,0,,0.0,Czechia,czk,2024-05-23 17:54:03.358655
3,MEDIBLANC,Whitening,zubní pasta s bělicím účinkem,193,https://www.notino.cz/mediblanc/whitening-zubn...,https://cdn.notinoimg.com/list_2k//mediblanc/3...,0,,0.0,Czechia,czk,2024-05-23 17:54:03.358655
4,Elmex,Junior 6-12 Years,zubní pasta pro děti,94,https://www.notino.cz/elmex/junior-6-12-years-...,https://cdn.notinoimg.com/list_2k//elmex/90030...,0,,0.0,Czechia,czk,2024-05-23 17:54:03.358655
...,...,...,...,...,...,...,...,...,...,...,...,...
549,Chicco,Always Smiling Green,sada zubní péče Strawberry(pro děti),0,https://www.notino.cz/chicco/always-smiling-gr...,https://cdn.notinoimg.com/list_2k//chicco/8058...,0,,0.0,Czechia,czk,2024-05-23 17:54:03.358655
550,Biorepair,Advanced Senitivity,zubní pasta pro citlivé zuby pro ochranu zubů ...,0,https://www.notino.cz/biorepair/advanced-senit...,https://cdn.notinoimg.com/list_2k//biorepair/8...,0,,0.0,Czechia,czk,2024-05-23 17:54:03.358655
551,Colgate,Whitening,bělicí zubní pasta 75 ml,0,https://www.notino.cz/colgate/whitening-belici...,https://cdn.notinoimg.com/list_2k//colgate/692...,0,,0.0,Czechia,czk,2024-05-23 17:54:03.358655
552,BlanX,White Shock Instant White,bělicí zubní pasta 75 ml,119,https://www.notino.cz/blanx/white-shock-belici...,https://cdn.notinoimg.com/list_2k//blanx/80173...,0,,0.0,Czechia,czk,2024-05-23 17:54:03.358655


In [112]:
# Write the transformed data to CSV, leaving the row index out of the file
df.to_csv(path_or_buf="notino_transformed.csv", index=False)