# Imports

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

# Request the website content and create bs object

In [2]:
r = requests.get("https://loczek.pl/")
soup = bs(r.content, 'lxml')
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="pl-PL">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta class="foundation-mq"/>
  <meta content="pwWy_Jxr32AJXBuB0EhiMKD_88xf0ilaDuLpjpCAYvU" name="google-site-verification"/>
  <link href="https://loczek.pl/xmlrpc.php" rel="pingback"/>
  <title>
   Loczek.pl - sklep z naturalnymi kosmetykami do pielęgnacji włosów.
  </title>
  <link data-minify="1" href="https://loczek.pl/wp-content/cache/min/1/f4a2e9b11eedbdbe32511e3ec0e44938.css" media="all" rel="stylesheet"/>
  <meta content="Sklep internetowy z kosmetykami i akcesoriami do włosów kręconych oraz zwolenników świadomej pielęgnacji. Zaawansowana wyszukiwarka umożliwia dobranie odpowiednich kosmetyków do Twojego typu włosów." name="description"/>
  <meta content="index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1" name="robots"/>
  <link href="ht

# Web scraping code

In [4]:
peh_categories = {"E":"Emolientowe", "P":"Proteinowe", "H":"Humektantowe"}
products_by_peh = {"E": [], "P": [], "H": []}

def get_product_details(prod_link, peh_code): 
    product_object = {}
    product_r =  requests.get(prod_link)
    product_soup = bs(product_r.content, 'lxml')

    inci_title = product_soup.find("h3", text="Skład produktu")
    if inci_title is not None:
        inci = []
        ingredients_ul = inci_title.parent.find("ul")
        if ingredients_ul is None:
            return
        
        for ingredient in ingredients_ul.find_all("li"):
            inci.append(ingredient.find("a").string)
        product_object["inci"] = inci
    else:
        return

    prod_title = product_soup.find('h1', {"class":"product_title entry-title"}).string
    prod_title_prep = prod_title.replace('\n', '').strip()
    product_object["title"]= prod_title_prep
    products_by_peh[peh_code].append(product_object)


def iterate_through_page_products(peh_soup, peh_code):
    products_list = peh_soup.find("ul", {"class":"products row wide"})
    print("Started iterating through products")
    for product in products_list.find_all("li"):
        product_link = product.find("a")["href"]
        get_product_details(product_link, peh_code)
    print("Finished iterating through products")

    
def iterate_through_pagination(peh_soup, peh_code):
    page_soup = peh_soup
    while True:
        iterate_through_page_products(page_soup, peh_code)
        
        next_page_a = page_soup.find("a", {"class":"next page-numbers"})
        
        if next_page_a is None:
            break
        
        next_link = next_page_a['href']
        page_r =  requests.get(next_link)
        page_soup = bs(page_r.content, 'lxml')
        
    
def handle_peh_links(peh_link, peh_code):
    peh_r =  requests.get(peh_link)
    peh_soup = bs(peh_r.content, 'lxml')
    pagination_list = peh_soup.find("ul", {"class":"page-numbers"})
    if pagination_list is not None:
        iterate_through_pagination(peh_soup, peh_code)
    else:
        iterate_through_page_products(peh_soup, peh_code)
    

for peh_code, peh_name in peh_categories.items():
    peh_link = soup.find("a", text=peh_name)['href']
    handle_peh_links(peh_link, peh_code)
    
print(products_by_peh["P"][0])
print(products_by_peh["E"][0])
print(products_by_peh["H"][0])

Started iterating through products
Finished iterating through products
Started iterating through products
Finished iterating through products
Started iterating through products
Finished iterating through products
Started iterating through products
Finished iterating through products
Started iterating through products
Finished iterating through products
Started iterating through products
Finished iterating through products
Started iterating through products
Finished iterating through products
Started iterating through products
Finished iterating through products
Started iterating through products
Finished iterating through products
Started iterating through products
Finished iterating through products
Started iterating through products
Finished iterating through products
Started iterating through products
Finished iterating through products
Started iterating through products
Finished iterating through products
Started iterating through products
Finished iterating through products
Starte

In [5]:
products_by_peh

{'E': [{'inci': ['Water (Aqua/Eau)',
    'Polyquaternium-37',
    'Cyclopentasiloxane',
    'Cetearyl Alcohol',
    'VP/Dimethylaminoethylmethacrylate Copolymer',
    'Polyquaternium-11',
    'Propylene Glycol Dicaprylate/Dicaprate',
    'Fragrance (Parfum)',
    'Phenoxyethanol',
    'Citronellol',
    'Amyl Cinnamal',
    'Helianthus Annuus (Sunflower) Seed Extract',
    'Tocopherol',
    'Linalool',
    'C12-15 Alkyl Benzoate',
    'Sorbitan Oleate',
    'Panthenol',
    'Citral',
    'Helianthus Annuus (Sunflower) Seed Oil',
    'Bacopa Monniera Extract',
    'Butylphenyl\xa0Methylpropional',
    'Simmondsia Chinensis (Jojoba) Seed Oil',
    'PPG-1 Trideceth-6',
    'PEG-4',
    'Limonene',
    'Hydroxycitronellal',
    'Disodium Edta',
    'VP/DMAPA Acrylates Copolymer',
    'PEG-4 Laurate',
    'Glycerin',
    'Geraniol',
    'Benzyl salicylate',
    'iodopropynyl butylcarbamate',
    'Chitosan',
    'Hexyl Cinnamal',
    'Polysorbate-60',
    'Alcohol Denat.',
    'Benzalkonium 

In [6]:
products_by_peh.keys()

dict_keys(['E', 'P', 'H'])

# Convert data to df pandas object

In [7]:
df_e = pd.DataFrame(products_by_peh['E'])
df_e['peh_class'] = list(products_by_peh.keys())[0]
df_e

Unnamed: 0,inci,title,peh_class
0,"[Water (Aqua/Eau), Polyquaternium-37, Cyclopen...",Tigi Catwalk Curls Rock Amplifier – krem styli...,E
1,"[Aqua, Cetearyl Alcohol, Myristyl Alcohol, Cet...",DR. SANTE COCONUT HAIR Z OLEJEM KOKOSOWYM – ma...,E
2,[Helianthus Annuus (Sunflower) Seed Oil],ETJA NIERAFINOWANY ZIMNOTŁOCZONY OLEJ SŁONECZN...,E
3,[Argania Spinosa (Argan) Kernel Oil],ETJA NIERAFINOWANY ZIMNOTŁOCZONY OLEJ ARGANOWY...,E
4,"[Aqua, Cetearyl Alcohol, Butyrospermum Parkii ...",ANWEN Emolientowa Róża – odżywka do włosów o w...,E
...,...,...,...
164,[Urtica Dioica (Nettle) Seed Oil],Dary Natury – zimnotłoczony olej z nasion pokr...,E
165,[Fragaria Ananassa Seed Oil],Your Natural Side – aromatyczny olej z nasion ...,E
166,[Camellia Oleifera Seed Oil],"Your Natural Side – kojący, nierafinowany olej...",E
167,[Simmondsia Chinensis (Jojoba) Seed Oil],"Your Natural Side – nierafinowany, złoty olej ...",E


In [8]:
df_p = pd.DataFrame(products_by_peh['P'])
df_p['peh_class'] = list(products_by_peh.keys())[1]
df_p

Unnamed: 0,inci,title,peh_class
0,"[Aqua, Cetearyl Alcohol, Myristyl Alcohol, Cet...",DABUR VATIKA TROPIKALNY KOKOS – maska do włosó...,P
1,"[Aqua, Cetearyl Alcohol, Behentrimonium Chlori...",Anwen Mała Odżywka Proteinowa Zielona Herbata-...,P
2,"[Aqua, Cetearyl Alcohol, Behentrimonium Chlori...",ANWEN Proteinowa Zielona Herbata – odżywka do ...,P
3,"[Aqua, Cetearyl Alcohol, Behentrimonium Chlori...",Anwen MAŁA Odżywka Proteinowa Magnolia – do wł...,P
4,"[Aqua, Cetearyl Alcohol, Behentrimonium Chlori...",ANWEN Proteinowa Magnolia – odżywka do włosów ...,P
5,"[Aqua, Cetearyl Alcohol, Behentrimonium Chlori...",ANWEN Proteinowa Orchidea – odżywka do włosów ...,P
6,"[Aqua, Cetearyl Alcohol, Behentrimonium Chlori...",Anwen MAŁA Odżywka Proteinowa Orchidea – odżyw...,P


In [9]:
df_h = pd.DataFrame(products_by_peh['H'])
df_h['peh_class'] = list(products_by_peh.keys())[2]
df_h

Unnamed: 0,inci,title,peh_class
0,"[Panthenol, Aqua]",Calaya D-pantenol w roztworze wodnym 75% – naw...,H
1,"[Propylene Glycol, Aqua, Rubus Idaeus (Raspber...",Calaya – płynny ekstrakt z maliny,H
2,"[Aqua, Urea, Propylene Glycol, Glycerin, Biosa...",Cerkogel 30% – keratolityczny żel mocznikowy d...,H
3,"[Aqua (Water), Glycerin, Emblica Officinalis (...","Eco Laboratorium – serum do włosów, mgiełka, o...",H
4,[Alcea Rosea Nigra Flower],"Dary Natury – kwiat czarnej malwy, na kojące p...",H
...,...,...,...
122,"[Aloe Barbadensis (Aloe Vera) Leaf Juice, Nelu...","Holika Holika – wielofunkcyjny, nawilżający że...",H
123,"[Aqua, Sodium Hyaluronate, Benzyl Alcohol, Deh...","Calaya – nawilżający żel hialuronowy, 2% (50 ml)",H
124,"[Aqua, Sapindus Mukurossi Peel Floral Water, H...",FITOKOSMETIK DROŻDŻE PIWNE AKTYWATOR WZROSTU –...,H
125,"[Linum Usitatissimum (Linseed) Seed Extract, G...",JESSICURL Rockin’ Ringlets Styling Potion No F...,H


In [10]:
frames = [df_e, df_p, df_h]

df_npk_final = pd.concat(frames)
df_npk_final

Unnamed: 0,inci,title,peh_class
0,"[Water (Aqua/Eau), Polyquaternium-37, Cyclopen...",Tigi Catwalk Curls Rock Amplifier – krem styli...,E
1,"[Aqua, Cetearyl Alcohol, Myristyl Alcohol, Cet...",DR. SANTE COCONUT HAIR Z OLEJEM KOKOSOWYM – ma...,E
2,[Helianthus Annuus (Sunflower) Seed Oil],ETJA NIERAFINOWANY ZIMNOTŁOCZONY OLEJ SŁONECZN...,E
3,[Argania Spinosa (Argan) Kernel Oil],ETJA NIERAFINOWANY ZIMNOTŁOCZONY OLEJ ARGANOWY...,E
4,"[Aqua, Cetearyl Alcohol, Butyrospermum Parkii ...",ANWEN Emolientowa Róża – odżywka do włosów o w...,E
...,...,...,...
122,"[Aloe Barbadensis (Aloe Vera) Leaf Juice, Nelu...","Holika Holika – wielofunkcyjny, nawilżający że...",H
123,"[Aqua, Sodium Hyaluronate, Benzyl Alcohol, Deh...","Calaya – nawilżający żel hialuronowy, 2% (50 ml)",H
124,"[Aqua, Sapindus Mukurossi Peel Floral Water, H...",FITOKOSMETIK DROŻDŻE PIWNE AKTYWATOR WZROSTU –...,H
125,"[Linum Usitatissimum (Linseed) Seed Extract, G...",JESSICURL Rockin’ Ringlets Styling Potion No F...,H


# Save and check dataset

In [11]:
path_to_output = r'C:\Users\Pszczółka\Projects\Data_Science_Projects\PEH_Classifier\Data_collection'
df_npk_final.to_csv(path_to_output + r'\Hair_cosmetics_by_PEH_loczek.csv')

In [12]:
df_npk_final_test = pd.read_csv('Hair_cosmetics_by_PEH_loczek.csv')
df_npk_final_test

Unnamed: 0.1,Unnamed: 0,inci,title,peh_class
0,0,"['Water (Aqua/Eau)', 'Polyquaternium-37', 'Cyc...",Tigi Catwalk Curls Rock Amplifier – krem styli...,E
1,1,"['Aqua', 'Cetearyl Alcohol', 'Myristyl Alcohol...",DR. SANTE COCONUT HAIR Z OLEJEM KOKOSOWYM – ma...,E
2,2,['Helianthus Annuus (Sunflower) Seed Oil'],ETJA NIERAFINOWANY ZIMNOTŁOCZONY OLEJ SŁONECZN...,E
3,3,['Argania Spinosa (Argan) Kernel Oil'],ETJA NIERAFINOWANY ZIMNOTŁOCZONY OLEJ ARGANOWY...,E
4,4,"['Aqua', 'Cetearyl Alcohol', 'Butyrospermum Pa...",ANWEN Emolientowa Róża – odżywka do włosów o w...,E
...,...,...,...,...
298,122,"['Aloe Barbadensis (Aloe Vera) Leaf Juice', 'N...","Holika Holika – wielofunkcyjny, nawilżający że...",H
299,123,"['Aqua', 'Sodium Hyaluronate', 'Benzyl Alcohol...","Calaya – nawilżający żel hialuronowy, 2% (50 ml)",H
300,124,"['Aqua', 'Sapindus Mukurossi Peel Floral Water...",FITOKOSMETIK DROŻDŻE PIWNE AKTYWATOR WZROSTU –...,H
301,125,"['Linum Usitatissimum (Linseed) Seed Extract',...",JESSICURL Rockin’ Ringlets Styling Potion No F...,H
