# Importing libraries

In [6]:
import pandas as pd
import time
from datetime import datetime
import ydata_profiling
from pandas_profiling import ProfileReport
import numpy as np
import re
import requests

# Reading CSV

In [7]:
df = pd.read_csv('scraped_mediamarkt.csv')

# DF Report

In [8]:
profile = ProfileReport(df, title='Data Profiling Report', explorative=True)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



# Pandas

In [9]:
#overview
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363 entries, 0 to 362
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   page            363 non-null    int64  
 1   brand           363 non-null    object 
 2   model           363 non-null    object 
 3   category        363 non-null    object 
 4   size            353 non-null    float64
 5   storage         355 non-null    object 
 6   color           363 non-null    object 
 7   price           363 non-null    object 
 8   source          363 non-null    object 
 9   condition       363 non-null    object 
 10  date            363 non-null    object 
 11  article_number  333 non-null    float64
 12  n_of_reviews    333 non-null    object 
 13  rating          224 non-null    float64
 14  delivery_time   333 non-null    object 
dtypes: float64(3), int64(1), object(11)
memory usage: 42.7+ KB


Unnamed: 0,page,size,article_number,rating
count,363.0,353.0,333.0,224.0
mean,8.066116,6.448246,2176778.0,4.475446
std,4.369158,0.428432,51275.19,0.73208
min,1.0,4.7,1971824.0,1.0
25%,4.0,6.1,2151043.0,4.4
50%,8.0,6.6,2198411.0,4.7
75%,12.0,6.7,2216489.0,5.0
max,16.0,7.6,2242341.0,5.0


### Looking for duplicates

In [10]:
df.duplicated().sum()  # Find number of duplicates

0

### Missing Values

In [11]:
df.isnull().sum() 

page                0
brand               0
model               0
category            0
size               10
storage             8
color               0
price               0
source              0
condition           0
date                0
article_number     30
n_of_reviews       30
rating            139
delivery_time      30
dtype: int64

### Unique values

In [12]:
df.nunique()

page               16
brand              17
model             111
category            3
size               33
storage             8
color             124
price             208
source            363
condition           1
date                1
article_number    333
n_of_reviews       56
rating             20
delivery_time      10
dtype: int64

# Manipulating Target Variabeles

Target variables are the columns that are important for the end structure

### Brand

In [13]:
df['brand'].unique()

array(['APPLE', 'XIAOMI', 'SAMSUNG', 'NOTHING', 'MOTOROLA', 'GOOGLE',
       'DORO', 'FAIRPHONE', 'INOI', 'EMPORIA', 'ONE', 'NOKIA', 'RUGGEAR',
       'OPPO', 'CROSSCALL', 'WIKO', 'PEAQ'], dtype=object)

In [14]:
df['brand'] = df['brand'].str.lower()

In [15]:
df['brand'].isnull().sum() 

0

In [16]:
valid_brands = ['apple', 'xiaomi', 'samsung', 'nothing', 'motorola', 'fairphone', 
                'google', 'doro', 'inoi', 'emporia', 'one', 'nokia', 'ruggear', 
                'oppo', 'crosscall', 'wiko', 'peaq', 'huawei', 'lg', 'sony', 'htc', 
                'oneplus', 'zte', 'alcatel', 'asus', 'blackberry', 'realme', 'vivo', 
                'tecno', 'lenovo', 'meizu', 'honor', 'ulefone', 'cat']

In [17]:
def validate_brand(brand):
    if brand not in valid_brands:
        print(f"Unrecognized brand, please verify: {brand}")

# Apply the function to the 'brand' column
df['brand'].apply(validate_brand)

0      None
1      None
2      None
3      None
4      None
       ... 
358    None
359    None
360    None
361    None
362    None
Name: brand, Length: 363, dtype: object

### Model

In [18]:
df['model'].unique()

array(['iPhone 15 ', 'iPhone 15 Pro Max ', 'iPhone 14 ', 'iPhone 15 Pro ',
       'iPhone 11 (2020) ', 'Redmi A2 ', 'iPhone 13 ', 'Redmi 13C ',
       'Galaxy A15 4G ', 'iPhone SE ', 'Galaxy A55 5G ', 'Galaxy S23+ ',
       'phone (2a) ', 'Galaxy S23 ', 'Galaxy Z Flip5 ',
       'Redmi Note 13 5G ', 'iPhone SE (2022) ', 'Galaxy A54 5G ',
       'Redmi Note 13 Pro+ 5G ', 'Redmi 9A ', 'Galaxy A34 5G ',
       'iPhone 14 Plus ', 'phone (2) ', 'Galaxy S24+ ',
       'Galaxy S24 Ultra ', 'Galaxy S23 Ultra ', 'Galaxy A25 5G ',
       'Redmi Note 13 4G ', 'iPhone 15 Plus ', 'iPhone 12 ', 'Moto G13 ',
       'Moto G54 5G ', 'Pixel 8 Pro ', 'XCover6 Pro Enterprise Edition ',
       'Galaxy A35 5G ',
       '14 + Electric Scooter 4 Lite Swiss Edition Bundle ', '6820 ',
       '5 5G ', 'Galaxy XCover 5 Enterprise Edition ', 'Galaxy S24 ',
       'Note 13s ', 'Redmi 12 ', 'SUPEReasy ', 'Galaxy Z Fold5 ',
       'PLUS Nord CE 3 Lite 5G ', 'TOUCHsmart.2 ', '13T ',
       'Galaxy S23 FE ', 'Moto E13 

In [19]:
df['model'] = df['model'].str.lower()

In [20]:
def strip(model):
    return model.rstrip()

df['model'] = df['model'].apply(strip)

In [21]:
df['model'].isnull().sum() 

0

### Category

In [22]:
# Only smartphones are the interest of the scrapping
df['category'].unique()

array(['Smartphone', 'Klapphandy', 'Mobiltelefon'], dtype=object)

In [23]:
# Filtering unecessary categories

In [24]:
df = df[df['category'] == 'Smartphone']

In [25]:
df['category'] = df['category'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'] = df['category'].str.lower()


In [26]:
df['category'].isnull().sum() 

0

### Size

In [27]:
df['size'].unique()

array([6.1  , 6.7  , 6.52 , 6.74 , 6.5  , 4.7  , 6.6  , 6.67 , 6.4  ,
       6.53 , 6.8  , 6.36 , 6.46 , 5.3  , 6.2  , 6.95 , 6.79 ,   nan,
       7.6  , 6.72 , 6.71 , 6.9  , 6.56 , 6.73 , 6.497, 4.95 , 6.517,
       6.3  , 5.45 , 6.43 , 6.55 , 6.58 , 6.08 , 6.39 ])

In [28]:
df_size_nan = df[np.isnan(df['size'])]
print(len(df_size_nan))
df_size_nan

4


Unnamed: 0,page,brand,model,category,size,storage,color,price,source,condition,date,article_number,n_of_reviews,rating,delivery_time
129,6,emporia,supereasy,smartphone,,,EMPORIA SUPEReasy - Smartphone (Schwarz,209.7,https://www.mediamarkt.ch/de/product/_emporia-...,new,2024-04-07,2127635.0,(0),,"Bestellbar, Auslieferung in 1-2 Werktagen"
138,6,emporia,touchsmart.2,smartphone,,,EMPORIA TOUCHsmart.2 - Smartphone (Schwarz/Silber,118.95,https://www.mediamarkt.ch/de/product/_emporia-...,new,2024-04-07,2110132.0,(7),2.6,Auslieferung in 1-2 Werktagen
188,8,ruggear,rg360,smartphone,,8 GB,Schwarz,122.95,https://www.mediamarkt.ch/de/product/_ruggear-...,new,2024-04-07,2053578.0,(0),,Auslieferung in bis zu 7 Werktagen
251,11,wiko,y52,smartphone,,16 GB,Deep Blue,49.7,https://www.mediamarkt.ch/de/product/_wiko-y52...,new,2024-04-07,2127285.0,(2),1.5,Der Artikel ist nicht mehr verfügbar. Prüfen S...


In [29]:
size_verification = df_size_nan['source']
for index, url in size_verification.items():
    print(f"{index}: {url}")

129: https://www.mediamarkt.ch/de/product/_emporia-supereasy-2127635.html
138: https://www.mediamarkt.ch/de/product/_emporia-touchsmart-2-2110132.html
188: https://www.mediamarkt.ch/de/product/_ruggear-rg360-2053578.html
251: https://www.mediamarkt.ch/de/product/_wiko-y52-2127285.html


In [30]:
# Adding manually missing data, through phone link

In [31]:
size_manually = {
    131: 5.00,
    142: 3.25,
    189: 3.00,
    254: 5.00
}

# Update the 'size' column in the DataFrame
for index, size in size_manually.items():
    df.at[index, 'size'] = size

In [32]:
df['size'].isnull().sum() 

4

### Storage

In [33]:
df['storage'].unique()

array(['128 GB', '256 GB', '64 GB', '32 GB', '512 GB', '1 TB', nan,
       '8 GB', '16 GB'], dtype=object)

In [34]:
df_space_nan = df[pd.isnull(df['storage'])]
storage_verification = df_space_nan['source']
for index, url in storage_verification.items():
    print(f"{index}: {url}")

129: https://www.mediamarkt.ch/de/product/_emporia-supereasy-2127635.html
138: https://www.mediamarkt.ch/de/product/_emporia-touchsmart-2-2110132.html


In [35]:
storage_manually = {
    131: '32 GB',
    142: '8 GB'
}

# Update the 'size' column in the DataFrame
for index, storage in storage_manually.items():
    df.at[index, 'storage'] = storage

### Color

In [36]:
df['color'].unique()

array(['Black', 'Black Titanium', 'Midnight', 'Natural Titanium', 'Blue',
       'Purple', 'Schwarz', 'Starlight', 'Midnight Black', 'Yellow',
       'White Titanium', 'Blue Titanium', 'Blue Black', 'Pink',
       'Awesome Lilac', 'Phantom Black', 'Weiss', 'Awesome Navy',
       'Graphite', 'Graphite Black', 'Awesome Lime', 'Green',
       'Granite Grey', 'Awesome Iceblue', 'Cream', 'Awesome Graphite',
       'Arctic White', 'Grau', 'Onyx Black', 'Titanium Black',
       'Awesome Lemon', 'Clover Green', 'Hellgrün', 'Matte Charcoal',
       'Mint Green', 'Midnight Blue', 'Obsidian', 'Titanium Violet',
       'Awesome Violet', 'Jade Green', 'Transparent Edition',
       '64 GB, (PRODUCT)RED', 'Cobalt Violet', 'Titanium Yellow',
       'Titanium Grey', 'Marble Grey', 'Lavender',
       'EMPORIA SUPEReasy - Smartphone (Schwarz', 'Awesome White',
       'Icy Blue', 'Chromatic Gray', 'Navy Blue',
       'EMPORIA TOUCHsmart.2 - Smartphone (Schwarz/Silber',
       'Awesome Silver', 'Cosmic Bla

In [37]:
colors = ['black', 'blue', 'green', 'red', 'yellow', 'white', 'gray', 'purple', 'pink', 'orange', 'brown', 'silver', 'gold', 'titanium', 'platinum', 'schwarz', 'weiss']

def extract_color(value):
    # Check if the string contains numbers, 'GB', or specific special characters
    if re.search(r'\d|GB|[()/]', value):
        # Convert the value to lowercase to make the search case-insensitive
        value_lower = value.lower()

        # Search for each color in the string
        for color in colors:
            if re.search(r'\b' + color + r'\b', value_lower):
                return color.capitalize()  # Return the color with the first letter capitalized

        # Return 'Unknown' or any other placeholder if no known color is found
        return 'Unknown'
    
    # If the string doesn't contain the specified patterns, return it as is
    return value

In [38]:
df['color'] = df['color'].apply(extract_color)
df['color'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['color'] = df['color'].apply(extract_color)


array(['Black', 'Black Titanium', 'Midnight', 'Natural Titanium', 'Blue',
       'Purple', 'Schwarz', 'Starlight', 'Midnight Black', 'Yellow',
       'White Titanium', 'Blue Titanium', 'Blue Black', 'Pink',
       'Awesome Lilac', 'Phantom Black', 'Weiss', 'Awesome Navy',
       'Graphite', 'Graphite Black', 'Awesome Lime', 'Green',
       'Granite Grey', 'Awesome Iceblue', 'Cream', 'Awesome Graphite',
       'Arctic White', 'Grau', 'Onyx Black', 'Titanium Black',
       'Awesome Lemon', 'Clover Green', 'Hellgrün', 'Matte Charcoal',
       'Mint Green', 'Midnight Blue', 'Obsidian', 'Titanium Violet',
       'Awesome Violet', 'Jade Green', 'Transparent Edition', 'Red',
       'Cobalt Violet', 'Titanium Yellow', 'Titanium Grey', 'Marble Grey',
       'Lavender', 'Awesome White', 'Icy Blue', 'Chromatic Gray',
       'Navy Blue', 'Awesome Silver', 'Cosmic Black', 'Ocean Teal',
       'Aurora Purple', 'Amber Yellow', 'Dark Pearl', 'Mint',
       'Meteor Grey', 'Alpine Blue', 'Hellblau', 'De

In [39]:
df_color_nan = df[pd.isnull(df['color'])]

### Price

In [40]:
df['price'].unique()

array(['716.', '1129.', '619.', '1040.', '1125.', '728.', '349.', '816.',
       '84.95', '529.', '118.95', '711.', '399.', '1031.', '1047.',
       '179.95', '939.', '1137.', '430.95', '829.', '169.95', '719.',
       '479.', '729.', '329.95', '598.', '379.95', '844.', '254.95',
       '383.95', '299.', '413.95', '932.', '948.', '1145.', '1361.',
       '429.', '79.95', '629.', '1375.', '1329.', '694.', '1118.',
       '320.95', '819.', '749.', '549.', '920.', '1232.', '915.', '1399.',
       '1299.-1199.', '899.', '133.95', '259.95', '1236.', '479.95',
       '152.95', '88.95', '129.95', '743.', '381.95', '172.95', '921.',
       '419.95', '987.', '359.', '439.95', '1099.', '595.', '1546.',
       '206.', '779.', '869.', '1244.', '661.', '708.', '1149.', '1599.',
       '1169.', '949.', '1556.', '158.95', '144.95', '1563.', '929.',
       '209.70', '419.', '1778.', '222.95', '313.95', '369.70', '551.',
       '620.', '1026.', '99.95', '739.', '296.95', '356.95', '349.95',
       '399

In [41]:
def clean_price(value):
    # Find all numeric sequences
    matches = re.findall(r'\d+\.?\d*', value)
    if matches:
        # Return the last match
        return matches[-1]
    return value 

df['price'] = df['price'].apply(clean_price)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price'] = df['price'].apply(clean_price)


In [42]:
df['price'] = df['price'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price'] = df['price'].astype(float)


### Source

In [43]:
df_source_nan = df[pd.isnull(df['source'])]


In [46]:
"""
def check_url_status(url):
    try:
        response = requests.head(url, timeout=2)  # Using HEAD instead of GET to speed up the process
        if response.status_code == 200:
            return 'Working'
        else:
            return f'Broken ({response.status_code})'
    except requests.RequestException as e:
        return f'Error ({e})'

# Apply the function to check each URL
df['status'] = df['source'].apply(check_url_status)
"""

"\ndef check_url_status(url):\n    try:\n        response = requests.head(url, timeout=2)  # Using HEAD instead of GET to speed up the process\n        if response.status_code == 200:\n            return 'Working'\n        else:\n            return f'Broken ({response.status_code})'\n    except requests.RequestException as e:\n        return f'Error ({e})'\n\n# Apply the function to check each URL\ndf['status'] = df['source'].apply(check_url_status)\n"

### Condition

In [47]:
# no necessary manipulation

### Date

In [48]:
df['date'] = pd.to_datetime(df['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['date'])


### n_of_reviews

In [49]:
df['n_of_reviews'].unique()

array(['(25)', '(13)', '(12)', '(14)', '(32)', '(8)', '(67)', '(4)',
       '(10)', '(34)', '(3)', '(0)', '(50)', '(1)', '(11)', '(24)', '(9)',
       '(17)', '(7)', '(155)', '(271)', '(57)', '(310)', '(28)', '(15)',
       '(6)', '(2)', '(5)', '(273)', '(56)', nan, '(18)', '(184)',
       '(787)', '(607)', '(19)', '(21)', '(786)', '(613)', '(309)',
       '(154)', '(274)', '(270)', '(185)', '(22)', '(272)', '(308)',
       '(49)', '(210)', '(605)', '(55)', '(275)', '(16)', '(26)', '(148)',
       '(27)', '(44)'], dtype=object)

In [50]:
df_n_of_reviews_nan = df[pd.isnull(df['n_of_reviews'])]
len(df_n_of_reviews_nan)

30

In [51]:
n_of_reviews_nan_verification = df_n_of_reviews_nan['source']
for index, url in n_of_reviews_nan_verification.items():
    print(f"{index}: {url}")

56: https://www.mediamarkt.ch/de/product/_samsung-galaxy-a34-5g-2173976.html
70: https://www.mediamarkt.ch/de/product/_samsung-galaxy-a54-5g-2173981.html
164: https://www.mediamarkt.ch/de/product/_samsung-galaxy-s23-2167340.html
165: https://www.mediamarkt.ch/de/product/_xiaomi-redmi-12c-2178005.html
169: https://www.mediamarkt.ch/de/product/_samsung-galaxy-s23-2167378.html
171: https://www.mediamarkt.ch/de/product/_samsung-galaxy-z-fold5-2193040.html
175: https://www.mediamarkt.ch/de/product/_samsung-galaxy-z-fold5-2193041.html
177: https://www.mediamarkt.ch/de/product/_samsung-galaxy-a54-5g-2173982.html
203: https://www.mediamarkt.ch/de/product/_motorola-razr-40-ultra-2189432.html
224: https://www.mediamarkt.ch/de/product/_xiaomi-redmi-note-12-4g-2195956.html
228: https://www.mediamarkt.ch/de/product/_samsung-galaxy-s23-2167349.html
231: https://www.mediamarkt.ch/de/product/_samsung-galaxy-s23-ultra-2167388.html
232: https://www.mediamarkt.ch/de/product/_motorola-edge-20-pro-2095383.

In [52]:
reviews_manually = {54: '(5)'}
for index, reviews in reviews_manually.items():
    df.at[index, 'n_of_reviews'] = reviews

In [53]:
df['n_of_reviews'] = df['n_of_reviews'].str.replace('(', '', regex=False).str.replace(')', '', regex=False)
df['n_of_reviews'] = df['n_of_reviews'].fillna('0').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['n_of_reviews'] = df['n_of_reviews'].str.replace('(', '', regex=False).str.replace(')', '', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['n_of_reviews'] = df['n_of_reviews'].fillna('0').astype(int)


### Rating

In [54]:
df['rating'].unique()

array([4.8, 4.9, 4.6, 5. , 4.7, 4.1, nan, 4.5, 4.3, 4.4, 4. , 4.2, 3.7,
       3.3, 3. , 2.6, 1. , 1.5, 2. , 3.5, 3.4])

In [55]:
df_rating_nan = df[pd.isnull(df['rating'])]
len(df_rating_nan)
df_rating_nan

Unnamed: 0,page,brand,model,category,size,storage,color,price,source,condition,date,article_number,n_of_reviews,rating,delivery_time
12,1,apple,iphone 15,smartphone,6.10,128 GB,Yellow,711.00,https://www.mediamarkt.ch/de/product/_apple-ip...,new,2024-04-07,2203373.0,0,,Das Produkt ist online ausverkauft.
17,1,samsung,galaxy a15 4g,smartphone,6.50,128 GB,Blue,179.95,https://www.mediamarkt.ch/de/product/_samsung-...,new,2024-04-07,2221377.0,0,,Auslieferung in 1-4 Werktagen
23,1,apple,iphone 15,smartphone,6.10,256 GB,Yellow,816.00,https://www.mediamarkt.ch/de/product/_apple-ip...,new,2024-04-07,2203379.0,0,,Auslieferung in 1-2 Werktagen
24,2,samsung,galaxy a15 4g,smartphone,6.50,128 GB,Blue Black,169.95,https://www.mediamarkt.ch/de/product/_samsung-...,new,2024-04-07,2221378.0,0,,Auslieferung in 1-4 Werktagen
28,2,nothing,phone (2a),smartphone,6.70,128 GB,Weiss,329.95,https://www.mediamarkt.ch/de/product/_nothing-...,new,2024-04-07,2223932.0,0,,Wir bestellen für Sie den Artikel direkt beim ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,15,xiaomi,poco m5s,smartphone,6.43,64 GB,Weiss,206.95,https://www.mediamarkt.ch/de/product/_xiaomi-p...,new,2024-04-07,2152695.0,0,,Auslieferung in bis zu 7 Werktagen
354,15,fairphone,4 5g,smartphone,6.30,256 GB,Grau,515.00,https://www.mediamarkt.ch/de/product/_fairphon...,new,2024-04-07,2093553.0,0,,Auslieferung in 1-2 Werktagen
356,15,apple,iphone 15 plus,smartphone,6.70,512 GB,Pink,1173.00,https://www.mediamarkt.ch/de/product/_apple-ip...,new,2024-04-07,2203399.0,0,,Wir bestellen für Sie den Artikel direkt beim ...
357,15,xiaomi,redmi note 12 pro 5g,smartphone,6.67,128 GB,Midnight Black,239.70,https://www.mediamarkt.ch/de/product/_xiaomi-r...,new,2024-04-07,,0,,


In [56]:
rating_nan_verification = df_rating_nan['source']
for index, url in rating_nan_verification.items():
    print(f"{index}: {url}")

12: https://www.mediamarkt.ch/de/product/_apple-iphone-15-2203373.html
17: https://www.mediamarkt.ch/de/product/_samsung-galaxy-a15-4g-2221377.html
23: https://www.mediamarkt.ch/de/product/_apple-iphone-15-2203379.html
24: https://www.mediamarkt.ch/de/product/_samsung-galaxy-a15-4g-2221378.html
28: https://www.mediamarkt.ch/de/product/_nothing-phone-2a-2223932.html
31: https://www.mediamarkt.ch/de/product/_samsung-galaxy-a55-5g-2237090.html
37: https://www.mediamarkt.ch/de/product/_nothing-phone-2a-2223930.html
42: https://www.mediamarkt.ch/de/product/_apple-iphone-13-2118876.html
48: https://www.mediamarkt.ch/de/product/_samsung-galaxy-a55-5g-2237050.html
52: https://www.mediamarkt.ch/de/product/_apple-iphone-15-pro-max-2203432.html
54: https://www.mediamarkt.ch/de/product/_samsung-galaxy-a55-5g-2237093.html
56: https://www.mediamarkt.ch/de/product/_samsung-galaxy-a34-5g-2173976.html
63: https://www.mediamarkt.ch/de/product/_apple-iphone-15-pro-2203414.html
66: https://www.mediamarkt.

In [57]:
df['rating'] = df['rating'].fillna('0').astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rating'] = df['rating'].fillna('0').astype(float)


### Delivery Time

In [58]:
df['delivery_time'].unique()

array(['Auslieferung in 1-2 Werktagen',
       'Bestellbar, Auslieferung in 1-2 Werktagen',
       'Auslieferung in 1-4 Werktagen',
       'Auslieferung in bis zu 6 Werktagen',
       'Das Produkt ist online ausverkauft.',
       'Wir bestellen für Sie den Artikel direkt beim Lieferanten. Sobald der Artikel verfügbar ist, werden Sie per E-Mail informiert.',
       'Auslieferung in bis zu 7 Werktagen',
       'Aktuell nicht lieferbar und kein Liefertermin vorhanden. Alle Angaben ohne Gewähr.',
       nan, 'Auslieferung in bis zu 10 Werktagen',
       'Der Artikel ist nicht mehr verfügbar. Prüfen Sie die Marktverfügbarkeit'],
      dtype=object)

In [70]:
def extract_days(text):
    
    if pd.isnull(text) or "nicht mehr verfügbar" in text or "nicht lieferbar" in text or "ausverkauft" in text or "kein Liefertermin" in text:
        return None
    else:
        # Find all numbers in the string
        numbers = [int(num) for num in re.findall(r'\d+', text)]
        if numbers:
            return max(numbers)  # Return the highest number, assuming it's the upper limit of days
        else:
            return None  # Return "Undetermined" if no numbers are found

# Apply the function to the delivery_time column
df['delivery_time'] = df['delivery_time'].apply(extract_days)

TypeError: argument of type 'float' is not iterable

In [71]:
def extract_days(text):
    # Convert text to string to handle cases where text is not a string
    text = str(text)
    
    if pd.isnull(text) or "nicht mehr verfügbar" in text or "nicht lieferbar" in text or "ausverkauft" in text or "kein Liefertermin" in text:
        return None
    else:
        # Find all numbers in the string
        numbers = [int(num) for num in re.findall(r'\d+', text)]
        if numbers:
            return max(numbers)  # Return the highest number, assuming it's the upper limit of days
        else:
            return None  # Return None if no numbers are found

# Assuming you have a DataFrame 'df' with a column 'delivery_time'
df['delivery_time'] = df['delivery_time'].apply(extract_days)


<bound method Series.info of 0      2.0
1      2.0
2      2.0
3      2.0
4      2.0
      ... 
358    NaN
359    2.0
360    4.0
361    NaN
362    NaN
Name: delivery_time, Length: 357, dtype: float64>

In [69]:
df['delivery_time'].astype(float)
df['delivery_time'].info

<bound method Series.info of 0      2.0
1      2.0
2      2.0
3      2.0
4      2.0
      ... 
358    NaN
359    2.0
360    4.0
361    NaN
362    NaN
Name: delivery_time, Length: 357, dtype: float64>

In [None]:
df

In [None]:
df = df.drop(columns=['page', 'article_number', 'condition' ])

In [61]:
new_order = ['brand', 'model', 'category', 'size', 'storage', 'color', 'rating', 'n_of_reviews', 'delivery_time', 'price', 'source', 'date']
df = df[new_order]

In [62]:
file_name = "cleaned_mediamarkt.csv"
# Save the DataFrame to CSV in the same directory as the script
df.to_csv(file_name, index=False)