### Cleaning Data From Excel

In [1]:
import pandas as pd
import math
import os
import json

In [3]:
# Load the "laptops" worksheet of the dirty workbook into a DataFrame.
raw_workbook_path = "./products/dirty/laptops.xlsx"
df = pd.read_excel(raw_workbook_path, sheet_name="laptops")

## Start Cleaning

### Get Description

In [4]:
# Print column names, non-null counts and dtypes to see which columns need filling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1920 entries, 0 to 1919
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   brand           1440 non-null   object 
 1   description     1920 non-null   object 
 2   seller          249 non-null    object 
 3   price_amount    1920 non-null   float64
 4   price_discount  26 non-null     object 
 5   rating          87 non-null     float64
 6   stars           1920 non-null   bool   
 7   total_sales     87 non-null     float64
 8   shipping_info   950 non-null    object 
 9   img_url         1920 non-null   object 
 10  product_url     1920 non-null   object 
dtypes: bool(1), float64(3), object(7)
memory usage: 152.0+ KB


### Getting all Data

In [5]:
# Turn each column into a plain Python list. Columns that df.info() reports
# as having missing values get a per-column placeholder first.
brands = list(df["brand"].fillna("No-Brand"))
descriptions = list(df["description"])
sellers = list(df["seller"].fillna("No-seller-identified"))
price_amounts = list(df["price_amount"])
price_discounts = list(df["price_discount"].fillna(0))
ratings = list(df["rating"].fillna(0))
# Holds the "stars" column; the variable name is kept because later cells use it.
starts = list(df["stars"])
total_sales = list(df["total_sales"].fillna(0))
shipping_info = list(df["shipping_info"].fillna("ENVÍO PAGO"))
img_urls = list(df["img_url"].fillna("No-img-url"))
products_urls = list(df["product_url"].fillna("No-product-url"))


### Checking that all lists have the same length

In [6]:
# Label -> extracted list, in the order the report is printed.
length_report = {
    "Brand": brands,
    "description": descriptions,
    "sellers": sellers,
    "price_amounts": price_amounts,
    "price_discounts": price_discounts,
    "ratigns": ratings,
    "starts": starts,
    "total_sales": total_sales,
    "shipping_info": shipping_info,
    "img-urls": img_urls,
    "products-urls": products_urls,
}

report_lines = [
    f"{label}: {type(values)} | length: {len(values)}"
    for label, values in length_report.items()
]
# Leading and trailing newlines reproduce the blank lines around the report.
print("\n" + "\n".join(report_lines) + "\n")


Brand: <class 'list'> | length: 1920
description: <class 'list'> | length: 1920
sellers: <class 'list'> | length: 1920
price_amounts: <class 'list'> | length: 1920
price_discounts: <class 'list'> | length: 1920
ratigns: <class 'list'> | length: 1920
starts: <class 'list'> | length: 1920
total_sales: <class 'list'> | length: 1920
shipping_info: <class 'list'> | length: 1920
img-urls: <class 'list'> | length: 1920
products-urls: <class 'list'> | length: 1920



### Laptops Brand List

In [7]:
# Brand names searched for in a listing description when the brand column is empty.
# Order matters: the first entry found in a description wins.
laptop_brands = [
    "DELL", "LENOVO", "HP", "ASUS", "ACER", "SAMSUNG", "SONY",
    "MICROSOFT", "MSI", "APPLE", "MAC", "MACBOOK", "GATEWAY", "CHUWI",
    "TOSHIBA", "ALIENWARE", "RAZER", "LG", "HUAWEI", "XIAOMI", "HONOR",
    "FUJITSU", "PANASONIC", "VAIO", "COMPAQ", "VIT", "UTECH", "ONIX",
    "MEDION", "EVOO", "POSITIVO", "SÍRAGON", "SIRAGON", "JEMIP", "HYUNDAI",
]

# Words in a description that mark the laptop as damaged or sold for parts.
laptops_damaged = [
    "DAÑADA",
    "DAÑADO",
    "REPARAR",
    "REPUESTO",
    "REPUESTOS",
    "DEFECTUOSA",
]


### Cleaning Brands

In [8]:
# Outputs of the loop at the end of this cell, one entry per listing.
brand_cleaned = list()
# New field: the damaged_laptop() result for each listing.
works = list()

def damaged_laptop(description, keywords=None):
    """
    Classify a listing as damaged or working from its description.

    Args:
        description: Either a list of description tokens (for example the
            result of ``text.lower().split()``) or a plain string.
            Tokens are compared case-insensitively after trimming
            punctuation from their edges, so "dañada," still counts.
            A plain string is searched for the lower-cased keywords as
            substrings.
        keywords: Optional iterable of damage keywords. Defaults to the
            module-level ``laptops_damaged`` list.

    Returns:
        "DAÑADA" when a keyword is found, "OPERATIVA" when none is found,
        or None when the input cannot be processed (the error is printed).
    """
    try:
        if keywords is None:
            keywords = laptops_damaged
        # Lower-case the keywords once instead of on every comparison.
        wanted = {keyword.lower() for keyword in keywords}

        if isinstance(description, str):
            found = any(keyword in description for keyword in wanted)
            return "DAÑADA" if found else "OPERATIVA"

        edge_punctuation = ".,;:!?¡¿()[]{}\"'*/-"
        for token in description:
            if not isinstance(token, str):
                # A non-string token can never equal a keyword.
                continue
            lowered = token.lower()
            trimmed = lowered.strip(edge_punctuation)
            if lowered in wanted or (trimmed and trimmed in wanted):
                return "DAÑADA"
        return "OPERATIVA"
    except Exception as e:
        print(f" Damage | Error: {e}")
        # Explicit "unknown" result; the caller stores it as-is.
        return None

def get_laptop_brand(brand, description, known_brands=None):
    """
    Guess a listing's brand from its description.

    Args:
        brand: Value returned when no known brand is found (or on error).
        description: Either a list of description tokens (for example the
            result of ``text.lower().split()``) or a plain string.
            Tokens are compared case-insensitively after trimming
            punctuation from their edges, so "hp," still counts.
            A plain string is searched for the lower-cased brand names as
            substrings.
        known_brands: Optional iterable of brand names. Defaults to the
            module-level ``laptop_brands`` list. Brands are tried in
            iteration order and the first one found wins.

    Returns:
        The matching entry of ``known_brands`` exactly as written there,
        otherwise ``brand``.
    """
    try:
        if known_brands is None:
            known_brands = laptop_brands

        if isinstance(description, str):
            for candidate in known_brands:
                if candidate.lower() in description:
                    return candidate
            return brand

        edge_punctuation = ".,;:!?¡¿()[]{}\"'*/-"
        seen = set()
        for token in description:
            if not isinstance(token, str):
                # A non-string token can never equal a brand name.
                continue
            lowered = token.lower()
            seen.add(lowered)
            trimmed = lowered.strip(edge_punctuation)
            if trimmed:
                seen.add(trimmed)

        for candidate in known_brands:
            if candidate.lower() in seen:
                return candidate
        return brand
    except Exception as e:
        print(f" Laptop Brand | Error: {e}")
        # Fall back to the caller's value so the brand column never gets a None.
        return brand
    

for row, raw_brand in enumerate(brands):
    tokens = descriptions[row].lower().split()
    # Every listing gets a working/damaged status, whatever its brand.
    works.append(damaged_laptop(tokens))
    # Only listings without a brand need the description-based guess.
    resolved_brand = (
        get_laptop_brand(raw_brand, tokens) if raw_brand == "No-Brand" else raw_brand
    )
    brand_cleaned.append(resolved_brand)


### Cleaning Sellers

In [9]:
sellers_cleaned = []

for seller in sellers:
    # Presumably seller labels look like "Por <name>" -- TODO confirm against the sheet.
    # Drop only that leading word: replacing every "por" would also eat the
    # letters inside names such as "Importadora" or "Portatiles".
    words = seller.split(None, 1)
    if words and words[0].lower() == "por":
        seller_name = words[1] if len(words) > 1 else ""
        sellers_cleaned.append(seller_name.upper())
    else:
        sellers_cleaned.append(seller.upper())

### Cleaning Price 

In [10]:
price_cleaned = []

for price in price_amounts:
    price_text = str(price)
    # Drop only a trailing ".0" (whole-number floats such as 1200.0).
    # A blanket replace(".0", "") would also mangle values like 1.099 -> "199".
    if price_text.endswith(".0"):
        price_text = price_text[:-2]

    if len(price_text) >= 5:
        # Long values are treated as thousands-separated: "1.299" -> 1299.
        price_cleaned.append(int(price_text.replace(".", "")))
    else:
        # NOTE(review): short fractional values such as 1.2 are truncated to 1;
        # if they really mean 1.200 (= 1200) this branch needs a different rule.
        price_cleaned.append(int(float(price_text)))
    

### Cleaning Price Discount

In [11]:
# The 0 placeholder passes through unchanged; any other value is text such as
# "10% OFF", which is reduced to the integer it starts with.
price_discounts_cleaned = [
    discount if discount == 0 else int(discount.lower().replace("% off", ""))
    for discount in price_discounts
]

### Cleaning Ratings

In [12]:
# Round every rating down to a whole number.
ratings_cleaned = [math.floor(score) for score in ratings]

### Cleaning Shipping

In [13]:
# Upper-case each label and drop the "ENVÍO " prefix, keeping only the shipping type.
shipping_info_cleaned = [
    label.upper().replace("ENVÍO ", "") for label in shipping_info
]

### Check Length of All Data

In [None]:
# (prefix, list) pairs in print order; prefixes carry their own spacing.
cleaned_lengths = [
    ("Modelo: ", brand_cleaned),
    ("Descripcion: ", descriptions),
    ("Vendedor: ", sellers_cleaned),
    ("Precio: ", price_cleaned),
    ("Descuento:  ", price_discounts_cleaned),
    ("Ratign:  ", ratings_cleaned),
    ("Stars: ", starts),
    ("Ventas:  ", total_sales),
    ("Envío:  length: ", shipping_info_cleaned),
    ("Estado: ", works),
    ("img-urls: ", img_urls),
    ("products-urls: ", products_urls),
]

summary_lines = [f"{prefix}{len(values)}" for prefix, values in cleaned_lengths]
# Leading and trailing newlines reproduce the blank lines around the report.
print("\n" + "\n".join(summary_lines) + "\n")

### Making new Dict

In [15]:
# Output column name -> cleaned list. The pair order is the column order
# of the exported sheet.
product_data = dict(
    [
        ("Modelo", brand_cleaned),
        ("Descripcion", descriptions),
        ("Vendedor", sellers_cleaned),
        ("Precio", price_cleaned),
        ("Descuento", price_discounts_cleaned),
        ("Rating", ratings_cleaned),
        ("Stars", starts),
        ("Ventas", total_sales),
        ("Envío", shipping_info_cleaned),
        ("Estado", works),
        ("image_url", img_urls),
        ("product_url", products_urls),
    ]
)

### Getting Excel format

In [16]:
def get_CSV(file: str, data):
    """
    Write the cleaned product data to ./products/clean/<file>.xlsx.

    Despite its name, this function produces an Excel workbook with a
    single sheet called "laptops", not a CSV file.

    Args:
        file: Base name of the output file, without the extension.
        data: Anything the pandas.DataFrame constructor accepts, e.g. a
            dict mapping column names to equal-length lists.

    Returns:
        None. Failures are reported with a printed message, not raised.
    """
    output_dir = "./products/clean"
    try:
        # to_excel does not create missing folders, so make sure the target exists.
        os.makedirs(output_dir, exist_ok=True)
        excel_cleaned = pd.DataFrame(data)
        excel_cleaned.to_excel(
            f"{output_dir}/{file}.xlsx",
            engine='openpyxl',
            index=False,
            sheet_name='laptops',
        )
    except Exception as e:
        print(f" CSV | Error: {e}")
    

# Export the cleaned columns; with the name "lap" the workbook is ./products/clean/lap.xlsx.
# get_CSV("laptops", product_data)
get_CSV("lap", product_data)

### Making JSON model

In [None]:
# One record (dict) per listing. The row count comes from the cleaned data
# rather than a hard-coded 1920, so the cell keeps working if the sheet changes.
JSON_data = [
    {
        'Modelo': brand_cleaned[index],
        'Descripcion': descriptions[index],
        'Vendedor': sellers_cleaned[index],
        'Precio': price_cleaned[index],
        'Descuento': price_discounts_cleaned[index],
        'Rating': ratings_cleaned[index],
        'Stars': starts[index],
        'Ventas': total_sales[index],
        'Envío': shipping_info_cleaned[index],
        'Estado': works[index],
        'image_url': img_urls[index],
        'product_url': products_urls[index]
    }
    for index in range(len(brand_cleaned))
]

### Getting JSON Format

In [None]:
def Get_JSON(file,laptops):
    """
    Write the product records to ./products/clean/<file>.json.

    Args:
        file: Base name of the output file, without the ".json" extension.
        laptops: Iterable of JSON-serialisable product records (dicts).

    Returns:
        None. The records are written as one JSON array indented by four
        spaces. Serialisation and write failures are reported with a
        printed message, not raised.
    """
    folder_path = "./products"
    clean_dir = os.path.join(folder_path, "clean")
    root = os.path.join(clean_dir, f"{file}.json")

    # Create the whole target path, including the "clean" sub-folder.
    os.makedirs(clean_dir, exist_ok=True)

    try:
        # Serialise everything first and write once. Re-reading and rewriting
        # the file for every record is quadratic in the number of records.
        payload = json.dumps(list(laptops), indent=4)
        with open(root, "w", encoding="utf-8") as f:
            f.write(payload)
    except Exception as e:
        print(f" JSON | Error: {e}")

# Write every record in JSON_data to ./products/clean/laptops.json.
Get_JSON("laptops",JSON_data)