In [None]:
import requests
from bs4 import BeautifulSoup
import csv

def extract_specifications(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    image_tag = soup.find('img', class_='aps-image-zoom')
    image_url = image_tag['src'] if image_tag else 'Image URL not found'

    # Extract title
    title_tag = soup.find('h1', class_='aps-main-title')
    title = title_tag.text.strip() if title_tag else 'Title not found'

    # Extract features section
    features_section = soup.find('div', class_='aps-main-features')

    # Extract price and discount
    price_tag = features_section.find('span', class_='aps-price-value')
    price = price_tag.text.strip() if price_tag else 'Price not found'

    # Assuming 'soup' is your BeautifulSoup object
    overall_rating_span = soup.find('span', {'class': 'aps-rating-total', 'data-type': 'num'})
    rating_value = overall_rating_span.text.strip() if overall_rating_span else 'Rating not found'
        

    discount_tag = features_section.find('span', class_='aps-product-discount')
    discount = discount_tag.text.strip() if discount_tag else 'Discount not found'

    # Extract brand, category, and other details
    brand_tag = features_section.find('span', class_='aps-product-brand')
    brand = brand_tag.text.strip() if brand_tag else 'Brand not found'

    category_tag = features_section.find('span', class_='aps-product-cat')
    category = category_tag.text.strip() if category_tag else 'Category not found'

    # Initialize a dictionary to store extracted specifications
    specifications = {}

    # Find the specifications container
    specs_container = soup.find('div', {'id': 'aps-specs'})

    if specs_container:
        # Find all groups within the specifications container
        groups = specs_container.find_all('div', {'class': 'aps-group'})

        for group in groups:
            # Extract group title
            group_title = group.find('h3', {'class': 'aps-group-title'}).text.strip()

            # Extract specifications within the group
            specs = {}
            rows = group.find('table', {'class': 'aps-specs-table'}).find('tbody').find_all('tr')
            for row in rows:
                columns = row.find_all(['td', 'th'])
                if len(columns) == 2:
                    attr_title = columns[0].text.strip()
                    attr_value = columns[1].text.strip()

                    # Replace '\r\n' with a line break
                    attr_title = attr_title.replace('\r\n', '<br>')
                    attr_value = attr_value.replace('\r\n', '<br>')
                    
                    # List of texts to be removed
                    texts_to_remove = [
                        "\nrefers to the release or launch date of the device.",
                        "\nrefers to mobile network capability and network technology.",
                        "\nrefers to the type of display used e.g. AMOLED, OLED, IPS, TFT, and others",
                        "\nrefers to the operating system used by the device e.g. Android, iOS, Windows and others.",
                        "\nrefers to price per device variants (e.g. 4GB / 64GB, 8GB /256GB).",
                        "\nrefers to price status (e.g. official, conversion, estimated and others)"
                    ]

                    # Replacement text (if needed, otherwise an empty string works to simply remove the text)
                    replacement_text = ""

                    # Loop through each target text and replace it with the replacement text
                    for target_text in texts_to_remove:
                        attr_title = attr_title.replace(target_text, replacement_text)

     


                    specs[attr_title] = attr_value

                    specs[attr_title] = attr_value

            # Add specs to the specifications dictionary
            specifications[group_title] = specs

        # Replace specific text
        text_to_replace = "Launch Date\nrefers to the release or launch date of the device."
        replacement_text = "Launch Date:"
        
        for group_title, specs in specifications.items():
            for attr_title, attr_value in specs.items():
                if text_to_replace in attr_value:
                    specifications[group_title][attr_title] = replacement_text
    return {
        'Title': title,
        'Image URL': image_url,
        'Price': price,
        'Discount': discount,
        'Brand': brand,
        'Category': category,
        'Rating':rating_value,
        'Specifications': specifications
    }

def main():
    # Read URLs from clean_url.txt
    with open('cleaned_urls.txt', 'r') as file:
        urls = file.read().splitlines()

    # Initialize a list to store extracted data
    data_list = []
    i=0
    for url in urls:
        # if (i==5):
        #     break
        i+=1
        # Fetch HTML content
        response = requests.get(url)

        if response.status_code == 200:
            # Extract specifications from the HTML content
            html_content = response.text
            specifications = extract_specifications(html_content)

            # Add the data to the list
            data_list.append(specifications)
        else:
            print(f"Failed to fetch content for URL: {url}")

    # Write data to CSV
    with open('output_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Title', 'Image URL', 'Price', 'Discount', 'Brand', 'Category','Rating', 'Specifications']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # Write header
        writer.writeheader()

        # Write data
        for data in data_list:
            writer.writerow(data)

if __name__ == "__main__":
    main()


In [1]:
# Python script to extract URLs from a text file and save them to another file

def clean_urls(input_file, output_file):
    try:
        with open(input_file, 'r') as file:
            lines = file.readlines()

        urls = [line.split('\t')[0] for line in lines]  # Extracting the URL part

        with open(output_file, 'w') as file:
            for url in urls:
                file.write(url + '\n')

        print("URLs have been successfully extracted and saved to", output_file)

    except FileNotFoundError:
        print(f"The file {input_file} was not found. Please check the file name and try again.")

# Specify the input and output file names
input_file_name = 'url.txt'
output_file_name = 'cleaned_urls.txt'

# Call the function with the file names
clean_urls(input_file_name, output_file_name)


URLs have been successfully extracted and saved to cleaned_urls.txt


In [3]:
import requests
from bs4 import BeautifulSoup

# Function to extract specifications from HTML content
def extract_specifications(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # Extract image URL
    
    image_tag = soup.find('img', class_='aps-image-zoom')
    image_url = image_tag['src'] if image_tag else 'Image URL not found'

    # Extract title
    title_tag = soup.find('h1', class_='aps-main-title')
    title = title_tag.text.strip() if title_tag else 'Title not found'

    # Extract features section
    features_section = soup.find('div', class_='aps-main-features')

    # Extract price and discount
    price_tag = features_section.find('span', class_='aps-price-value')
    price = price_tag.text.strip() if price_tag else 'Price not found'

    discount_tag = features_section.find('span', class_='aps-product-discount')
    discount = discount_tag.text.strip() if discount_tag else 'Discount not found'

    # Extract brand, category, and other details
    brand_tag = features_section.find('span', class_='aps-product-brand')
    brand = brand_tag.text.strip() if brand_tag else 'Brand not found'

    category_tag = features_section.find('span', class_='aps-product-cat')
    category = category_tag.text.strip() if category_tag else 'Category not found'

    # Find the specifications section
    specifications_section = soup.find('div', class_='aps-column')

    # Initialize a dictionary to store extracted specifications
    specifications = {}

    if specifications_section:
        # Find all groups within the specifications section
        specification_groups = specifications_section.find_all('div', class_='aps-group')

        # Loop through each group
        for group in specification_groups:
            # Find the group title
            group_title = group.find('h3', class_='aps-group-title').text.strip()

            # Find the specs table within the group
            specs_table = group.find('table', class_='aps-specs-table')
            if specs_table:
                # Extract individual specifications within the table
                specs = {}
                rows = specs_table.find_all('tr')
                for row in rows:
                    columns = row.find_all(['td', 'th'])
                    if len(columns) == 2:
                        attr_title = columns[0].text.strip()
                        attr_value = columns[1].text.strip()
                        specs[attr_title] = attr_value

                # Add specs to the specifications dictionary under the group title
                specifications[group_title] = specs
    return {
        'Title': title,
        'Image URL': image_url,
        'Price': price,
        'Discount': discount,
        'Brand': brand,
        'Category': category,
        'Specifications': specifications
    }
   

# Specify the path to your cleaned URLs file
cleaned_urls_file = 'cleaned_urls.txt'

# Read URLs from the file
with open(cleaned_urls_file, 'r') as file:
    urls = [line.strip() for line in file.readlines()]

# Loop through each URL
for url in urls:
    # Fetch HTML content
    response = requests.get(url)
    
    if response.status_code == 200:
        # Extract specifications from the HTML content
        html_content = response.text
        attributes = extract_specifications(html_content)
        
        print(f"URL: {url}")
        print(attributes)
        print("\n")
    else:
        print(f"Failed to fetch content for URL: {url}")


URL: https://www.gadgetmaya.com/product/wiko-t3/
{'Title': 'WIKO T3', 'Image URL': 'https://www.gadgetmaya.com/wp-content/uploads/2022/06/Wiko-T3-800x630.jpg', 'Price': 'PHP7,490.00 PHP6,690.00', 'Discount': 'Price-drop:  PHP800.00 (OFF)', 'Brand': 'Brand:  Wiko', 'Category': 'Category:  Smartphones', 'Specifications': {}}


URL: https://www.gadgetmaya.com/product/huawei-p50/
{'Title': 'Huawei P50', 'Image URL': 'https://www.gadgetmaya.com/wp-content/uploads/2022/08/Huawei-P50.jpg', 'Price': 'PHP39,999.00', 'Discount': 'Discount not found', 'Brand': 'Brand:  Huawei', 'Category': 'Category:  Smartphones', 'Specifications': {}}


URL: https://www.gadgetmaya.com/product/huawei-p50-pocket/
{'Title': 'Huawei P50 Pocket', 'Image URL': 'https://www.gadgetmaya.com/wp-content/uploads/2022/08/Huawei-P50-Pocket.jpg', 'Price': 'PHP69,999.00 PHP59,999.00', 'Discount': 'Price-drop:  PHP10,000.00 (OFF)', 'Brand': 'Brand:  Huawei', 'Category': 'Category:  Smartphones', 'Specifications': {}}


URL: htt

KeyboardInterrupt: 

In [7]:
import requests
from bs4 import BeautifulSoup

# Function to extract specifications from HTML content
def extract_specifications(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the specifications section
    specifications_section = soup.find('div', class_='aps-column')

    # Initialize a dictionary to store extracted specifications
    specifications = {}

    if specifications_section:
        # Find all groups within the specifications section
        specification_groups = specifications_section.find_all('div', class_='aps-group')
        print(specification_groups)
        # Loop through each group
        for group in specification_groups:
            # Find the group title
            group_title = group.find('h3', class_='aps-group-title').text.strip()

            # Find the specs table within the group
            specs_table = group.find('table', class_='aps-specs-table')
            if specs_table:
                
                # Extract individual specifications within the table
                specs = {}
                rows = specs_table.find_all('tr')
                for row in rows:
                    columns = row.find_all(['td', 'th'])
                    if len(columns) == 2:
                        attr_title = columns[0].text.strip()
                        attr_value = columns[1].text.strip()
                        specs[attr_title] = attr_value

                # Add specs to the specifications dictionary under the group title
                specifications[group_title] = specs

    return specifications

# Specify the path to your cleaned URLs file
cleaned_urls_file = 'cleaned_urls.txt'

# Read URLs from the file
with open(cleaned_urls_file, 'r') as file:
    urls = [line.strip() for line in file.readlines()]

# Loop through each URL
for url in urls:
    # Fetch HTML content
    response = requests.get(url)
    
    if response.status_code == 200:
        # Extract specifications from the HTML content
        html_content = response.text
        specifications = extract_specifications(html_content)
        
        # Print the extracted specifications
        print(f"URL: {url}")
        for group_title, specs in specifications.items():
            print(f"\n{group_title}:")
            for attr_title, attr_value in specs.items():
                print(f"{attr_title}: {attr_value}")
        print("\n")
    else:
        print(f"Failed to fetch content for URL: {url}")


[]
URL: https://www.gadgetmaya.com/product/wiko-t3/


[]
URL: https://www.gadgetmaya.com/product/huawei-p50/


[]
URL: https://www.gadgetmaya.com/product/huawei-p50-pocket/




KeyboardInterrupt: 

In [17]:
import os
import pandas as pd
import ast

def remove_text_from_csv(csv_path, target_text):
    df = pd.read_csv(csv_path)
    
    if 'Specifications' in df.columns:
        df['Specifications'] = df['Specifications'].apply(ast.literal_eval)

        for index, specs_dict in df['Specifications'].items():
            if 'General' in specs_dict and 'Launch Date' in specs_dict['General']:
                launch_date = specs_dict['General']['Launch Date']
                
                # Check and replace the target text in 'Launch Date' with or without newline characters
                if target_text in launch_date:
                    df.at[index, 'Specifications']['General']['Launch Date'] = launch_date.replace(target_text, '').replace('\n', '')

        df.to_csv(csv_path, index=False)
        print(f"Text removed from the specified CSV file: {csv_path}")
    else:
        print("Error: 'Specifications' column not found in the DataFrame.")

# Path to the specific CSV file
csv_path = "output_data.csv"

# Target text to remove
target_text = "\nrefers to the release or launch date of the device."

# Apply text removal to the specified CSV file
remove_text_from_csv(csv_path, target_text)


Text removed from the specified CSV file: output_data.csv
