In [24]:
import random
from collections import OrderedDict
import requests
from bs4 import BeautifulSoup
import csv
import matplotlib.pyplot as plt
from tabulate import tabulate
from colorama import Fore, Style
import sys


In [3]:

def generate_headers():
    accepted = [
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'text/html,application/xhtml+xml,image/jxr, */*',
        'text/html,application/xml;q=0.9,application/xhtml+xml,image/png,image/webp,image/jpeg,image/gif,image/x-xbitmap,*/*;q=0.1'
    ]

    with open('./user.txt', 'r') as f:
        userAgents = f.readlines()

    userAgent = str(random.choice(userAgents)).strip()
    platform = get_platform(userAgent)
    accept = random.choice(accepted)

    headers = OrderedDict([
        ("upgrade-insecure-requests", "1"),
        ("user-agent", userAgent),
        ("accept", accept),
        ("sec-ch-ua-mobile", "?0"),
        ("sec-ch-ua-platform", f"{platform}"),
        ("sec-fetch-site", "none"),
        ("sec-fetch-mode", "navigate"),
        ("sec-fetch-user", "?1"),
        ("accept-encoding", "gzip, deflate, br"),
        ("accept-language", "bg-BG,bg;q=0.9,en-US;q=0.8,en;q=0.7")
    ])

    return headers



In [4]:

def get_platform(userAgent):
    if 'Windows' in userAgent:
        return 'Windows'
    if 'Macintosh' in userAgent:
        return 'macOS'
    if 'CrOS' in userAgent:
        return 'Chrome OS'
    if 'Linux' in userAgent:
        return 'Linux'
    return 'Unknown'



In [5]:

def clean_price(price):
    if price == "N/A":
        return None
    price = price[1:].replace(',', '')
    try:
        return float(price)
    except ValueError:
        return None



In [6]:

def extract_inches(product_name):
    inches = None
    parts = product_name.split()
    for part in parts:
        if part.isdigit():
            inches = int(part)
            break
    return inches




In [8]:
def scrape_vijay_sales(headers):
    vijaysales_url = "https://www.vijaysales.com/tv-and-entertainment/type/buy-televisions-tv-and-entertainment?utm_source=google_search&utm_medium=cpc&utm_campaign=pt-google-vijaysales-gs-core-purchase-tv-na-in-all-14-02-24&utm_term=tvs%20deals&utm_content=690904233322&adgroup=generic-tv-exact&matchtype=e&devicemodel=&device=c&network=g&placement=&gad_source=1&gclid=Cj0KCQjwn7mwBhCiARIsAGoxjaIauBNX5nKcw6Cor5WjHY-oyfVUQMzH1vjMmyR0_Z7_0qPlPHoYMjMaAhsAEALw_wcB"
    scraped_data = []

    page = requests.get(url=vijaysales_url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    product_titles = soup.find_all(class_='BcktPrdNm_')
    product_prices = soup.find_all('span', class_='Prdvsprc_')

    num_items = 0
    for title, price in zip(product_titles, product_prices):
        if num_items >= 10:
            break
        product_title = title.get_text().strip()
        product_price = clean_price(price.get_text().strip())  # Clean the price
        inches = extract_inches(product_title)
        scraped_data.append({'website': 'Vijay Sales', 'product_name': product_title, 'price': product_price, 'inches': inches})
        num_items += 1

    return scraped_data




In [9]:
def scrape_amazon(headers):
    amazon_product_url = "https://www.amazon.in/s?k=samsung+tv&crid=3Q0SDP9HU4N48&sprefix=samsung+tv%2Caps%2C475&ref=nb_sb_noss_1"
    scraped_data = []

    page = requests.get(url=amazon_product_url, headers=headers)
    soup = BeautifulSoup(page.content, 'lxml')

    product_titles = soup.find_all('span', class_='a-size-medium a-color-base a-text-normal')
    product_prices = soup.find_all('span', class_='a-price-whole')

    num_items = 0
    for title, price in zip(product_titles, product_prices):
        if num_items >= 10:
            break
        product_title = title.get_text().strip()
        product_price = clean_price(price.get_text().strip())  # Clean the price
        inches = extract_inches(product_title)
        scraped_data.append({'website': 'Amazon', 'product_name': product_title, 'price': product_price, 'inches': inches})
        num_items += 1

    return scraped_data




In [10]:
def scrape_kohinoor_electronics(headers):
    onbuy_product_url = 'https://kohinoorelectronics.com/shop/home-entertainment/televisions/'
    scraped_data = []

    page = requests.get(url=onbuy_product_url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    product_titles = soup.find_all('a', class_='text-blue font-weight-bold')
    product_prices = soup.find_all('ins',  class_='font-size-20 text-blue-2 text-decoration-none')

    num_items = 0
    for title, price in zip(product_titles, product_prices):
        if num_items >= 10:
            break
        product_title = title.get_text().strip()
        product_price = clean_price(price.get_text().strip())  # Clean the price
        inches = extract_inches(product_title)
        scraped_data.append({'website': 'Kohinoor Electronics', 'product_name': product_title, 'price': product_price, 'inches': inches})
        num_items += 1

    return scraped_data



In [11]:

def filter_products_by_company(products, company_name):
    filtered_products = [product for product in products if product['website'] == company_name]
    return filtered_products


def filter_products_by_inches(products, inches):
    filtered_products = [product for product in products if product['inches'] == inches]
    return filtered_products


def get_all_company_names(data_amazon, data_vijay_sales, data_kohinoor_electronics):
    all_company_names = set(item['website'] for item in data_amazon + data_vijay_sales + data_kohinoor_electronics)
    return all_company_names


def sort_products_by_price(products):
    sorted_products = sorted(products, key=lambda x: x['price'] if x['price'] is not None else float('inf'))
    return sorted_products



In [12]:

def store_data_in_csv(data_amazon, data_vijay_sales, data_kohinoor_electronics):
    with open('scraped_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['website', 'product_name', 'price', 'inches']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()


        for item in data_amazon:
            writer.writerow({'website': 'Amazon', 'product_name': item['product_name'], 'price': item['price'], 'inches': item['inches']})

        
        for item in data_vijay_sales:
            writer.writerow({'website': 'Vijay Sales', 'product_name': item['product_name'], 'price': item['price'], 'inches': item['inches']})

        
        for item in data_kohinoor_electronics:
            writer.writerow({'website': 'Kohinoor Electronics', 'product_name': item['product_name'], 'price': item['price'], 'inches': item['inches']})



In [13]:
def plot_company_contribution(data_amazon, data_vijay_sales, data_kohinoor_electronics):
    company_counts = {}
    for data in [data_amazon, data_vijay_sales, data_kohinoor_electronics]:
        for item in data:
            company = item['website']
            if company not in company_counts:
                company_counts[company] = 0
            company_counts[company] += 1

    plt.figure(figsize=(16, 6))
    colors = ['blue', 'orange', 'green']
    labels = company_counts.keys()
    plt.pie(company_counts.values(), labels=labels, colors=colors, startangle=90, counterclock=False, autopct='%1.1f%%')
    plt.title('Company Contribution in Fetched Products')
    plt.axis('equal')
    plt.show()


def plot_price_distribution(data_amazon, data_vijay_sales, data_kohinoor_electronics):
    prices = {'Amazon': [], 'Vijay Sales': [], 'Kohinoor Electronics': []}

    for item in data_amazon:
        if item['price']:
            prices['Amazon'].append(item['price'])
    for item in data_vijay_sales:
        if item['price']:
            prices['Vijay Sales'].append(item['price'])
    for item in data_kohinoor_electronics:
        if item['price']:
            prices['Kohinoor Electronics'].append(item['price'])

    plt.hist(prices.values(), bins=10, label=prices.keys())
    plt.title('Price Distribution')
    plt.xlabel('Price')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()




In [19]:
def clear_console():
    sys.stdout.write("\033[H\033[J")  # ANSI escape sequence to clear
    sys.stdout.flush()

In [31]:
def main():
   
    headers = generate_headers()
    data_amazon = scrape_amazon(headers)
    data_vijay_sales = scrape_vijay_sales(headers)
    data_kohinoor_electronics = scrape_kohinoor_electronics(headers)

    store_data_in_csv(data_amazon, data_vijay_sales, data_kohinoor_electronics)


    while True:

        clear_console()
        
        print("\nOptions:")
        print("1. Filter products by company name")
        print("2. Filter products by cm")
        print("3. Show raw data")
        print("4. Show all products sorted by price")
        print("5. Plot company contribution")
        print("6. Plot price distribution")
        print("7. Exit")

        choice = input("Enter your choice: ")

        if choice == '1':
            all_company_names = get_all_company_names(data_amazon, data_vijay_sales, data_kohinoor_electronics)
            print("Available Company Names:")
            for company_name in all_company_names:
                print(company_name)  # Print company names
            
            selected_company = input("Enter the name of the company: ")
            filtered_products = filter_products_by_company(data_amazon + data_vijay_sales + data_kohinoor_electronics, selected_company)
            
            if filtered_products:
                print(tabulate([(item['website'], item['product_name'], item['price'], item['inches']) for item in filtered_products],
                                headers=['Website', 'Product Name', 'Price', 'Inches'], tablefmt='pretty'))
            else:
                print("No products found for the selected company.")


                
        elif choice == '2':
                    print("Enter inches to filter products: ")
                    inches = int(input("Enter inches to filter products: "))
                    filtered_products = filter_products_by_inches(data_amazon + data_vijay_sales + data_kohinoor_electronics, inches)
                    print(tabulate([(item['website'], item['product_name'], item['price'], item['inches']) for item in filtered_products],
                                    headers=['Website', 'Product Name', 'Price', 'Inches'], tablefmt='pretty'))
                    

        elif choice == '3':
            
            print("\nAmazon Data:")
            print(tabulate([(item['website'], item['product_name'], item['price'], item['inches']) for item in data_amazon],
                           headers=['Website', 'Product Name', 'Price', 'Inches'], tablefmt='pretty'))
            print("\nVijay Sales Data:")
            print(tabulate([(item['website'], item['product_name'], item['price'], item['inches']) for item in data_vijay_sales],
                           headers=['Website', 'Product Name', 'Price', 'Inches'], tablefmt='pretty'))
            print("\nKohinoor Electronics Data:")
            print(tabulate([(item['website'], item['product_name'], item['price'], item['inches']) for item in data_kohinoor_electronics],
                           headers=['Website', 'Product Name', 'Price', 'Inches'], tablefmt='pretty'))
            

        elif choice == '4':
            all_products = data_amazon + data_vijay_sales + data_kohinoor_electronics
            sorted_products = sort_products_by_price(all_products)
            print(tabulate([(item['website'], item['product_name'], item['price'], item['inches']) for item in sorted_products],
                            headers=['Website', 'Product Name', 'Price', 'Inches'], tablefmt='pretty'))
            

        elif choice == '5':
            plot_company_contribution(data_amazon, data_vijay_sales, data_kohinoor_electronics)


        elif choice == '6':
            plot_price_distribution(data_amazon, data_vijay_sales, data_kohinoor_electronics)


        elif choice == '7':
            print("Exiting...")
            break


        else:
            print("Invalid choice. Please enter a valid option.")



In [32]:
if __name__ == "__main__":
    main()


[H[J
Options:
1. Filter products by company name
2. Filter products by cm
3. Show raw data
4. Show all products sorted by price
5. Plot company contribution
6. Plot price distribution
7. Exit
Enter inches to filter products: 
+---------+--------------+-------+--------+
| Website | Product Name | Price | Inches |
+---------+--------------+-------+--------+
+---------+--------------+-------+--------+
[H[J
Options:
1. Filter products by company name
2. Filter products by cm
3. Show raw data
4. Show all products sorted by price
5. Plot company contribution
6. Plot price distribution
7. Exit
Invalid choice. Please enter a valid option.
[H[J
Options:
1. Filter products by company name
2. Filter products by cm
3. Show raw data
4. Show all products sorted by price
5. Plot company contribution
6. Plot price distribution
7. Exit

Amazon Data:
+---------+-------------------------------------------------------------------------------------------------------------------+---------+--------+
| 