In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
# Companies to scrape, kept as (company name, homepage URL) pairs and expanded
# below into the list of {"company": ..., "url": ...} dicts used by later cells.
_COMPANY_URLS = (
    ("Nestle", "https://www.nestle.com"),
    ("Dr. Reddy's Laboratories", "https://www.drreddys.com"),
    ("Coca Cola", "https://www.coca-colacompany.com"),
    ("Pfizer", "https://www.pfizer.com"),
    ("PepsiCo", "https://www.pepsico.com"),
    ("Johnson & Johnson", "https://www.jnj.com"),
    ("Danone", "https://www.danone.com"),
    ("Bayer", "https://www.bayer.com"),
    ("General Mills", "https://www.generalmills.com"),
    ("GlaxoSmithKline (GSK)", "https://www.gsk.com"),
    ("Kellogg's", "https://www.kelloggs.com"),
    ("Merck & Co.", "https://www.merck.com"),
    ("Unilever", "https://www.unilever.com"),
    ("Roche", "https://www.roche.com"),
    ("Nestle Waters", "https://www.nestlewaters.com"),
    ("Sanofi", "https://www.sanofi.com"),
    ("Mondelez International", "https://www.mondelezinternational.com"),
    ("Novartis", "https://www.novartis.com"),
    ("Kraft Heinz", "https://www.kraftheinzcompany.com"),
    ("Eli Lilly and Company", "https://www.lilly.com"),
    ("Tyson Foods", "https://www.tysonfoods.com"),
    ("Teva Pharmaceuticals", "https://www.tevapharm.com"),
    ("Mars, Incorporated", "https://www.mars.com"),
    ("AbbVie", "https://www.abbvie.com"),
    ("Campbell Soup Company", "https://www.campbellsoupcompany.com"),
    ("Amgen", "https://www.amgen.com"),
    ("Conagra Brands", "https://www.conagrabrands.com"),
    ("AstraZeneca", "https://www.astrazeneca.com"),
    ("Molson Coors", "https://www.molsoncoors.com"),
    ("Boehringer Ingelheim", "https://www.boehringeringelheim.com"),
    ("AB InBev", "https://www.abinbev.com"),
    ("BASF", "https://www.basf.com"),
    ("Diageo", "https://www.diageo.com"),
    ("Procter & Gamble (P&G)", "https://www.pg.com"),
    ("Heineken", "https://www.theheinekencompany.com"),
    ("Medtronic", "https://www.medtronic.com"),
    ("McKesson", "https://www.mckesson.com"),
    ("AmerisourceBergen", "https://www.amerisourcebergen.com"),
    ("Cardinal Health", "https://www.cardinalhealth.com"),
    ("Medline Industries", "https://www.medline.com"),
)

# One dict per company, in the same order as the table above.
websites = [
    {"company": company_name, "url": homepage}
    for company_name, homepage in _COMPANY_URLS
]


In [3]:
# Validate URLs: keep only entries whose 'url' is a string with an explicit
# http:// or https:// scheme.
valid_websites = []
for site in websites:
    # .get() so an entry with no 'url' key is reported as invalid below
    # instead of raising KeyError while building the message.
    url = site.get('url')
    if isinstance(url, str) and url.startswith(('http://', 'https://')):
        print(f"Valid URL: {url}")
        valid_websites.append(site)
    else:
        print(f"Invalid URL: {url}")

Valid URL: https://www.nestle.com
Valid URL: https://www.drreddys.com
Valid URL: https://www.coca-colacompany.com
Valid URL: https://www.pfizer.com
Valid URL: https://www.pepsico.com
Valid URL: https://www.jnj.com
Valid URL: https://www.danone.com
Valid URL: https://www.bayer.com
Valid URL: https://www.generalmills.com
Valid URL: https://www.gsk.com
Valid URL: https://www.kelloggs.com
Valid URL: https://www.merck.com
Valid URL: https://www.unilever.com
Valid URL: https://www.roche.com
Valid URL: https://www.nestlewaters.com
Valid URL: https://www.sanofi.com
Valid URL: https://www.mondelezinternational.com
Valid URL: https://www.novartis.com
Valid URL: https://www.kraftheinzcompany.com
Valid URL: https://www.lilly.com
Valid URL: https://www.tysonfoods.com
Valid URL: https://www.tevapharm.com
Valid URL: https://www.mars.com
Valid URL: https://www.abbvie.com
Valid URL: https://www.campbellsoupcompany.com
Valid URL: https://www.amgen.com
Valid URL: https://www.conagrabrands.com
Valid URL: 

In [4]:
def scrape_website(url, timeout=10):
    """Download a page and flag which keyword groups occur in its text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout (default 10).

    Returns
    -------
    dict or None
        On success, a dict with the keys "url", "title", "is_fnb",
        "is_health_related" and "is_manufacturer". If anything raises
        (connection error, timeout, HTTP error status, parsing problem),
        the error is printed and None is returned.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Without a timeout a stalled server would block the notebook forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx answers as failures so that an error or block page is
        # not keyword-classified as if it were the company's real site.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Lower-cased page text with tags removed; keywords are matched
        # against it as plain substrings.
        text = soup.get_text(separator=' ', strip=True).lower()

        # Define keywords to search for
        fnb_keywords = ['beverage', 'cereal', 'milk', 'bakery', 'food', 'snacks']
        health_keywords = ['gut health', 'women\'s health', 'cognitive health', 'mental wellness', 'nutrition', 'supplements', 'well-being']
        manufacturer_keywords = ['plant', 'production', 'capacity', 'certifications', 'factory', 'processing']

        # A group is flagged as soon as one of its keywords occurs in the text
        is_fnb = any(keyword in text for keyword in fnb_keywords)
        is_health_related = any(keyword in text for keyword in health_keywords)
        is_manufacturer = any(keyword in text for keyword in manufacturer_keywords)

        # soup.title.string is None when <title> is empty or holds nested
        # tags, so fall back to the placeholder in that case as well.
        title = soup.title.string if soup.title and soup.title.string else "No title found"

        # Return the result with additional information
        return {
            "url": url,
            "title": title,
            "is_fnb": is_fnb,
            "is_health_related": is_health_related,
            "is_manufacturer": is_manufacturer
        }
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller skip
        # or retry this URL.
        print(f"Error scraping {url}: {e}")
        return None

In [5]:
# Column order shared by every row. Passing it to the DataFrame constructor
# keeps the header in the CSV even when no site could be scraped; a frame
# built from an empty list has no columns to write.
main_columns = ["Company", "Website", "Is_FNB", "Is_Health_Related", "Is_Manufacturer"]

data = []

for site in valid_websites:
    result = scrape_website(site["url"])
    # scrape_website returns None on failure; those sites are left out here.
    if result:
        data.append({
            "Company": site["company"],
            "Website": site["url"],
            "Is_FNB": result["is_fnb"],
            "Is_Health_Related": result["is_health_related"],
            "Is_Manufacturer": result["is_manufacturer"]
        })

# Create a DataFrame from the main scraped data
df_main = pd.DataFrame(data, columns=main_columns)

# Save the main DataFrame to a CSV file
df_main.to_csv('companies_data.csv', index=False)

# Function to retry scraping
def retry_scrape(url, retries=3, delay=5, scraper=None):
    """Call a scraper on ``url`` up to ``retries`` times until it succeeds.

    Parameters
    ----------
    url : str
        Address handed to the scraper on every attempt.
    retries : int, optional
        Maximum number of attempts (default 3). With 0 no attempt is made.
    delay : float, optional
        Seconds to sleep between two attempts (default 5).
    scraper : callable, optional
        Function taking the URL and returning a truthy result on success or
        a falsy value on failure. Defaults to ``scrape_website``.

    Returns
    -------
    The first truthy scraper result, or None if every attempt failed.
    """
    if scraper is None:
        # Looked up at call time so the default follows the notebook's
        # current scrape_website definition.
        scraper = scrape_website
    for attempt in range(retries):
        result = scraper(url)
        if result:
            return result
        # Only announce a retry and wait when another attempt will follow;
        # sleeping after the final failure just delays the caller.
        if attempt + 1 < retries:
            print(f"Retrying {url} ({attempt + 1}/{retries})...")
            time.sleep(delay)
    return None

# Only scrape the URLs that had errors
additional_urls = [
    "https://www.kelloggs.com",
    "https://www.cardinalhealth.com"
]

# Company names keyed by URL, so retried rows can carry the same "Company"
# value as the rows produced by the main scraping pass.
company_by_url = {site["url"]: site["company"] for site in websites}

results = []

for position, url in enumerate(additional_urls):
    if position:
        time.sleep(2)  # Sleep for 2 seconds between requests
    result = retry_scrape(url)
    if result:
        # Store the row under the main table's column names; the raw
        # scrape_website keys (url/title/is_fnb/...) would not line up with
        # those columns when the two tables are concatenated.
        results.append({
            "Company": company_by_url.get(url),
            "Website": result["url"],
            "Is_FNB": result["is_fnb"],
            "Is_Health_Related": result["is_health_related"],
            "Is_Manufacturer": result["is_manufacturer"]
        })


Error scraping https://www.kelloggs.com: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error scraping https://www.kelloggs.com: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Retrying https://www.kelloggs.com (1/3)...
Error scraping https://www.kelloggs.com: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Retrying https://www.kelloggs.com (2/3)...
Error scraping https://www.kelloggs.com: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Retrying https://www.kelloggs.com (3/3)...


In [8]:
# Save the results to a CSV file if there are any
if results:
    df_additional = pd.DataFrame(results)
    df_additional.to_csv("additional_scraped_data.csv", index=False)
else:
    print("No data scraped for the specified URLs.")

# Load the previously scraped data
df_main = pd.read_csv("companies_data.csv")

# Load the additional scraped data if it exists
try:
    df_additional = pd.read_csv("additional_scraped_data.csv")
except FileNotFoundError:
    print("No additional data to combine.")
    # Fall back to the main table alone so df_combined is always defined
    # before it is written below.
    df_combined = df_main.copy()
else:
    # Combine the two DataFrames
    df_combined = pd.concat([df_main, df_additional], ignore_index=True)

# Save the combined data to a new CSV file
df_combined.to_csv("combined_companies_data.csv", index=False)


In [9]:
# Read the combined table back from disk and preview its first rows.
combined_csv_path = "combined_companies_data.csv"
scap_data = pd.read_csv(combined_csv_path)
scap_data.head()

Unnamed: 0,Company,Website,Is_FNB,Is_Health_Related,Is_Manufacturer
0,Nestle,https://www.nestle.com,False,False,False
1,Dr. Reddy's Laboratories,https://www.drreddys.com,False,False,False
2,Coca Cola,https://www.coca-colacompany.com,True,False,False
3,Pfizer,https://www.pfizer.com,False,False,False
4,PepsiCo,https://www.pepsico.com,True,True,True
