In [None]:
# Fetch a page through a randomly chosen proxy, retrying with exponential backoff on HTTP 429
def fetch_page_with_retry(url, proxies, max_retries=5, backoff_factor=1, timeout=10):
    """
    Fetch a page through a randomly chosen proxy, retrying on rate limiting and proxy failures.

    A fresh proxy is picked for every attempt. HTTP 429 responses trigger an
    exponential backoff (``backoff_factor * 2 ** attempt`` seconds) before the
    next attempt; connection-level failures (dead proxy, timeout) move straight
    on to the next attempt. Any other HTTP error or unexpected exception
    aborts immediately.

    :param url: URL to fetch.
    :param proxies: List of proxy servers to use.
    :param max_retries: Maximum number of attempts.
    :param backoff_factor: Base delay, in seconds, doubled on each rate-limited attempt.
    :param timeout: Seconds to wait for the server before giving up on an attempt.
    :return: The page source if successful, None otherwise.
    """
    # random.choice raises IndexError on an empty sequence; report it instead.
    if not proxies:
        print(f"No proxies available to fetch {url}.")
        return None

    for attempt in range(max_retries):
        proxy = random.choice(proxies)  # Select a random proxy
        proxy_dict = {"http": proxy, "https": proxy}

        try:
            # Without a timeout a dead proxy would block this call forever.
            response = requests.get(url, proxies=proxy_dict, timeout=timeout)
            response.raise_for_status()  # Raises exception for HTTP errors
        except requests.exceptions.HTTPError as e:
            if e.response is not None and e.response.status_code == 429:
                sleep_time = backoff_factor * (2 ** attempt)
                # Only wait when another attempt will actually follow.
                if attempt < max_retries - 1:
                    print(f"Rate limit exceeded with proxy {proxy}, waiting {sleep_time} seconds before retrying...")
                    sleep(sleep_time)
                else:
                    print(f"Rate limit exceeded with proxy {proxy}.")
                continue
            print(f"HTTP error encountered with proxy {proxy}: {e}")
            return None
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            # Covers ProxyError and connect/read timeouts: this proxy is
            # unusable right now, so let the next attempt pick another one.
            print(f"Proxy {proxy} failed for {url}: {e}. Trying another proxy...")
            continue
        except Exception as e:
            print(f"An error occurred while fetching {url} with proxy {proxy}: {e}")
            return None

        if response.status_code == 200:
            return response.text
        # Non-200 success codes (e.g. 204) carry no usable page source.
        print(f"Failed to retrieve {url}: Status code {response.status_code}")
        return None

    print(f"Max retries exceeded for {url}.")
    return None

In [None]:
# Fragment (incomplete on its own): on HTTP 429 wait 10 seconds and retry; any other exception is only logged
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                print("Rate limit exceeded, waiting before retrying...")
                time.sleep(10)
                continue 
        
        #General exception
        except Exception as e:
            print(f"An error occurred while processing {url}: {e}")

In [None]:
#Getting the list of companies
# Location of the raw, whitespace-separated company list on the local machine.
company_list_path = '/Users/allensunny/Downloads/Company_list.txt'

# Read the whole file into memory; it is split into names further below.
with open(company_list_path) as file:
    content = file.read()

#Cleaning the company names 
def clean_company_name(name):
    """
    Strip one trailing corporate designation (Inc., Corp., LLC, ...) from a company name.

    The designation is only removed when it is a separate word, i.e. preceded
    by whitespace or a comma. Names that merely end in the same letters
    ("Costco", "Cisco") are left intact.

    :param name: Raw company name.
    :return: The name without its trailing designation, stripped of surrounding whitespace.
    """
    # The leading separator group is what prevents matching inside a word;
    # consuming the comma also avoids leaving "Alphabet," behind.
    suffix_pattern = (
        r'(?:,\s*|\s+)'
        r'(?:Incorporated|Corporation|Enterprises|International|Holdings|Company'
        r'|Group|Corp|Inc|LLC|L\.P|Co)'
        r'\.?,?\s*$'
    )
    cleaned_name = re.sub(suffix_pattern, '', name, flags=re.IGNORECASE)
    return cleaned_name.strip()

# Entries in the raw text are separated by runs of four spaces (adjust the
# separator if the source file is laid out differently). Trim each piece and
# drop the ones that are empty after trimming.
company_list = [piece for piece in map(str.strip, content.split('    ')) if piece]

# Remove corporate designations from every company name.
cleaned_companies = list(map(clean_company_name, company_list))

In [None]:
# Function to search for users/partners of a company's product and summarize each result page
def Foundation_model_partners(company_name, products):
    """
    Search the web for users/partners of a company's product and summarize each result page.

    Each search hit is downloaded with ``requests``; if that fails, the
    module-level Selenium ``driver`` is used as a fallback. Anchor tags are
    removed from the page, the remaining text is passed to
    ``summarize_and_extract`` (with the module-level ``target_phrases`` and
    ``summarizer``), and one row per URL is collected.

    :param company_name: The name of the company to search for.
    :param products: Product name(s) appended to the search query.
    :return: DataFrame with columns ``URL``, ``Summary`` and ``Company``;
             empty when the search timed out or returned nothing.
    """
    # Imported locally under an alias: before Python 3.11 the futures timeout
    # is a different class from the builtin TimeoutError.
    from concurrent.futures import TimeoutError as FutureTimeoutError

    results = []  # List to store result dictionaries

    # Generate a random proxy
    proxy = get_random_proxy()

    search_query = f"Users or companies or partners or collaborators of {company_name} {products} list"

    def search_with_timeout(search_query, num_results, proxy, timeout, sleep_interval):
        """Run ``search`` in a worker thread and give up after 40 seconds."""
        executor = ThreadPoolExecutor(max_workers=1)
        try:
            future = executor.submit(
                search,
                search_query,
                num_results=num_results,
                proxy=proxy,
                timeout=timeout,
                sleep_interval=sleep_interval,
            )
            return future.result(timeout=40)
        except FutureTimeoutError:
            print("Search function timed out")
            return None
        finally:
            # A `with` block would wait for the worker to finish on exit,
            # which defeats the timeout; release the executor without waiting.
            executor.shutdown(wait=False)

    # Calling search function
    search_results = search_with_timeout(search_query, num_results=4, proxy=proxy, timeout=10, sleep_interval=20)

    # A timed-out search yields None; treat it as "no results".
    for url in search_results or []:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # This will directly raise an exception for HTTP errors
            page_source = response.text
        except requests.exceptions.RequestException as exc:
            # `response` may not exist here (e.g. connection error), so report
            # the exception itself rather than a status code.
            print(f"Failed to retrieve {url}: {exc}")

            # Fall back to the Selenium driver and keep ITS page source.
            driver.get(url)
            page_source = driver.page_source

        soup = BeautifulSoup(page_source, 'html.parser')

        # Initial cleaning: drop every link together with its content
        for a_tag in soup.find_all('a'):
            a_tag.decompose()

        # Extract text from the modified BeautifulSoup object
        text = soup.get_text(strip=True)

        # Calling the summarization function
        summarized_text = summarize_and_extract(text, target_phrases, summarizer, max_length=50, min_length=5, do_sample=False)

        # Append result to the list
        results.append({'URL': url, 'Summary': summarized_text, 'Company': company_name})

    # Built once, after the loop, with explicit columns so that an empty
    # result set still returns a well-formed DataFrame.
    results_df = pd.DataFrame(results, columns=['URL', 'Summary', 'Company'])

    return results_df

In [None]:
# Company and product whose users/partners should be looked up.
company_name, products = "Stability AI", "Stable Diffusion 2"

In [None]:
# Run the partner search for the company/product configured above.
results_df = Foundation_model_partners(company_name=company_name, products=products)