## Handle Imports 

In [1]:
import os

import time
import string
import random
from datetime import datetime
import pandas as pd
import tqdm as tqdm
import shutil
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from chromedriver_py import binary_path
from selenium.webdriver.common.keys import Keys

from sentence_transformers import SentenceTransformer, util
from concurrent.futures import ThreadPoolExecutor, as_completed

## Create Greenwashing Definitions 

In [2]:
# Predefined question or query
predefined_definition = "Greenwashing is the deceptive act of employing behaviors, activities, or communication strategies that deliberately create an inflated perception of a company's environmental commitment and performance. This manipulation can involve unsubstantiated claims of sustainability, misleading marketing tactics, selective disclosure of environmental data, implementation of superficial eco-friendly practices, and misrepresentation of a product's environmental impact."

# Four predefined definitions

DESC_Deception_and_Misinformation = 'This is when companies engage in deceptive practices by advertising products as "eco-friendly" or "sustainable" without providing supporting evidence. By doing so, they mislead consumers and in the longer run also compromise genuine sustainability efforts. Their claims involve a range of misleading actions, including misrepresenting product ingredients or origins.'
DESC_Misleading_Communication = 'Companies highlight trivial environmental initiatives, such as recycling programs or energy-efficient manufacturing, but at the same time, they downplay or ignore significant environmental impacts like emissions and pollution. This is a facade of environmental responsibility which misleads stakeholders about the companys actual ecological footprint. The strategy is misleading by disproportion rather than outright fabrication.'
DESC_Selective_Disclosure = 'Companies selectively disclose positive environmental activities, cherry-picking data to present an overly positive view of their environmental impact. An example to this is highlighting achievements like participation in renewable energy initiatives while omitting information about activities that have a significant negative environmental impact (such as high levels of waste production or biodiversity loss). Companies shift the narrative and mislead stakeholders by omission by presenting a distorted image that hides the actual environmental cost of their operations.'
DESC_Greenwashing_as_Decoupling = 'Companies create a mask of being sustainable through branding and symbolic actions that are decoupled from their actual environmental impact. These strategies often involve initiatives that - while visually appealing or publicly engaging -, do not result in substantial improvements to the companys environmental performance. Examples include high-profile but limited scope sustainability campaigns or using "green" imagery in marketing materials. This misleads consumers by creating a misleading impression of environmental responsibility.'

# Initialize the Sentence-BERT model
model = SentenceTransformer('all-miniLM-L6-v2') # Or the lightweight all-miniLM-L6-v2

# Encode the predefined question to get its embedding
definition_embedding = model.encode(predefined_definition)

QUERY_Deception_and_Misinformation = model.encode(DESC_Deception_and_Misinformation)
QUERY_Misleading_Communication = model.encode(DESC_Misleading_Communication)
QUERY_Selective_Disclosure = model.encode(DESC_Selective_Disclosure)
QUERY_Greenwashing_as_Decoupling = model.encode(DESC_Greenwashing_as_Decoupling)

  torch.utils._pytree._register_pytree_node(


## Get Data From Google News

In [3]:
def open_webpage_and_search(file_name):
    
    # Intitialize webscraper
    service_object = Service(binary_path)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Uncomment this line if you don't want the browser window to open
    driver = webdriver.Chrome(service=service_object, options=options)
    
    # Open the homepage of Google News
    google_news_page = "https://news.google.com/home?hl=en-US&gl=US&ceid=US:en"
    driver.get(google_news_page)
    time.sleep(3)  # It's better to use WebDriverWait here as well for a more reliable wait
    
    # Wait for the element to be clickable and click it
    # Assuming this click is necessary before searching. If not, adjust accordingly.
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '/html/body/c-wiz/div/div/div/div[2]/div[1]/div[3]/div[1]/div[1]/form[2]/div/div/button/span'))
    ).click()
    
    # Wait for the input field to be clickable
    input_field = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, ".Ax4B8.ZAGvjd"))
    )
    
    # Send text to the input field and press ENTER
    input_field.send_keys(file_name + Keys.ENTER)

    time.sleep(3)  # Let the search results page load. Consider using WebDriverWait here too.

    # Return the current URL after performing the search
    current_url = driver.current_url
    driver.quit()  # Close the browser after fetching the URL
    return current_url


def get_the_data_searched(file_name, url):
    
    # Intitialize webscraper
    service_object = Service(binary_path)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Uncomment this line if you don't want the browser window to open
    driver = webdriver.Chrome(service=service_object, options=options)
    
    # Open the homepage of Google News
    url_search = url
    driver.get(url)
    
    # Assuming this click is necessary before searching. If not, adjust accordingly.
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '/html/body/c-wiz/div/div/div/div[2]/div[1]/div[3]/div[1]/div[1]/form[2]/div/div/button/span'))
    ).click()
    
    time.sleep(3)  # It's better to use WebDriverWait here as well for a more reliable wait

    # Collection of articles data including dates
    articles = []
    title_elements = driver.find_elements(By.CLASS_NAME, 'JtKRv')
    date_elements = driver.find_elements(By.CLASS_NAME, 'hvbAAd') # Assuming this class name is correct for dates
    
    if len(title_elements) != 0:

        if len(title_elements) == len(date_elements):  # Ensuring each title has a corresponding date
            for title_element, date_element in zip(title_elements, date_elements):
                link = title_element.get_attribute('href')
                title = title_element.text
                date = date_element.get_attribute('datetime')
                article_data = {
                    'Title': title,
                    'Original Link': link,
                    'Date': date,
                }
                articles.append(article_data)
        else:
            print("Mismatch in the number of titles and dates found.")
            print(f"Titles found: {len(title_elements)}")
            print(f"Dates found: {len(date_elements)}")
            print(f"Links found: {len(link_elements)}")

        # Convert initial collection into a DataFrame
        articles_df = pd.DataFrame(articles)

        # print(articles_df)
        print(len(articles_df['Title']))


        # Now iterate through each original link to get the final URL and site name
        # for index, row in articles_df.iterrows():
            # original_link = row['Original Link']
            # driver.get(original_link)
            # time.sleep(3)  # Allow time for any redirects and for the site to load
            # final_url = driver.current_url
            # parsed_url = urlparse(final_url)
            # site_name = parsed_url.netloc.replace('www.', '')  # Clean the site name
            # articles_df.at[index, 'Final Link'] = final_url
            # articles_df.at[index, 'Site Name'] = site_name

        driver.quit()

        # Now iterate through the titles in your DataFrame to calculate similarity scores
        similarity_scores_general = []
        
        similarity_scores1 = []
        similarity_scores2 = []
        similarity_scores3 = []
        similarity_scores4 = []
        
        for title in articles_df['Title']:
            title_embedding = model.encode(title)
            # Calculate cosine similarity
            similarity_score = util.pytorch_cos_sim(definition_embedding, title_embedding)
            similarity_scores_general.append(similarity_score.item())
            
            # NEWNEWNEW
            similarity_score1= util.pytorch_cos_sim(QUERY_Deception_and_Misinformation, title_embedding)
            similarity_scores1.append(similarity_score1.item())
            
            similarity_score2= util.pytorch_cos_sim(QUERY_Misleading_Communication, title_embedding)
            similarity_scores2.append(similarity_score2.item())
            
            similarity_score3= util.pytorch_cos_sim(QUERY_Selective_Disclosure, title_embedding)
            similarity_scores3.append(similarity_score3.item())
            
            similarity_score4= util.pytorch_cos_sim(QUERY_Greenwashing_as_Decoupling, title_embedding)
            similarity_scores4.append(similarity_score4.item())
            
        
        # NEWNEWNEW
        articles_df['General_Definition'] = similarity_scores_general
        articles_df['Deception_and_Misinformation'] = similarity_scores1
        articles_df['Misleading_Communication'] = similarity_scores2
        articles_df['Selective_Disclosure'] = similarity_scores3
        articles_df['Greenwashing_as_Decoupling'] = similarity_scores4

        # Sort the DataFrame based on similarity scores in descending order to see the most relevant articles first
        # articles_df = articles_df.sort_values(by='Similarity Score', ascending=False)
    else:
        article_data = {
                    'Title': "No search results",
                    'Original Link': "No search results",
                    'Date': "No search results",
                }
        
        articles.append(article_data)
        
        articles_df = pd.DataFrame(articles)
        
        print(len(articles_df['Title']))

    # Display the final DataFrame
    # print(articles_df)
    
    return articles_df

def save_to_excel_file(file_name, articles_df):
    current_date = datetime.now().strftime("%Y%m%d")

    # Ensure the file_name variable ends with '.xlsx'
    file_name_final = file_name + "_" + current_date +".xlsx"  # Example file name, adjusted to include .xlsx

    # Attempt to save the DataFrame to an Excel file again
    try:
        articles_df.to_excel(file_name_final, index=False)
        print(f"Data saved to {file_name}")
    except Exception as e:
        print(f"Failed to save data to Excel file: {e}")
    

def do_a_search_to_excel(search_term):
    
    url_for_page = open_webpage_and_search(search_term)
    
    articles_df = get_the_data_searched(search_term, url_for_page)
    
    save_to_excel_file(search_term, articles_df)

## Do Scoring on Retrieved Data

In [4]:
def analyze_companies_news(company_names):
    # Initialize DataFrame to hold the final results
    # results_df = pd.DataFrame(columns=['Company Name', 'Average Similarity Score', 'Number of Articles', 'Ranking'])
    
    # NEWNEWNEW
    
    results_df = pd.DataFrame(columns=['Company Name', 
                                       'Average General_Definition Score',
                                       'Average Deception_and_Misinformation Score',
                                       'Average Misleading_Communication Score',
                                       'Average Selective_Disclosure Score',
                                       'Average Greenwashing_as_Decoupling Score',
                                       'Number of Articles',
                                       'General_Definition Ranking',
                                       'Deception_and_Misinformation Ranking',
                                       'Misleading_Communication Ranking',
                                       'Selective_Disclosure Ranking',
                                       'Greenwashing_as_Decoupling Ranking'])
    
    for company_name in company_names:
        url_for_page = open_webpage_and_search(company_name)
        articles_df = get_the_data_searched(company_name, url_for_page)
        
        # Check if articles_df is not empty or if a specific condition for no results is met
        if not articles_df.empty and not (articles_df.iloc[0]['Title'] == "No search results"):
            # average_similarity_score = articles_df['Similarity Score'].mean() if 'Similarity Score' in articles_df.columns else 0
            
            # NEWNEWNEW
            average_similarity_score_all = articles_df['General_Definition'].mean() if 'General_Definition' in articles_df.columns else 0
            average_similarity_score1 = articles_df['Deception_and_Misinformation'].mean() if 'Deception_and_Misinformation' in articles_df.columns else 0
            average_similarity_score2 = articles_df['Misleading_Communication'].mean() if 'Misleading_Communication' in articles_df.columns else 0
            average_similarity_score3 = articles_df['Selective_Disclosure'].mean() if 'Selective_Disclosure' in articles_df.columns else 0
            average_similarity_score4 = articles_df['Greenwashing_as_Decoupling'].mean() if 'Greenwashing_as_Decoupling' in articles_df.columns else 0
            
            
            print(average_similarity_score1)
            print(average_similarity_score2)
            print(average_similarity_score3)
            print(average_similarity_score4)
            
            number_of_articles = len(articles_df)
            ranking_all = (average_similarity_score_all * number_of_articles)
            ranking1 = (average_similarity_score1 * number_of_articles)
            ranking2 = (average_similarity_score2 * number_of_articles)
            ranking3 = (average_similarity_score3 * number_of_articles)
            ranking4 = (average_similarity_score4 * number_of_articles)
            result = {'Company Name': company_name, 
                      'Average General_Definition Score': average_similarity_score_all,
                      'Average Deception_and_Misinformation Score': average_similarity_score1, 
                      'Average Misleading_Communication Score': average_similarity_score2, 
                      'Average Selective_Disclosure Score': average_similarity_score3, 
                      'Average Greenwashing_as_Decoupling Score': average_similarity_score4, 
                      'Number of Articles': number_of_articles,
                      'General_Definition Ranking': ranking_all,
                      'Deception_and_Misinformation Ranking': ranking1,
                      'Misleading_Communication Ranking': ranking2,
                      'Selective_Disclosure Ranking': ranking3,
                      'Greenwashing_as_Decoupling Ranking': ranking4}
        else:
            # Handle case with no articles found
            print("No Score")
            result = {'Company Name': company_name, 
                      'Average General_Definition Score': 0,
                      'Average Deception_and_Misinformation Score': 0, 
                      'Average Misleading_Communication Score': 0, 
                      'Average Selective_Disclosure Score': 0, 
                      'Average Greenwashing_as_Decoupling Score': 0, 
                      'Number of Articles': 0,
                      'General_Definition Ranking': 0,
                      'Deception_and_Misinformation Ranking': 0,
                      'Misleading_Communication Ranking': 0,
                      'Selective_Disclosure Ranking': 0,
                      'Greenwashing_as_Decoupling Ranking': 0}
        
        # Append the result to results_df
        results_df = results_df._append(result, ignore_index=True)
    
    results_df = results_df.sort_values(by='Deception_and_Misinformation Ranking', ascending=False)
    
    # Save results to an Excel file
    current_date = datetime.now().strftime("%Y%m%d")
    random_number = random.randint(10000, 99999)
    filename = f"company_news_analysis_{current_date}_{random_number}.xlsx"
    results_df.to_excel(filename, index=False)
    print(f"Results saved to {filename}")
    return filename

## List of Firms

In [5]:
# List of the names of the firms you want to score
company_names = [
    "Company 1",
    "Company 2",
    "Company 3",
    "Etc."
]

## Run The Program with Cocurrent Threading Included

In [6]:
# Function to divide the list of company names into batches
def divide_list_into_batches(lst, n):
    """Divides a list into n nearly equal parts."""
    k, m = divmod(len(lst), n)
    return [lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]

# Functions to generate and create a new folder for saving files
def generate_folder_name(base="greenwashing_run_collection"):
    current_date = datetime.now().strftime("%Y%m%d")
    random_chars = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
    folder_name = f"{base}_{current_date}_{random_chars}"
    return folder_name

def create_folder(folder_name):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    return folder_name

# Wrapper function to analyze news and move the output file to the specified folder
def analyze_companies_news_wrapper(search_terms, folder_name):
    filename = analyze_companies_news(search_terms)
    current_file_path = os.path.join(os.getcwd(), filename)
    target_file_path = os.path.join(folder_name, filename)
    if os.path.exists(current_file_path):
        shutil.move(current_file_path, target_file_path)
        print(f"File '{filename}' moved to folder '{folder_name}'.")
    else:
        print(f"Error: File '{filename}' not found in the current directory.")

# Generate and create the folder
folder_name = generate_folder_name()
create_folder(folder_name)

# Desired number of batches and threads
num_batches = 5  # Adjust as per your requirement
num_threads = min(num_batches, 5)  # Adjust based on system capabilities

# Generate batches from the list of company names
batches = divide_list_into_batches(company_names, num_batches)

# Convert company names in each batch to search terms
all_batches = [[f"{company} greenwashing" for company in batch] for batch in batches]

# Using ThreadPoolExecutor to run tasks concurrently
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = [executor.submit(analyze_companies_news_wrapper, batch, folder_name) for batch in all_batches]
    for future in as_completed(futures):
        try:
            future.result()
        except Exception as exc:
            print(f'Batch generated an exception: {exc}')
        else:
            print('Batch completed')

import pandas as pd

def merge_excel_files(folder_name, output_file_name):
    # List all Excel files in the folder
    excel_files = [f for f in os.listdir(folder_name) if f.endswith('.xlsx') or f.endswith('.xls')]
    
    # Initialize an empty DataFrame to hold all the data
    merged_df = pd.DataFrame()
    
    # Loop through the Excel files and append them to the merged_df DataFrame
    for file in excel_files:
        file_path = os.path.join(folder_name, file)
        df = pd.read_excel(file_path)
        merged_df = pd.concat([merged_df, df], ignore_index=True)
    
    # Save the merged DataFrame to a new Excel file
    output_path = os.path.join(folder_name, output_file_name)
    merged_df.to_excel(output_path, index=False)
    print(f"All Excel files have been merged into '{output_path}'.")

# After the ThreadPoolExecutor block completes, call the merge_excel_files function
output_file_name = "merged_excel_files.xlsx"  # You can customize the output file name
merge_excel_files(folder_name, output_file_name)

Results saved to company_news_analysis_20240622_36434.xlsx
File 'company_news_analysis_20240622_36434.xlsx' moved to folder 'greenwashing_run_collection_20240622_QoHXwl'.
Batch completed
78
99
64
77
0.3335249175628026
0.3919228794865119
0.3366131643072153
0.39439234118431044
Results saved to company_news_analysis_20240622_42340.xlsx
File 'company_news_analysis_20240622_42340.xlsx' moved to folder 'greenwashing_run_collection_20240622_QoHXwl'.
Batch completed
0.3646273992090213
0.42014908722855826
0.359499031357994
0.4400491548909081
Results saved to company_news_analysis_20240622_94343.xlsx
File 'company_news_analysis_20240622_94343.xlsx' moved to folder 'greenwashing_run_collection_20240622_QoHXwl'.
Batch completed
0.3000341334118275
0.33397615172725637
0.29700204369146377
0.3831297548022121
Results saved to company_news_analysis_20240622_49327.xlsx
File 'company_news_analysis_20240622_49327.xlsx' moved to folder 'greenwashing_run_collection_20240622_QoHXwl'.
Batch completed
0.3183832