##### Importing required libraries.

In [None]:
import pandas as pd
import re
import time
from duckduckgo_search import DDGS
from urllib.parse import urlparse

##### Reading Company Names from dataset.
###### Alternatively, you can prompt the user for company names and search for those instead.

In [None]:
# Load the company list from the CSV file; later cells read its "Company Names" column.
df=pd.read_csv("company_names.csv")
# Preview the first rows to check that the file loaded as expected.
df.head()

##### Fetching the first URL returned when searching for each company name.
###### I've used the DuckDuckGo search API because it doesn't require an API key or CSE ID. You can use alternatives such as the Google Custom Search API, or the googlesearch or BeautifulSoup libraries.

In [None]:
def get_first_link(query):
    """Return the URL of the first DuckDuckGo text result for *query*.

    Args:
        query: Search text (a company name). Missing values (None/NaN) and
            blank strings are accepted and short-circuit to None.

    Returns:
        The "href" of the first search result, or None when the query is
        missing/blank, the search returns nothing, or the result carries no
        "href" entry.
    """
    # Nothing to search for: skip both the delay and the network request.
    if query is None or pd.isna(query) or not str(query).strip():
        return None

    time.sleep(1)     #add delays to avoid triggering rate limits
    with DDGS() as ddgs:
        results = list(ddgs.text(str(query).strip(), max_results=1))

    if not results:
        return None
    # .get() keeps a result without an "href" key from raising KeyError.
    return results[0].get("href")

# Run one search per company name and store what get_first_link returns in "First_Link".
df['First_Link'] = df['Company Names'].apply(get_first_link)
# Display the DataFrame with the new column.
df

##### Common suffixes in company name to ignore.
###### These suffixes disturb the verification logic because they get included when extracting the initials of a company name or when comparing the company name with URLs (this will make more sense once the verification logic is explained below).

In [None]:
# Common suffixes (legal forms) to ignore when matching a company name to a domain.
# Lookups are done on words produced by clean_text(), which replaces "." with
# whitespace, so only the undotted spellings can actually match. The dotted
# spellings are kept for backward compatibility, and every legal form that was
# listed only in dotted form now also has its undotted spelling.
exclude_words = {
    "inc", "in", "inc.", "llc", "llc.", "ltd", "ltd.", "co", "co.", "corp", "corp.", "company", "corporation",
    "plc", "plc.", "gmbh", "s.a.", "s.a.s.", "s.r.l.", "b.v.", "n.v.", "pvt", "pvt.", "pte", "pte.",
    "limited", "llp", "llp.", "lp", "lp.", "sa", "ag", "oy", "ab", "as", "sarl", "k.k.", "s.p.a.",
    # Undotted spellings of the dotted-only entries above.
    "sas", "srl", "bv", "nv", "kk", "spa",
}

##### Verifying whether the obtained URL is the company's official website, using any one of the following 3 conditions.
######  First: Check if the company name is fully present in the domain name.
######  Second: Check if the first word of the Company's Name is present in the domain name, excluding the suffixes.
######  Third: Check if the initials of the Company's Name are present in the domain name, in exact order excluding the suffixes.

In [None]:
def clean_text(text):
    """Normalise *text* for name matching.

    Accented letters are folded to their base letter, every remaining
    character that is not an ASCII letter, digit or whitespace is replaced by
    a space, and the result is lowercased and stripped of surrounding
    whitespace. Runs of inner whitespace are not collapsed.

    Args:
        text: Any value; it is converted with str() first.

    Returns:
        The normalised string.
    """
    import unicodedata  # local import keeps this function self-contained

    # Decompose accented letters ("é" -> "e" + combining accent) and drop the
    # combining marks. Otherwise the regex below would replace the accented
    # letter with a space and split the word ("Nestlé" -> "nestl").
    decomposed = unicodedata.normalize("NFKD", str(text))
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    #Remove special characters (-, _, ., etc.) and convert to lowercase
    return re.sub(r'[^a-zA-Z0-9\s]', ' ', folded).lower().strip()

def extract_initials(company_name):
    """Build the initials of a company name, skipping common suffix words.

    The name is normalised with clean_text(), split on whitespace, and the
    first character of every word not listed in exclude_words is concatenated
    in order.
    """
    initials = []
    for token in clean_text(company_name).split():
        if token in exclude_words:
            continue  # suffix words contribute no initial
        initials.append(token[0])
    return "".join(initials)

def generate_word_list(company_name):
    """Return the words of a company name, in order, without common suffixes.

    The name is normalised with clean_text() and split on whitespace; words
    listed in exclude_words are dropped.
    """
    tokens = clean_text(company_name).split()
    return [token for token in tokens if token not in exclude_words]

def extract_domain(url):
    """Return the first label of the URL's host name, without a leading "www.".

    Examples: "https://www.example.com/about" -> "example",
    "http://localhost:8000/" -> "localhost".

    Args:
        url: URL string, or a missing value (None/NaN).

    Returns:
        The lowercased first host label, or "" when the URL is missing or no
        host can be parsed from it.
    """
    if pd.isna(url):
        return ""

    text = str(url).strip()
    # urlparse only fills in the host when the authority part is introduced by
    # "//", so scheme-less input such as "example.com/path" needs it added.
    if "://" not in text:
        text = "//" + text.lstrip("/")

    try:
        # .hostname (unlike .netloc) excludes any port and user-info and is
        # already lowercased.
        host = urlparse(text).hostname or ""
    except ValueError:
        # Raised by urlparse for malformed authorities (e.g. a broken IPv6 literal).
        return ""

    # Strip "www." only as a prefix; replacing it anywhere would corrupt hosts
    # that merely contain that text.
    if host.startswith("www."):
        host = host[len("www."):]

    return host.split('.')[0]  #Extract domain part only

def is_official_website(company, url):
    """Return *url* if it looks like the company's official website.

    The URL is accepted when any one of these holds for the domain label
    returned by extract_domain():
      1. the whole company name (whitespace removed) appears in the domain
         (punctuation such as "-" in the domain is ignored);
      2. the first non-suffix word of the company name appears in the domain;
      3. the initials of the non-suffix words (at least two letters) appear
         in the domain as one contiguous substring.

    Args:
        company: Company name; missing values (None/NaN) are tolerated.
        url: Candidate URL; missing values and blank strings are tolerated.

    Returns:
        The original *url* when a condition matches, otherwise the string
        "No Website".
    """
    no_match = "No Website"

    # Missing or blank inputs can never be verified.
    if not isinstance(url, str) or not url.strip():
        return no_match
    if company is None or pd.isna(company) or not str(company).strip():
        return no_match

    domain = extract_domain(url).lower()
    if not domain:
        return no_match

    company_names_cleaned = clean_text(company)
    company_names_words = generate_word_list(company)
    company_names_initials = extract_initials(company)

    # Condition 1. Exact Match
    # A domain never contains spaces, so compare the name with its whitespace
    # removed against the domain with its punctuation removed. The emptiness
    # check matters because "" is a substring of every string.
    name_compact = "".join(company_names_cleaned.split())
    domain_compact = re.sub(r'[^a-z0-9]', '', domain)
    if name_compact and name_compact in domain_compact:
        return url

    # Condition 2. Check if the first word from company name exists in domain
    # The word list is empty when the name consists only of ignored suffixes.
    if company_names_words and company_names_words[0] in domain:
        return url

    # Condition 3. Check if initials appear as a substring in exact order
    # A single initial would match almost any domain, so require at least two.
    if len(company_names_initials) >= 2 and company_names_initials in domain:
        return url

    return no_match

##### Validating the fetched links with the verification logic.

In [None]:
# Check each row's fetched link against its company name; "Result" receives the
# URL when is_official_website accepts it and the string "No Website" otherwise.
df["Result"] = df.apply(lambda row: is_official_website(row["Company Names"], row["First_Link"]), axis=1)
# Display the last rows to inspect the new column.
df.tail()

##### Extract the required data to a new dataframe.

In [None]:
# Keep only the company name and the verification outcome for the final output.
result=df[['Company Names','Result']]
# Display the selected columns.
result

##### Export the required data to an Excel file.

In [None]:
# Write the result to an Excel workbook; index=False leaves out the DataFrame index column.
result.to_excel('Official_Websites.xlsx', index=False)