In [1]:
from flair.models import SequenceTagger
from flair.data import Sentence


tagger = SequenceTagger.load('ner')
def extract_person_names(text,company,url):
    try:
        sentence = Sentence(text)
        tagger.predict(sentence)
        person_names = set(entity.text for entity in sentence.get_spans('ner') if entity.tag == 'PER')
        print(f'Done for company {company} and webpage {url}')
        ans =  list(person_names)

        ans = [s.replace(company, "") for s in ans]
        return ans
    except Exception as e:
        print(e)

        return []



2024-08-29 20:10:04,519 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [18]:
import requests
from bs4 import BeautifulSoup
import os
from PyPDF2 import PdfReader
from urllib.parse import urlparse

def extract_text(web_url, company):
    try:
        if web_url.lower().endswith('.pdf'):
            pdf_filename = os.path.basename(urlparse(web_url).path)
            
            if not os.path.exists('documents'):
                os.makedirs('documents')
            
            pdf_filepath = os.path.join('documents', pdf_filename)
            
            response = requests.get(web_url)
            with open(pdf_filepath, 'wb') as f:
                f.write(response.content)
            
            with open(pdf_filepath, 'rb') as f:
                reader = PdfReader(f)
                text = ""
                for page in reader.pages:
                    text += page.extract_text()
            
            print(f'Done for {web_url}')
            return text

        else:
            response = requests.get(web_url)
            soup = BeautifulSoup(response.content, 'html.parser')
            text = soup.get_text()
            print(f'Done for {web_url}')
            return text

    except Exception as e:
        print(f'Failed at {web_url} and {company}: {str(e)}')




In [33]:
import requests
from bs4 import BeautifulSoup

def google_search(query,num):
    # Encode the query string for the URL
    query = query.replace(' ', '+')
    url = f"https://www.google.com/search?q={query}?num={num}"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status() 

    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    for item in soup.find_all('div', class_='yuRUbf'):
        a_tag = item.find('a')
        if a_tag and a_tag.get('href'):
            links.append(a_tag['href'])

    return links



In [20]:
company = input('Enter Company Name')

query = f'{company} UK Testimonials'

company_testimonials = google_search(query,10)
company_testimonials

['https://www.trustpilot.com/review/www.barclays.co.uk',
 'https://www.barclayscorporate.com/client-experience/client-stories/',
 'https://www.reviews.io/company-reviews/store/barclays',
 'https://www.barclays.co.uk/business-banking/business-insight/feedback-economy/',
 'https://www.glassdoor.co.in/Reviews/Barclays-Reviews-E3456.htm',
 'https://www.ib.barclays/investment-banking/client-stories.html',
 'https://www.glassdoor.co.uk/Reviews/Barclays-Reviews-E3456.htm',
 'https://www.salesforce.com/uk/customer-success-stories/barclays/',
 'https://www.tripadvisor.com/ShowUserReviews-g190762-d21340132-r935930825-Barclays_Bank_Building-Wolverhampton_West_Midlands_England.html',
 'https://home.barclays/who-we-are/']

In [21]:
import concurrent.futures

company_testimonials = [{
    'company':company,
    'web_url': company_testimonials
}]
def process_testimonial(comp):
    company = comp['company']
    web_sites = []
    for url in comp['web_url']:
        text = extract_text(url, company) 

        if text:
            web_sites.append({'company': company, 'web_content': str(text),'url':url})
    return web_sites

all_web_sites = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(process_testimonial, company_testimonials)

    for result in results:
        all_web_sites.extend(result)



Done for https://www.trustpilot.com/review/www.barclays.co.uk
Done for https://www.barclayscorporate.com/client-experience/client-stories/
Done for https://www.reviews.io/company-reviews/store/barclays
Done for https://www.barclays.co.uk/business-banking/business-insight/feedback-economy/
Done for https://www.glassdoor.co.in/Reviews/Barclays-Reviews-E3456.htm
Done for https://www.ib.barclays/investment-banking/client-stories.html
Done for https://www.glassdoor.co.uk/Reviews/Barclays-Reviews-E3456.htm
Done for https://www.salesforce.com/uk/customer-success-stories/barclays/
Done for https://www.tripadvisor.com/ShowUserReviews-g190762-d21340132-r935930825-Barclays_Bank_Building-Wolverhampton_West_Midlands_England.html
Done for https://home.barclays/who-we-are/


In [22]:
row_list = []

def process_website(part):
    try:
        text = part['web_content']
        company = part['company']
        url = part['url']
        names = extract_person_names(text,company,url)
        
        return [{'Name': name, 'Company': company,'Testimonial Page':url} for name in names]
    except:
        print(names)
        return []

for web_site in all_web_sites:
    result = process_website(web_site)
    row_list.extend(result)       

Done for company Barclays and webpage https://www.trustpilot.com/review/www.barclays.co.uk
Done for company Barclays and webpage https://www.barclayscorporate.com/client-experience/client-stories/
Done for company Barclays and webpage https://www.reviews.io/company-reviews/store/barclays
Done for company Barclays and webpage https://www.barclays.co.uk/business-banking/business-insight/feedback-economy/
Done for company Barclays and webpage https://www.glassdoor.co.in/Reviews/Barclays-Reviews-E3456.htm
Done for company Barclays and webpage https://www.ib.barclays/investment-banking/client-stories.html
Done for company Barclays and webpage https://www.glassdoor.co.uk/Reviews/Barclays-Reviews-E3456.htm
Done for company Barclays and webpage https://www.salesforce.com/uk/customer-success-stories/barclays/
Done for company Barclays and webpage https://www.tripadvisor.com/ShowUserReviews-g190762-d21340132-r935930825-Barclays_Bank_Building-Wolverhampton_West_Midlands_England.html
Done for comp

In [23]:
len(row_list)

81

In [24]:
row_list

[{'Name': 'Marcelo',
  'Company': 'Barclays',
  'Testimonial Page': 'https://www.trustpilot.com/review/www.barclays.co.uk'},
 {'Name': 'Sneha',
  'Company': 'Barclays',
  'Testimonial Page': 'https://www.trustpilot.com/review/www.barclays.co.uk'},
 {'Name': 'Sneha Bhambhani',
  'Company': 'Barclays',
  'Testimonial Page': 'https://www.trustpilot.com/review/www.barclays.co.uk'},
 {'Name': 'Diane',
  'Company': 'Barclays',
  'Testimonial Page': 'https://www.trustpilot.com/review/www.barclays.co.uk'},
 {'Name': 'Diane Drew',
  'Company': 'Barclays',
  'Testimonial Page': 'https://www.trustpilot.com/review/www.barclays.co.uk'},
 {'Name': 'Hakim',
  'Company': 'Barclays',
  'Testimonial Page': 'https://www.trustpilot.com/review/www.barclays.co.uk'},
 {'Name': 'Hossein Zaimi',
  'Company': 'Barclays',
  'Testimonial Page': 'https://www.ib.barclays/investment-banking/client-stories.html'},
 {'Name': 'Adrian Beidas',
  'Company': 'Barclays',
  'Testimonial Page': 'https://www.ib.barclays/inves

In [25]:
import pandas as pd

df = pd.DataFrame(row_list)


In [34]:
for index,row in df.iterrows():
    try:
        name = row['Name']
        query = f'{name} UK profile linkedin'
        link = google_search(query,1)[0]
        df.at[index,'Linkedin Profile'] = link
        print(f'Done for index {index}')
    except Exception as e:
        print(e)
        pass
    

Done for index 0
Done for index 1
Done for index 2
Done for index 3
Done for index 4
Done for index 5
Done for index 6
Done for index 7
Done for index 8
Done for index 9
Done for index 10
Done for index 11
Done for index 12
Done for index 13
Done for index 14
Done for index 15
Done for index 16
Done for index 17
Done for index 18
Done for index 19
Done for index 20
Done for index 21
Done for index 22
Done for index 23
Done for index 24
Done for index 25
Done for index 26
Done for index 27
Done for index 28
Done for index 29
Done for index 30
Done for index 31
Done for index 32
Done for index 33
('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
Done for index 35
Done for index 36
Done for index 37
Done for index 38
Done for index 39
Done for index 40
Done for index 41
Done for index 42
Done for index 43
429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3DArif%2BVohra%2B%2BUK%2Bprofile%2B

In [37]:
df.to_excel('Updated_list.xlsx',index=False)

In [38]:
import openpyxl
from openpyxl.utils import get_column_letter

def convert_urls_to_hyperlinks(excel_file, sheet_name, url_column):
    workbook = openpyxl.load_workbook(excel_file)
    sheet = workbook[sheet_name]

    col_letter = get_column_letter(url_column)

    for row in range(2, sheet.max_row + 1): 
        cell = sheet[f"{col_letter}{row}"]
        if isinstance(cell.value, str) and cell.value.startswith("http"):
            sheet[f"{col_letter}{row}"].hyperlink = cell.value
            sheet[f"{col_letter}{row}"].value = 'Link'
            sheet[f"{col_letter}{row}"].style = "Hyperlink"

    workbook.save(excel_file)

excel_file = 'Updated_list.xlsx'
sheet_name = "Sheet1"
url_column = 3  

convert_urls_to_hyperlinks(excel_file, sheet_name, url_column)
url_column = 4
convert_urls_to_hyperlinks(excel_file, sheet_name, url_column)