<a href="https://colab.research.google.com/github/Aastik01us/NLP-tokenization/blob/main/dataextraction2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 📦 Step 1: Install required libraries
# NOTE: the leading "!" is notebook shell syntax (the cell runs pip as a shell
# command); this line is not valid in a plain Python script.
!pip install pandas requests beautifulsoup4

# 📚 Step 2: Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time

# 📂 Step 3: Ask for the input CSV ("company_data_output 2.csv") via Colab's upload widget
from google.colab import files
uploaded = files.upload()

# 📋 Step 4: Read the first uploaded file and add blank columns for the scraped fields
first_upload = list(uploaded)[0]
df = pd.read_csv(first_upload)
for new_column in ("Team Size", "Industry", "Office Locations"):
    df[new_column] = ""

# 🧠 Step 5: Scraping functions
# Browser-like User-Agent header passed to every requests.get() call
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115 Safari/537.36",
}

def scrape_linkedin_info(linkedin_url):
    """Fetch a LinkedIn page and extract team size and industry from its text.

    The page is downloaded, flattened to plain text, and searched with two
    regular expressions. This is best-effort: any failure yields
    ``"Not Found"`` rather than an exception.

    Args:
        linkedin_url: URL of the page to fetch. Anything that is not a
            non-empty string (for example the NaN that pandas produces for
            an empty CSV cell) is treated as "no URL" and no request is made.

    Returns:
        A ``(team_size, industry)`` tuple of strings. Either element is
        ``"Not Found"`` when the page could not be fetched or the
        corresponding pattern did not match.
    """
    not_found = ("Not Found", "Not Found")

    # Guard against missing cells so we never hand a float/None to requests.
    if not isinstance(linkedin_url, str) or not linkedin_url.strip():
        return not_found

    try:
        response = requests.get(linkedin_url.strip(), headers=headers, timeout=10)
        # Do not scrape error pages (403/404/...) as if they were content.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Only network/HTTP failures are expected here; report and move on.
        print(f"⚠️ Could not fetch {linkedin_url}: {exc}")
        return not_found

    soup = BeautifulSoup(response.text, "html.parser")
    text = soup.get_text(separator=' ', strip=True)

    # Team Size: a whole number (optionally with thousands separators and a
    # trailing "+") followed by " employees" / " staff" / " people".
    # The lookbehind stops the match from starting in the middle of a number,
    # so "10001 employees" yields "10001" instead of a truncated "001".
    team_match = re.search(
        r'(?<![\d,])(\d+(?:,\d{3})*\+?)(?: employees| staff| people)',
        text,
        re.IGNORECASE,
    )
    team_size = team_match.group(1) if team_match else "Not Found"

    # Industry: everything after the literal word "Industry" made of word
    # characters, whitespace, "&", "," or "-". The class is greedy, so the
    # capture can run on into following page text.
    industry_match = re.search(r'Industry\s*([\w\s&,-]+)', text)
    industry = industry_match.group(1).strip() if industry_match else "Not Found"

    return team_size, industry

def scrape_office_locations(website_url):
    """Fetch a company website and extract up to five "City, Country" mentions.

    The page is downloaded, flattened to plain text, and searched for runs of
    capitalised words followed by one of a fixed list of country/region
    names. This is best-effort: any failure yields ``"Not Found"``.

    Args:
        website_url: URL of the page to fetch. Anything that is not a
            non-empty string (for example the NaN that pandas produces for
            an empty CSV cell) is treated as "no URL" and no request is made.

    Returns:
        A string with up to five distinct matches joined by ``", "`` in the
        order they first appear on the page (each match may itself contain a
        comma), or ``"Not Found"`` if nothing matched or the fetch failed.
    """
    # Guard against missing cells so we never hand a float/None to requests.
    if not isinstance(website_url, str) or not website_url.strip():
        return "Not Found"

    try:
        response = requests.get(website_url.strip(), headers=headers, timeout=10)
        # Do not scrape error pages (403/404/...) as if they were content.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Only network/HTTP failures are expected here; report and move on.
        print(f"⚠️ Could not fetch {website_url}: {exc}")
        return "Not Found"

    soup = BeautifulSoup(response.text, "html.parser")
    text = soup.get_text(separator=' ', strip=True)

    # Basic location pattern match (e.g., city, country).
    # The country alternation is a NON-capturing group on purpose: with a
    # capturing group re.findall returns only the group (the bare country
    # name) and silently drops the city part of every match.
    addresses = re.findall(
        r'\b(?:[A-Z][a-z]+\s?)+(?:,?\s?(?:India|USA|UK|Germany|Singapore|Europe|Canada|UAE))\b',
        text,
    )

    # De-duplicate while keeping first-seen order, so the five entries kept
    # are deterministic (a set would pick an arbitrary five).
    unique_locations = list(dict.fromkeys(match.strip() for match in addresses))
    return ', '.join(unique_locations[:5]) if unique_locations else "Not Found"

# 🔁 Step 6: Visit every company row and fill in the scraped columns
for idx, row in df.iterrows():
    print(f"🔍 Processing: {row['Company']}...")

    # Read both URLs first so a missing column fails before any request is sent
    linkedin_link, site_link = row["LinkedIn URL"], row["Website"]

    size, sector = scrape_linkedin_info(linkedin_link)
    offices = scrape_office_locations(site_link)

    # Write the three results back into this row
    scraped_values = (
        ("Team Size", size),
        ("Industry", sector),
        ("Office Locations", offices),
    )
    for column, value in scraped_values:
        df.at[idx, column] = value

    time.sleep(2)  # 💤 pause between companies to reduce the chance of being blocked

# 💾 Step 7: Write the enriched table to disk (without the index column)
# and hand the file to the browser through Colab's download helper
output_csv = "enriched_company_data_full.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)
print("✅ Done! Full CSV downloaded.")




Saving company_data_output 2.csv to company_data_output 2.csv
🔍 Processing: TCS...
🔍 Processing: Accenture...
🔍 Processing: Cognizant...
🔍 Processing: ICICI Bank...
🔍 Processing: HDFC Bank...
🔍 Processing: Wipro...
🔍 Processing: Infosys...
🔍 Processing: Capgemini...
🔍 Processing: Tech Mahindra...
🔍 Processing: Genpact...
🔍 Processing: HCL Technologies...
🔍 Processing: Axis Bank...
🔍 Processing: Amazon...
🔍 Processing: IBM...
🔍 Processing: Concentrix Corporation...
🔍 Processing: Larsen & Toubro...
🔍 Processing: Reliance jio...
🔍 Processing: Vodafone Idea...
🔍 Processing: HDB Financial Services...
🔍 Processing: Teleperformance...
🔍 Processing: Kotak Mahindra Bank...
🔍 Processing: Reliance Industries...
🔍 Processing: Bharti Airtel...
🔍 Processing: Deloitte...
🔍 Processing: Tata Motors...
🔍 Processing: Reliance Retail...
🔍 Processing: WNS...
🔍 Processing: Mahindra & Mahindra...
🔍 Processing: IndusInd Bank...
🔍 Processing: Flipkart...
🔍 Processing: DXC Technology...
🔍 Processing: BYJU'S...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Done! Full CSV downloaded.
