In [None]:
!pip install gradio #install gradio for Webapp
!pip install flair #flair for AI(NLP) skill detection

import gradio as gr
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import random
import threading
import os

from flair.models import SequenceTagger
from flair.data import Sentence

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Load the Flair skill-detection tagger once, at import time, so that
# get_skills() reuses the same instance for every job description.
# NOTE(review): 'kaliani/flair-ner-skill' looks like a hosted model id, which
# would mean a download on first use — confirm.
flair_model = SequenceTagger.load('kaliani/flair-ner-skill')

# LinkedIn URL mappings
# Each mapping goes from the label shown in the UI to the query-string
# fragment appended to the search URL. Labels are listed in code order, so the
# numeric code is simply the 1-based position of the label.
experience_level_mapping = {
    label: f"f_E={code}"
    for code, label in enumerate(
        ("Internship", "Entry level", "Associate", "Mid-Senior level"),
        start=1,
    )
}

work_type_mapping = {
    label: f"f_WT={code}"
    for code, label in enumerate(("On-site", "Hybrid", "Remote"), start=1)
}

# The posting-age filter is expressed in seconds (1, 7 and 30 days).
time_filter_mapping = {
    label: f"f_TPR=r{days * 86400}"
    for label, days in (
        ("Past 24 hours", 1),
        ("Past week", 7),
        ("Past month", 30),
    )
}

# Function to extract skills using Flair
def get_skills(text):
    """Return the skill phrases that the Flair tagger finds in ``text``.

    Args:
        text: Free text to analyse (a job description in this file). May be
            empty or ``None``.

    Returns:
        A list with the text of every span in the sentence's ``'ner'`` layer,
        in the order Flair reports them. An empty list is returned for empty,
        whitespace-only or ``None`` input without invoking the model.
    """
    # Nothing to tag: skip building an empty Sentence and running the model.
    if not text or not text.strip():
        return []

    sentence = Sentence(text)
    flair_model.predict(sentence)
    return [span.text for span in sentence.get_spans('ner')]

# Scraper manager class
class ScraperManager:
    """Shared state of a scraping run.

    Attributes:
        stop_event: ``threading.Event`` that is set to request a stop.
        current_df: ``pandas.DataFrame`` holding one row per stored job.
        lock: ``threading.Lock`` taken by ``add_job`` while it swaps in the
            grown table.
    """

    def __init__(self):
        self.lock = threading.Lock()
        self.current_df = pd.DataFrame()
        self.stop_event = threading.Event()

    def reset(self):
        """Start from a clean slate: empty table and a cleared stop flag."""
        self.current_df = pd.DataFrame()
        self.stop_event.clear()

    def add_job(self, job_data):
        """Append ``job_data`` (a mapping of column name to value) as a new row."""
        row = pd.DataFrame([job_data])
        with self.lock:
            # concat builds a new frame; the old one is never mutated in place.
            self.current_df = pd.concat(
                [self.current_df, row],
                ignore_index=True,
            )


# Single shared instance used by the scraping functions and the UI callbacks.
scraper_manager = ScraperManager()

# Browser User-Agent strings; one is chosen at random for each detail-page
# request.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
)

# CSS selectors tried in order; the first one that matches supplies the
# description text.
# NOTE(review): every other class name used in this file follows the
# double-underscore convention (e.g. 'base-search-card__title'), so the
# single-underscore 'show-more-less-html_markup' looks like a typo. Both
# spellings are tried; confirm against the live markup and drop the wrong one.
_DESCRIPTION_SELECTORS = (
    'div.description__text',
    'div.show-more-less-html__markup',
    'div.show-more-less-html_markup',
    'div.core-section-container__content',
    'section.core-section-container',
)


def _fetch_description(link):
    """Download the job detail page at ``link`` and return ``(description, skills)``.

    Falls back to ``("Description not available", [])`` when the request or
    the parsing fails; the error is printed, not raised.
    """
    desc = "Description not available"
    skills = []

    # The session is only needed for this one request, so close it afterwards
    # instead of leaking one connection pool per job.
    with requests.Session() as session:
        retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        session.mount('https://', HTTPAdapter(max_retries=retries))

        try:
            # Random pause before every detail request.
            time.sleep(random.uniform(2, 5))
            response = session.get(
                link,
                headers={
                    'User-Agent': random.choice(_USER_AGENTS),
                    'Accept-Language': 'en-Us,en;q=0.9',
                },
                timeout=10,
            )

            job_soup = BeautifulSoup(response.text, 'html.parser')
            for selector in _DESCRIPTION_SELECTORS:
                desc_element = job_soup.select_one(selector)
                if desc_element:
                    desc = desc_element.get_text('\n').strip()
                    skills = get_skills(desc)
                    break
        except Exception as e:
            # Best effort: a job without a description is still reported.
            print(f"Error processing {link}: {e}")

    return desc, skills


# Process individual job
def process_job(job, work_type, exp_level, position, tech_keywords=None):
    """Turn one search-result card into a result row.

    Args:
        job: Search-result card exposing ``find()`` (a BeautifulSoup element
            in this file).
        work_type: Work-type label copied into the row.
        exp_level: Experience-level label copied into the row.
        position: Searched position copied into the row.
        tech_keywords: Optional iterable of keywords. When given and
            non-empty, the job is kept only if at least one keyword occurs
            (case-insensitive substring match) in the title, the description
            or the detected skills.

    Returns:
        A dict with the keys "Position", "Date", "Work type", "Level",
        "Title", "Company", "Location", "Link", "Description" and "Skills",
        or ``None`` when the card lacks a required element, when no keyword
        matches, or when processing the card raises.
    """
    try:
        title_element = job.find('h3', class_='base-search-card__title')
        company_element = job.find('a', class_='hidden-nested-link')
        loc_element = job.find('span', class_='job-search-card__location')
        link_element = job.find('a', class_='base-card__full-link')

        if not all([title_element, company_element, loc_element, link_element]):
            return None

        title = title_element.text.strip()
        company = company_element.text.strip()
        loc = loc_element.text.strip()
        # Drop the query string so only the bare job URL is kept.
        link = link_element['href'].split('?')[0]

        desc, skills = _fetch_description(link)

        # Filter jobs by preferred tech keywords
        if tech_keywords:
            text_to_check = " ".join([title, desc] + skills).lower()
            if not any(keyword.lower() in text_to_check for keyword in tech_keywords):
                return None

        return {
            "Position": position,
            "Date": datetime.now().strftime('%Y-%m-%d'),
            "Work type": work_type,
            "Level": exp_level,
            "Title": title,
            "Company": company,
            "Location": loc,
            "Link": f"[{link}]({link})",
            "Description": desc,
            # Only the first five detected skills are reported.
            "Skills": ", ".join(skills[:5]) if skills else "No skills detected",
        }

    except Exception as e:
        print(f"Error processing job card: {e}")
        return None

# Step between `start` offsets when paging through a search, and the cap on
# results walked per (work type, experience level) combination.
# NOTE(review): 25 is presumably LinkedIn's page size — confirm.
_PAGE_SIZE = 25
_MAX_JOBS_PER_SEARCH = 100


def _parse_job_count(soup, default=_PAGE_SIZE):
    """Return the result count shown on a search page, or ``default`` if unreadable."""
    count_element = soup.find('span', class_='results-context-header__job-count')
    if count_element is None:
        return default
    # Keep digits only so that values such as "1,000+" still parse.
    digits = ''.join(ch for ch in count_element.text if ch.isdigit())
    return int(digits) if digits else default


def _build_search_url(location, keywords, work_type, exp_level, time_filter):
    """Build the search URL for one filter combination.

    ``keywords`` and ``location`` are percent-encoded so characters such as
    '#', '&' or ',' cannot break the query string. An unknown or missing
    ``time_filter`` is left out instead of raising.
    """
    from urllib.parse import quote

    filters = [
        work_type_mapping[work_type],
        experience_level_mapping[exp_level],
    ]
    time_param = time_filter_mapping.get(time_filter)
    if time_param:
        filters.append(time_param)
    filters.append("radius=0")

    return (
        "https://www.linkedin.com/jobs/search/"
        f"?keywords={quote(keywords, safe='')}&location={quote(location, safe='')}"
        f"&{'&'.join(filters)}"
    )


# Scrape jobs for locations, positions
def scrape_jobs(location, position, work_types, exp_levels, time_filter, tech_keywords=None):
    """Scrape one location/position pair and store every accepted job.

    This is a generator: it yields ``None`` each time a job has been added to
    ``scraper_manager``. It stops early as soon as
    ``scraper_manager.stop_event`` is set.

    Args:
        location: Location text for the search.
        position: Search keywords; may already be percent-encoded (the caller
            in this file replaces spaces with ``%20``).
        work_types: Iterable of keys of ``work_type_mapping``.
        exp_levels: Iterable of keys of ``experience_level_mapping``.
        time_filter: Key of ``time_filter_mapping``; any other value
            (including ``None``) disables the posting-age filter.
        tech_keywords: Optional keyword list forwarded to ``process_job``.
    """
    from urllib.parse import unquote

    # Nothing selected means nothing to iterate; skip the session setup.
    if not work_types or not exp_levels:
        return

    # Decode once so the URL is encoded exactly once and the "Position"
    # column shows readable text instead of "%20".
    keywords = unquote(position)

    with requests.Session() as session:
        retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        session.mount('https://', HTTPAdapter(max_retries=retries))

        for work_type in work_types:
            for exp_level in exp_levels:
                if scraper_manager.stop_event.is_set():
                    return
                try:
                    base_url = _build_search_url(location, keywords, work_type, exp_level, time_filter)

                    try:
                        response = session.get(base_url, timeout=10)
                        soup = BeautifulSoup(response.text, 'html.parser')
                        total_jobs = _parse_job_count(soup)
                    except Exception:
                        # Count unavailable: still try the first page.
                        total_jobs = _PAGE_SIZE

                    total_jobs = min(total_jobs, _MAX_JOBS_PER_SEARCH)

                    for start in range(0, total_jobs, _PAGE_SIZE):
                        if scraper_manager.stop_event.is_set():
                            return
                        # Random pause between page requests.
                        time.sleep(random.uniform(2, 5))
                        url = f"{base_url}&start={start}"
                        try:
                            response = session.get(url, timeout=10)
                            soup = BeautifulSoup(response.text, 'html.parser')
                            jobs = soup.find_all('div', class_='base-card')
                        except Exception as e:
                            print(f"Failed to scrape page {start}: {e}")
                            continue

                        random.shuffle(jobs)
                        for job in jobs:
                            if scraper_manager.stop_event.is_set():
                                return
                            job_data = process_job(job, work_type, exp_level, keywords, tech_keywords)
                            if job_data:
                                scraper_manager.add_job(job_data)
                                yield

                except Exception as e:
                    # One failing combination must not abort the others.
                    print(f"Scraping error: {e}")

def _split_csv(text):
    """Split a comma-separated string into stripped, non-empty parts."""
    return [part.strip() for part in (text or "").split(',') if part.strip()]


# Run scraper from Gradio
def run_scrapper(cities, states, positions, work_types, exp_levels, time_filter, tech_keywords):
    """Run a scrape in a background thread and stream progress.

    Args:
        cities: Comma-separated city names.
        states: Comma-separated states/countries.
        positions: Comma-separated positions to search for.
        work_types: Selected work-type labels.
        exp_levels: Selected experience-level labels.
        time_filter: Selected posting-age label.
        tech_keywords: Comma-separated keywords used to filter jobs.

    Yields:
        ``(status, dataframe)`` tuples: "Scraping in progress..." roughly
        every half second while the worker thread is alive, then a final
        "Scraping Completed!" or "Scraping stopped!" with the full table.
    """
    scraper_manager.reset()
    cities_list = _split_csv(cities)
    states_list = _split_csv(states)
    if cities_list and states_list:
        locations = [f"{city}, {state}" for city in cities_list for state in states_list]
    else:
        # Only one of the two fields was filled in: search on it directly
        # instead of ending up with no location at all.
        locations = cities_list or states_list
    positions_list = [p.replace(' ', '%20') for p in _split_csv(positions)]
    tech_keywords_list = _split_csv(tech_keywords)

    def worker():
        for loc in locations:
            for pos in positions_list:
                if scraper_manager.stop_event.is_set():
                    return
                # scrape_jobs is a generator; drain it to drive the scrape.
                for _ in scrape_jobs(loc, pos, work_types, exp_levels, time_filter, tech_keywords_list):
                    pass

    thread = threading.Thread(target=worker)
    thread.start()

    while thread.is_alive():
        time.sleep(0.5)
        # Take the reference under the lock but yield outside it: a suspended
        # generator must not keep the worker's add_job() blocked. add_job
        # rebinds current_df to a new frame, so the snapshot is never mutated.
        with scraper_manager.lock:
            snapshot = scraper_manager.current_df
        yield 'Scraping in progress...', snapshot

    yield "Scraping Completed!" if not scraper_manager.stop_event.is_set() else "Scraping stopped!", scraper_manager.current_df

# Save to CSV
def save_csv(df, filename):
    """Write ``df`` to ``<filename>.csv`` and return a status message.

    Args:
        df: Table to save; must provide ``to_csv`` (a pandas DataFrame in
            this file). ``None`` is reported as an error.
        filename: Target name, with or without a ".csv" suffix. Surrounding
            whitespace is ignored; empty or ``None`` falls back to "my_jobs".

    Returns:
        "Saved as <path>" on success, otherwise a message starting with
        "Error saving CSV:".
    """
    # NOTE(review): the name is used as a path as-is, so it may point outside
    # the working directory; restrict it if the app is exposed to others.
    name = (filename or "").strip() or "my_jobs"
    # Do not produce "jobs.csv.csv" when the extension was already typed.
    file_path = name if name.lower().endswith(".csv") else f"{name}.csv"

    if df is None:
        return "Error saving CSV: no data to save"

    try:
        df.to_csv(file_path, index=False)
        return f"Saved as {file_path}"
    except Exception as e:
        return f"Error saving CSV: {e}"

# Gradio App
# Builds the UI: search inputs, start/stop buttons, a status box, the results
# table and the CSV export controls, then wires them to the callbacks above.
with gr.Blocks() as app:
    gr.Markdown("""
    <div style='text-align: center; color: #f67d3c; font-size: 2em; font-weight: bold; margin: 20px 0; padding: 10px;'>
        AI-Powered Linkedin Job Scraper
    </div>
    """)

    with gr.Row():
        with gr.Column():
            cities = gr.Textbox(label="Cities (comma-separated)")
            states = gr.Textbox(label="States/Countries (comma-separated)")
            positions = gr.Textbox(label="Positions (comma-separated)")
            work_types = gr.CheckboxGroup(list(work_type_mapping.keys()), label="Work Types")
            exp_levels = gr.CheckboxGroup(list(experience_level_mapping.keys()), label="Experience Levels")
            time_filter = gr.Dropdown(list(time_filter_mapping.keys()), label="Time Filter")
            tech_keywords = gr.Textbox(label="Preferred Technologies/Fields (comma-separated)", placeholder="Python, ML, AI")

            with gr.Row():
                start_btn = gr.Button("Start Scraping", variant="primary")
                stop_btn = gr.Button("Stop Scraping", variant="secondary")

        status = gr.Textbox(label="Status")
        # Columns mirror the keys of the rows built by process_job, including
        # "Description". "Link" cells hold Markdown links ("[url](url)"), so
        # that column uses the "markdown" datatype to render them as links.
        results = gr.Dataframe(
            headers=["Position", "Date", "Work type", "Level", "Title", "Company", "Location", "Link", "Description", "Skills"],
            datatype=["str", "str", "str", "str", "str", "str", "str", "markdown", "str", "str"],
            interactive=False
        )

        with gr.Row():
            filename = gr.Textbox(label="Filename (optional)", placeholder="my_jobs")
            save_btn = gr.Button("Save to csv", variant="secondary")
            save_status = gr.Textbox(label="Save status")

        # run_scrapper is a generator, so status and table update as it yields.
        start_btn.click(
            run_scrapper,
            inputs=[cities, states, positions, work_types, exp_levels, time_filter, tech_keywords],
            outputs=[status, results]
        )

        # Only raises the stop flag; the scraping code checks it between steps.
        stop_btn.click(
            lambda: scraper_manager.stop_event.set(),
            outputs=[]
        )

        # Saves whatever is currently shown in the results table.
        save_btn.click(
            save_csv,
            inputs=[results, filename],
            outputs=save_status
        )

if __name__ == "__main__":
    app.launch()
