In [13]:
!pip install sentence-transformers



In [80]:
!pip install PyPDF2

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [165]:
import pandas as pd
import re
import re
import csv
import PyPDF2
from sentence_transformers import SentenceTransformer, util

# Predefined skills list
skills_list = [
    # Programming Languages
    'Python', 'Java', 'C++', 'C#', 'JavaScript', 'TypeScript', 'Go', 'Rust', 'Kotlin', 'Swift', 'Ruby', 'PHP', 'Scala', 'Perl', 'R', 'Matlab',

    # Web Development
    'HTML', 'CSS', 'React', 'Angular', 'Vue.js', 'Next.js', 'Node.js', 'Django', 'Flask', 'Express.js', 'Spring Boot', 'ASP.NET', 'Laravel',

    # Machine Learning / Data Science / AI
    'Machine Learning', 'Deep Learning', 'Artificial Intelligence', 'Computer Vision', 'NLP', 'TensorFlow', 'PyTorch', 
    'Keras', 'Scikit-Learn', 'OpenCV', 'Hugging Face', 'Transformers', 'XGBoost', 'LightGBM', 'Pandas', 'NumPy', 'Matplotlib', 'Seaborn',
    'Artificial Intelligence Markup Language', 'Language Model', 'Claude', 'ChatGPT', 'Gemini',

    # Databases
    'MySQL', 'PostgreSQL', 'MongoDB', 'SQLite', 'Oracle', 'Redis', 'Cassandra', 'Elasticsearch', 'Firebase', 'Firestore', 'MS SQL Server', 'DynamoDB',

    # Cloud Platforms
    'AWS', 'Amazon Web Services', 'Azure', 'Google Cloud', 'GCP', 'Google Cloud Platforms (GCP)', 'IBM Cloud', 'Heroku', 'DigitalOcean',

    # DevOps / Tools
    'Docker', 'Kubernetes', 'Jenkins', 'Git', 'GitHub', 'Bitbucket', 'Terraform', 'Ansible', 'Prometheus', 'Grafana', 'CI/CD', 'Agile',
    'Power BI', 'Tableau', 'Microsoft 365', 'Microsoft PowerApps', 'MS-Word', 'MS-Office', 'MS-Excel',

    # Mobile App Development
    'Android', 'iOS', 'Flutter', 'React Native', 'SwiftUI', 'Xamarin',

    # UI/UX / Design
    'Figma', 'Adobe XD', 'Adobe Photoshop', 'Illustrator', 'Sketch', 'UI & UX Design',

    # Cybersecurity
    'Ethical Hacking', 'Penetration Testing', 'Wireshark', 'Nmap', 'Burp Suite', 'Kali Linux',

    # Big Data
    'Hadoop', 'Spark', 'Kafka', 'Hive', 'Airflow',

    # Content Creation / Writing
    'Content Writing', 'Blogging', 'Creative Writing', 'Content Editing', 'Search Engine Optimization (SEO)', 'Google Analytics', 'English Proficiency (Spoken)', 'English Proficiency (Written)',

    # Project Management / Soft Skills
    'Business Analysis', 'Project Management', 'Scrum Master', 'Agile Methodology', 'Effective Communication',

    # Certifications and Other
    'Certificate', 'Letter of Recommendation', 'Flexible work hours', '5 days a week', 'Free snacks & beverages', 'Job offer', 'Informal dress code'
]

# Extra "Common Skills" to inject for better extraction
extra_common_skills_text = (
    "Content Writing, Blogging, Creative Writing, Content Editing, "
    "Google Analytics, English Proficiency (Spoken), English Proficiency (Written), "
    "Project Management, Effective Communication, "
    "Certificate, Letter of Recommendation, Flexible work hours, 5 days a week, Free snacks & beverages, Job offer, Informal dress code"
)

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    text = ""
    with open(pdf_path, 'rb') as file:
        pdf = PyPDF2.PdfReader(file)
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text
    return text

def find_skills(text):
    """Find skills from text based on predefined skills list."""
    found_skills = []
    text_lower = text.lower()
    for skill in skills_list:
        if skill.lower() in text_lower:
            found_skills.append(skill)
    return found_skills

def process_text_and_save(text, output_csv):
    """Extract skills from given text and save full description along with each skill."""
    skills = find_skills(text)
    
    with open(output_csv, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Skill', 'Full Description'])
        
        if not skills:
            writer.writerow(['-', text])
        else:
            for skill in skills:
                writer.writerow([skill, text])

def main():
    """Main function to choose input type and process."""
    choice = input("Enter 1 for PDF input, 2 for Text input: ")

    if choice == '1':
        pdf_path = input("Enter the path to the PDF file: ")
        text = extract_text_from_pdf(pdf_path)
    elif choice == '2':
        print("Please enter your resume text directly:")
        text = input()  # Takes the entire resume text as one input (change to multi-line if needed)
    else:
        print("Invalid choice. Exiting.")
        return

    # Add extra common skills text for better matching
    text += extra_common_skills_text

    output_csv = 'resume_skills_output.csv'
    process_text_and_save(text, output_csv)
    print(f"✅ Extraction completed! Results saved to {output_csv}")


if __name__ == "__main__":
    main()


# Load pre-trained model once
model = SentenceTransformer('all-MiniLM-L6-v2')

# CRITICAL SKILLS
CRITICAL_SKILLS = {
    "data scientist": ["python", "machine learning"],
    "data analyst": ["sql", "data analysis"],
    "ui/ux designer": ["figma", "wireframing"],
    "frontend developer": ["html", "css", "javascript"],
    "backend developer": ["python", "java", "database"],
    "full stack developer": ["javascript", "node.js"],
    "golang developer": ["go"],
    "python developer": ["python"],
    "game developer": ["c++", "unity"],
    "wordpress": ["wordpress"],
    "devops engineer": ["docker", "kubernetes"],
    "mobile developer": ["flutter", "react native"]
}

def load_data():
    """Load resume and job data"""
    resume_df = pd.read_csv('resume_skills_output.csv')
    job_df = pd.read_csv('internships (2).csv')
    resume_skills = resume_df.iloc[:, 0].dropna().str.strip().str.lower().tolist()
    resume_description = resume_df.iloc[1, 1]
    return resume_skills, resume_description, job_df

def clean_skills(skill_text):
    """Clean and normalize skills text"""
    if pd.isna(skill_text):
        return []
    skill_text = re.sub(r'[\[\]\'\"]', '', str(skill_text))
    return [skill.strip().lower() for skill in skill_text.split(',') if skill.strip()]

def identify_role(title):
    """Flexible role identification with common variations"""
    title = title.lower()
    variations = {
        "wordpress": ["wordpress", "wp developer"],
        "golang": ["golang", "go developer", "go lang"],
        "devops": ["devops", "cloud engineer"],
        "mobile": ["mobile", "flutter", "react native"]
    }
    for role, terms in variations.items():
        if any(term in title for term in terms):
            return role
    for role in CRITICAL_SKILLS:
        if role in title:
            return role
    return None

def calculate_zero_tolerance_similarity(resume_skills, job_skills_text, job_title):
    """Skill-based matching with critical skill requirement"""
    job_skills = clean_skills(job_skills_text)
    role = identify_role(job_title)
    
    if not role:
        if not job_skills:
            return 0
        matches = sum(1 for skill in job_skills if skill in resume_skills)
        return matches / len(job_skills)
    
    critical_skills = CRITICAL_SKILLS.get(role, [])
    for skill in critical_skills:
        if skill not in resume_skills:
            return 0  # Critical skill missing
    
    all_skills = critical_skills + job_skills
    if not all_skills:
        return 0
    matches = sum(1 for skill in all_skills if skill in resume_skills)
    return matches / len(all_skills)

def calculate_description_similarity(resume_description, job_descriptions):
    """Description-based semantic similarity using sentence transformer"""
    resume_embedding = model.encode(resume_description, convert_to_tensor=True)
    job_embeddings = model.encode(job_descriptions.tolist(), convert_to_tensor=True)
    cosine_scores = util.cos_sim(resume_embedding, job_embeddings)
    return cosine_scores[0].cpu().numpy()

def main():
    resume_skills, resume_description, job_df = load_data()
    
    # Initialize similarity columns with default values
    job_df['Skill Similarity Percentage'] = 0.0
    job_df['Description Similarity Percentage'] = 0.0
    
    # Check available columns
    has_skills_column = 'Skills' in job_df.columns
    has_description_column = 'Description' in job_df.columns
    has_job_description_column = 'Job Description' in job_df.columns
    
    # Skill-based similarity (if available)
    if has_skills_column:
        job_df['Skill Similarity Score'] = job_df.apply(
            lambda row: calculate_zero_tolerance_similarity(
                resume_skills,
                str(row['Skills']),
                str(row['Title'])
            ),
            axis=1
        )
        job_df['Skill Similarity Percentage'] = (job_df['Skill Similarity Score'] * 100).round(2)
    
    # Description-based similarity (handle multiple possible column names)
    if has_description_column and has_job_description_column:
        # If both columns exist, ask user which one to use
        print("\nFound both 'Description' and 'Job Description' columns in the data.")
        choice = input("Which column should be used for description similarity? (1 for Description, 2 for Job Description): ")
        if choice == '1':
            description_column = 'Description'
        else:
            description_column = 'Job Description'
    elif has_description_column:
        description_column = 'Description'
    elif has_job_description_column:
        description_column = 'Job Description'
    else:
        raise ValueError("No description column found (neither 'Description' nor 'Job Description' exists in the data)")
    
    print(f"\nUsing column '{description_column}' for description similarity calculations")
    job_df['Description Similarity Score'] = calculate_description_similarity(
        resume_description, 
        job_df[description_column]
    )
    job_df['Description Similarity Percentage'] = (job_df['Description Similarity Score'] * 100).round(2)
    
    # Final Combined Score - adjust weights based on available columns
    if has_skills_column:
        # 50% skill, 50% description when skills are available
        job_df['Final Score'] = (job_df['Skill Similarity Percentage'] + job_df['Description Similarity Percentage']) / 2
    else:
        # 100% description when skills are not available
        job_df['Final Score'] = job_df['Description Similarity Percentage']
    
    job_df['Final Score'] = job_df['Final Score'].round(2)
    
    # Save final results
    job_df_sorted = job_df.sort_values(by='Final Score', ascending=False)
    job_df_sorted.to_csv('final_combined_job_matches.csv', index=False)
    
    # Display results
    print("\nTOP JOB MATCHES BASED ON FINAL SCORE:")
    columns_to_show = ['Title', 'Description Similarity Percentage', 'Final Score']
    if has_skills_column:
        columns_to_show.insert(1, 'Skill Similarity Percentage')
    print(job_df_sorted[columns_to_show])

if __name__ == "__main__":
    main()



Enter 1 for PDF input, 2 for Text input:  1
Enter the path to the PDF file:  My_resume_2_f (1).pdf


✅ Extraction completed! Results saved to resume_skills_output.csv

Using column 'Description' for description similarity calculations

TOP JOB MATCHES BASED ON FINAL SCORE:
                                   Title  Skill Similarity Percentage  \
11                      Machine Learning                       100.00   
6                       Machine Learning                       100.00   
16          Artificial Intelligence (AI)                       100.00   
27          Artificial Intelligence (AI)                       100.00   
33              Python And AI Instructor                       100.00   
7           Artificial Intelligence (AI)                        83.33   
2           Artificial Intelligence (AI)                       100.00   
26          Artificial Intelligence (AI)                        87.50   
10              Data Science/AI Engineer                        75.00   
28                 Algorithm Development                        80.00   
5                      C