In [18]:
import pandas as pd
import spacy
import re

In [19]:
# Read the training set (resume text, job-description text, label) from disk.
train_csv_path = 'data/train.csv'
df = pd.read_csv(train_csv_path)

# Preview the leading rows as a quick check that the file parsed as expected.
df.head()

Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit


In [20]:
# Load spaCy's small English pipeline. The extraction functions in this notebook
# call it for sentence segmentation (`.sents`) and named entities (`.ents`).
nlp = spacy.load("en_core_web_sm")

# Education vocabulary, declared as themed groups and then flattened into one
# list. Group order and the order of terms inside each group are preserved.

# Degrees and certifications
_EDU_DEGREE_TERMS = (
    "bachelor", "master", "phd", "associate", "doctorate", "diploma",
    "high school", "certificate", "degree", "undergraduate", "postgraduate",
    "graduate", "minor", "major", "double major", "honors society",
    "summa cum laude", "magna cum laude", "cum laude", "certification",
    "credential",
)

# Institutions
_EDU_INSTITUTION_TERMS = (
    "university", "college", "institute", "academy", "school", "polytechnic",
    "faculty", "campus", "research center", "study abroad",
)

# Courses and programs
_EDU_COURSE_TERMS = (
    "course", "program", "curriculum", "study", "studying", "degree program",
    "elective", "core subject", "capstone project", "thesis", "dissertation",
    "academic research", "extracurricular",
)

# Academic achievements
_EDU_ACHIEVEMENT_TERMS = (
    "graduated", "academic excellence", "gpa", "grade", "class rank",
    "dean's list", "scholarship", "fellowship", "merit", "achievement",
    "distinction", "recognition",
)

# Time and status descriptors
_EDU_STATUS_TERMS = (
    "currently enrolled", "alumni", "attended", "completed", "ongoing",
    "expected graduation", "year of graduation", "semester", "term",
    "academic year",
)

# Miscellaneous
_EDU_MISC_TERMS = (
    "education", "academic background", "learning", "studies", "knowledge",
    "honor society", "exchange program", "dual degree",
)

EDUCATION_KEYWORDS = [
    *_EDU_DEGREE_TERMS,
    *_EDU_INSTITUTION_TERMS,
    *_EDU_COURSE_TERMS,
    *_EDU_ACHIEVEMENT_TERMS,
    *_EDU_STATUS_TERMS,
    *_EDU_MISC_TERMS,
]

# Vocabulary of work-experience terms, grouped by theme.
# Every term appears exactly once: the original list repeated "contract" and
# "role", which would double-count them in any per-keyword tally or generate
# redundant alternatives if the list is joined into a pattern.
# NOTE(review): "KPIs" is the only mixed-case entry; if this list is matched
# against lower-cased text it will never hit — confirm intended usage.
EXPERIENCE_KEYWORDS = [
    # General experience descriptors
    "experience", "worked", "managed", "developed", "created", "handled", "led", "oversaw",
    "contributed", "collaborated", "participated", "designed", "executed", "organized",
    "facilitated", "coordinated", "supported", "implemented", "improved", "streamlined",
    "enhanced", "optimized", "delivered", "achieved", "initiated", "produced", "generated",
    "completed", "launched", "innovated", "deployed", "maintained", "supervised",
    "trained", "mentored", "consulted", "analyzed", "researched", "tested", "debugged",

    # Roles and titles
    "manager", "developer", "engineer", "analyst", "specialist", "director", "consultant",
    "intern", "supervisor", "executive", "coordinator", "technician", "administrator",
    "advisor", "officer", "architect", "strategist", "trainer", "representative", "leader",
    "assistant", "associate", "planner", "marketer", "writer", "editor", "programmer",
    "designer", "scientist", "operator", "controller",

    # Time-related keywords
    "year", "years", "month", "months", "week", "weeks", "from", "to", "since", "until",
    "duration", "tenure", "period", "timeline", "dates", "date range", "previous",
    "current", "recent", "ongoing", "temporary", "contract",

    # Organizational descriptors
    "company", "organization", "firm", "agency", "startup", "enterprise", "corporation",
    "nonprofit", "foundation", "institution", "business", "team", "department",
    "division", "branch", "entity", "unit", "network", "group",

    # Contextual achievements and contributions
    "milestone", "goal", "target", "results", "success", "impact", "growth", "revenue",
    "profit", "sales", "market", "cost", "budget", "savings", "efficiency", "effectiveness",
    "change", "transformation", "metrics", "KPIs", "performance", "improvement",
    "benchmarks", "compliance", "expansion", "quality", "scalability", "standardization",

    # Work context ("contract" is already listed under time-related keywords)
    "internship", "project", "assignment", "engagement", "initiative",
    "program", "campaign", "task", "responsibility", "challenge", "role", "position",
    "title", "occupation", "career", "job", "scope", "objective"
]


In [21]:
# Regex for education phrases: one trigger word on word boundaries, then any run
# of word characters, whitespace and the punctuation - . : , (case-insensitive).
# The whole match is wrapped in a single capturing group.
_EDUCATION_TRIGGERS = (
    "GPA", "degree", "bachelor", "master", "phd", "associate",
    "university", "college", "school", "diploma", "certificate", "course",
)
EDUCATION_PATTERN = re.compile(
    r'(\b(?:' + "|".join(_EDUCATION_TRIGGERS) + r')\b[\w\s\-\.:,]*)',
    flags=re.IGNORECASE,
)
# Regex for experience phrases: one trigger word/phrase on word boundaries, then
# any run of word characters, whitespace and the punctuation - . : ,
# (case-insensitive). The whole match is wrapped in a single capturing group.
_EXPERIENCE_TRIGGERS = (
    "years experience", "months experience", "internship", "worked at",
    "project", "manager", "supervisor", "employment", "job", "leadership",
)
EXPERIENCE_PATTERN = re.compile(
    r'(\b(?:' + "|".join(_EXPERIENCE_TRIGGERS) + r')\b[\w\s\-\.:,]*)',
    flags=re.IGNORECASE,
)

In [22]:
# Function to extract sentences containing education-related terms
def extract_education_sentences(text):
    """Return the sentences of ``text`` that contain an education trigger word.

    Args:
        text: Raw text to scan. Non-string values (e.g. ``None`` or the NaN
            that pandas uses for missing CSV cells) and blank strings are
            treated as "no text".

    Returns:
        list[str]: Whitespace-stripped sentences, in document order, for which
        ``EDUCATION_PATTERN`` finds a match; ``[]`` when there is no text.
    """
    # spaCy raises on non-string input, so bail out before parsing.
    if not isinstance(text, str) or not text.strip():
        return []
    return [
        sent.text.strip()
        for sent in nlp(text).sents
        if EDUCATION_PATTERN.search(sent.text)
    ]

# Function to extract sentences containing experience-related terms
def extract_experience_sentences(text):
    """Return the sentences of ``text`` that contain an experience trigger.

    Args:
        text: Raw text to scan. Non-string values (e.g. ``None`` or the NaN
            that pandas uses for missing CSV cells) and blank strings are
            treated as "no text".

    Returns:
        list[str]: Whitespace-stripped sentences, in document order, for which
        ``EXPERIENCE_PATTERN`` finds a match; ``[]`` when there is no text.
    """
    # spaCy raises on non-string input, so bail out before parsing.
    if not isinstance(text, str) or not text.strip():
        return []
    return [
        sent.text.strip()
        for sent in nlp(text).sents
        if EXPERIENCE_PATTERN.search(sent.text)
    ]

# Function to extract full keywords or phrases for education
def extract_full_education_keywords(text):
    """Collect the distinct education phrases found in ``text``.

    Phrases come from two sources, merged and de-duplicated:
      * named entities labelled ORG, PRODUCT, NORP, FAC, WORK_OF_ART or EVENT
        whose text matches ``EDUCATION_PATTERN``;
      * every ``EDUCATION_PATTERN`` match in the raw text.

    Args:
        text: Raw text to scan. Non-string values (e.g. ``None`` or the NaN
            that pandas uses for missing CSV cells) and blank strings are
            treated as "no text".

    Returns:
        list[str]: Whitespace-stripped phrases in sorted order, so repeated
        runs produce the same output; ``[]`` when there is no text.
    """
    # spaCy raises on non-string input, so bail out before parsing.
    if not isinstance(text, str) or not text.strip():
        return []

    entity_labels = {'ORG', 'PRODUCT', 'NORP', 'FAC', 'WORK_OF_ART', 'EVENT'}
    education_phrases = {
        ent.text.strip()
        for ent in nlp(text).ents
        if ent.label_ in entity_labels and EDUCATION_PATTERN.search(ent.text)
    }
    # The pattern's trailing character class admits whitespace, so raw matches
    # can end in spaces/newlines; strip them so they de-duplicate against the
    # (already stripped) entity phrases.
    education_phrases.update(
        match.group(0).strip() for match in EDUCATION_PATTERN.finditer(text)
    )
    # Set iteration order is arbitrary; sort for a reproducible result.
    return sorted(education_phrases)

# Function to extract full keywords or phrases for experience
def extract_full_experience_keywords(text):
    """Collect the distinct experience phrases found in ``text``.

    Phrases come from two sources, merged and de-duplicated:
      * named entities labelled ORG, WORK_OF_ART, PRODUCT or EVENT whose text
        matches ``EXPERIENCE_PATTERN``;
      * every ``EXPERIENCE_PATTERN`` match in the raw text.

    Args:
        text: Raw text to scan. Non-string values (e.g. ``None`` or the NaN
            that pandas uses for missing CSV cells) and blank strings are
            treated as "no text".

    Returns:
        list[str]: Whitespace-stripped phrases in sorted order, so repeated
        runs produce the same output; ``[]`` when there is no text.
    """
    # spaCy raises on non-string input, so bail out before parsing.
    if not isinstance(text, str) or not text.strip():
        return []

    entity_labels = {'ORG', 'WORK_OF_ART', 'PRODUCT', 'EVENT'}
    experience_phrases = {
        ent.text.strip()
        for ent in nlp(text).ents
        if ent.label_ in entity_labels and EXPERIENCE_PATTERN.search(ent.text)
    }
    # The pattern's trailing character class admits whitespace, so raw matches
    # can end in spaces/newlines; strip them so they de-duplicate against the
    # (already stripped) entity phrases.
    experience_phrases.update(
        match.group(0).strip() for match in EXPERIENCE_PATTERN.finditer(text)
    )
    # Set iteration order is arbitrary; sort for a reproducible result.
    return sorted(experience_phrases)

In [23]:
# Sentence-level extraction. Education sentences come from the resume only;
# experience sentences are the job-description list followed by the resume list
# (adding two Series of lists concatenates the lists row by row).
df['education_sentences'] = df['resume_text'].apply(extract_education_sentences)
df['experience_sentences'] = (
    df['job_description_text'].apply(extract_experience_sentences)
    + df['resume_text'].apply(extract_experience_sentences)
)

# Phrase-level extraction, using the same sources as above.
df['education_full_keywords'] = df['resume_text'].apply(extract_full_education_keywords)
# Both halves must use the keyword extractor: the resume half previously called
# extract_experience_sentences, which mixed whole sentences into the phrase list.
df['experience_full_keywords'] = (
    df['job_description_text'].apply(extract_full_experience_keywords)
    + df['resume_text'].apply(extract_full_experience_keywords)
)


In [24]:
# Build the export frame: the two raw text columns plus the extracted phrase
# lists, with the text columns renamed as part of the same expression.
export_columns = [
    'resume_text', 'job_description_text',
    'education_full_keywords', 'experience_full_keywords',
]
output_df = df[export_columns].rename(columns={
    'resume_text': 'education',
    'job_description_text': 'experience',
})

# Write the frame to CSV without the index column.
output_path = 'eduAndExp.csv'
output_df.to_csv(output_path, index=False)

# Report where the file went and show the first rows of what was written.
print(f"Processed results saved to {output_path}")
preview_columns = ['education','experience', 'education_full_keywords', 'experience_full_keywords']
print(output_df[preview_columns].head())

Processed results saved to eduAndExp.csv
                                           education  \
0  SummaryHighly motivated Sales Associate with e...   
1  Professional SummaryCurrently working with Cat...   
2  SummaryI started my construction career in Jun...   
3  SummaryCertified Electrical Foremanwith thirte...   
4  SummaryWith extensive experience in business/r...   

                                          experience  \
0  Net2Source Inc. is an award-winning total work...   
1  At Salas OBrien we tell our clients that were ...   
2  Schweitzer Engineering Laboratories (SEL) Infr...   
3  Mizick Miller & Company, Inc. is looking for a...   
4  Life at Capgemini\nCapgemini supports all aspe...   

                             education_full_keywords  \
0  [Economics - Marshall University, GPA, Marshal...   
1  [University,DeKalb,ILGPA:GPA: 3.3Industrial En...   
2  [GPA:Expected in06, School Diploma:Fleming Isl...   
3  [University-Orange,CAGPA:Status-Emphasis in Ci...   
4  [S