In [51]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score
from pandas.plotting import scatter_matrix

In [52]:
# pip install pypdf

In [53]:
# pip install python-docx

In [54]:
from pypdf import PdfReader
from docx import Document  # For handling Word files

import os
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from ipywidgets import widgets, VBox, HBox, Output, HTML, FloatText, Button
from IPython.display import display, clear_output
from ipywidgets import widgets, FileUpload, Button, Output, VBox

In [55]:
# Folder that is scanned for resumes and that uploaded files are written to.
directory_path = os.getcwd()
# Table of known documents; starts empty with the two columns the upload handlers fill in.
uploaded_df = pd.DataFrame(columns=['Document Name', 'Uploaded By']) 
# Shared Output widget that the handlers defined below render into.
output = Output()  

In [56]:
def list_existing_documents(folder):
    """Register every .pdf/.docx file found in ``folder`` in the global table.

    Files not yet listed in ``uploaded_df`` are appended with the uploader
    recorded as 'Unknown'. When the table has no rows it is rebuilt from the
    directory listing. The updated global ``uploaded_df`` is returned.
    """
    global uploaded_df

    def _unknown_rows(names):
        # One row per file name, uploader not known.
        return pd.DataFrame({'Document Name': names, 'Uploaded By': ['Unknown'] * len(names)})

    found = list(filter(lambda name: name.endswith(('.pdf', '.docx')), os.listdir(folder)))

    if not len(uploaded_df):
        uploaded_df = _unknown_rows(found)
        return uploaded_df

    already_listed = set(uploaded_df['Document Name'])
    unseen = [name for name in found if name not in already_listed]
    uploaded_df = pd.concat([uploaded_df, _unknown_rows(unseen)], ignore_index=True)
    return uploaded_df

def update_table():
    """Re-render the current document table inside the shared output widget."""
    current_table = uploaded_df
    with output:
        output.clear_output()
        display(current_table)

# Seed the table with the documents already present in the folder and show it.
list_existing_documents(directory_path)
update_table()

### Button
upload_widget = FileUpload(accept=".pdf, .docx", multiple=True)
username_input = widgets.Text(description="Uploaded By:", placeholder="Enter your name")
upload_button = Button(description="Upload Files", button_style='success')

def on_upload_click(b):
    """Save the selected files into ``directory_path`` and record who uploaded them.

    Parameters:
        b: the clicked button (unused; required by the ``on_click`` callback signature).

    Nothing is written when the uploader name is blank or when no file has
    been selected; a message is shown in ``output`` instead.
    """
    global uploaded_df
    uploader = username_input.value.strip()
    if not uploader:
        with output:
            output.clear_output()
            print("Please enter your name before uploading!")
        return

    # NOTE(review): iterating entries with 'name'/'content' keys matches the
    # ipywidgets 8 FileUpload.value layout (a tuple of dicts) - confirm the
    # installed version.
    selected_files = upload_widget.value
    if not selected_files:
        with output:
            output.clear_output()
            print("Please select at least one file to upload!")
        return

    new_rows = []
    for fileinfo in selected_files:
        # The name comes from the browser: keep only its final component so a
        # crafted name such as '../../x.pdf' cannot escape directory_path.
        filename = os.path.basename(fileinfo['name'].replace('\\', '/'))
        if not filename:
            continue
        filepath = os.path.join(directory_path, filename)
        with open(filepath, 'wb') as f:
            f.write(fileinfo['content'])
        new_rows.append({'Document Name': filename, 'Uploaded By': uploader})

    # Append all new rows in one concat instead of once per file.
    if new_rows:
        uploaded_df = pd.concat([uploaded_df, pd.DataFrame(new_rows)], ignore_index=True)

    with output:
        clear_output()
        print("Files uploaded successfully!")
        display(uploaded_df)

upload_button.on_click(on_upload_click)

In [57]:
# app_layout = VBox([
#     widgets.HTML("<h3>Uploaded Documents</h3>"),
#     output,
#     widgets.HTML("<h4>Upload New Files</h4>"),
#     username_input,
#     upload_widget,
#     upload_button
# ])

# display(app_layout)

In [58]:
pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [59]:
from ipywidgets import widgets
from IPython.display import display, clear_output

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
import re

In [61]:
# Function to extract text from all pages of a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    Parameters:
        pdf_path: path of the PDF file to read.

    Returns:
        str: the concatenated page texts; pages that yield no text contribute
        an empty string.
    """
    reader = PdfReader(pdf_path)
    # Fall back to "" so a page without extractable text cannot break the join.
    page_texts = [(page.extract_text() or "") for page in reader.pages]
    return "\n".join(page_texts)


# Function to extract text from a Word file
def extract_text_from_word(word_path):
    """Return the non-blank paragraphs of a Word document joined with newlines."""
    kept_paragraphs = []
    for paragraph in Document(word_path).paragraphs:
        content = paragraph.text
        if content.strip():  # skip empty / whitespace-only paragraphs
            kept_paragraphs.append(content)
    return "\n".join(kept_paragraphs)

# Function to clean resume text
def clean_text(text):
    """Normalize raw resume text for keyword matching.

    Removes URLs, hashtags and @-mentions, replaces punctuation with spaces,
    collapses runs of whitespace and strips any remaining non-word characters.

    Parameters:
        text: raw text extracted from a resume; ``None`` or empty text is accepted.

    Returns:
        str: the cleaned text with no leading/trailing whitespace ("" for empty input).
    """
    if not text:
        return ""
    # The \S and \s classes need their backslashes; without them the patterns
    # look for a literal 'S'/'s' and never match real URLs, hashtags or mentions.
    text = re.sub(r'http\S+\s*', ' ', text)  # remove URLs
    text = re.sub(r'#\S+', '', text)  # remove hashtags
    text = re.sub(r'@\S+', '  ', text)  # remove mentions
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""), ' ', text)  # remove punctuations
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with a single space
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text.strip()

# Function to extract candidate name from the file name
def extract_candidate_name(file_name):
    """Derive a display name for the candidate from a resume file name.

    The extension is dropped, underscores/hyphens become spaces, filler words
    (resume, cv, profile, doc, novYYYY) are removed and each remaining word is
    capitalized.
    """
    stem, _extension = os.path.splitext(file_name)
    spaced = re.sub(r'[_\-]+', ' ', stem)
    without_filler = re.sub(r'\b(resume|cv|profile|nov\d{4}|doc)\b', '', spaced, flags=re.IGNORECASE)
    return ' '.join(map(str.capitalize, without_filler.split()))

# Function to filter invalid n-grams
def is_valid_phrase(phrase):
    """Return True when an n-gram is usable as a skill phrase.

    A phrase is rejected when it contains a digit, contains a month
    abbreviation or 'gpa' as a whole word (case-insensitive), or is shorter
    than three characters.
    """
    has_digit = re.search(r'\d', phrase) is not None
    has_month_or_gpa = re.search(
        r'\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|gpa)\b', phrase, re.IGNORECASE
    ) is not None
    too_short = len(phrase) < 3
    return not (has_digit or has_month_or_gpa or too_short)

# Function to extract top N keywords or phrases and their frequencies
def extract_key_skills_with_freq(text, n=10, ngram_range=(1, 3)):
    """Return the ``n`` most frequent valid words/phrases of ``text`` with their counts.

    Parameters:
        text: the (cleaned) resume text.
        n: maximum number of phrases to return.
        ngram_range: (min_n, max_n) n-gram sizes passed to CountVectorizer.

    Returns:
        dict: phrase -> frequency, ordered from most to least frequent. Empty
        when the text is blank or contains nothing but English stop words.
    """
    if not text or not text.strip():
        return {}

    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english', min_df=1)
    try:
        counts = vectorizer.fit_transform([text])
    except ValueError as exc:
        # CountVectorizer raises ValueError("empty vocabulary; ...") when no
        # token survives stop-word removal; treat that as "no skills found"
        # and let every other ValueError (e.g. a bad ngram_range) propagate.
        if "empty vocabulary" in str(exc):
            return {}
        raise

    ranked = sorted(
        zip(vectorizer.get_feature_names_out(), counts.toarray()[0]),
        key=lambda item: item[1],
        reverse=True,
    )

    # Collect valid phrases in rank order and stop once n have been found.
    top_n_skills = {}
    for phrase, count in ranked:
        if len(top_n_skills) >= n:
            break
        if is_valid_phrase(phrase):
            top_n_skills[phrase] = count
    return top_n_skills

# Function to calculate keyword matches for each category
def categorize_resume(text, terms):
    """Count whole-word keyword matches in ``text`` for each category.

    Parameters:
        text: resume text to search; matching is case-insensitive.
        terms: mapping of category name -> iterable of keywords/phrases.

    Returns:
        dict: category name -> total number of keyword occurrences.
    """
    # Lowercase the text once instead of once per keyword.
    lowered_text = text.lower()
    category_scores = {}
    for category, keywords in terms.items():
        # Keywords are lowercased too, so a keyword written with capitals
        # still matches the lowercased text.
        category_scores[category] = sum(
            len(re.findall(r'\b' + re.escape(keyword.lower()) + r'\b', lowered_text))
            for keyword in keywords
        )
    return category_scores

In [62]:
def process_resumes(directory_path, terms, n_skills=10, ngram_range=(2, 3)):
    """Extract, clean and categorize every PDF/DOCX resume in a folder.

    Parameters:
        directory_path: folder containing the resume files.
        terms: mapping of category name -> keywords, passed to categorize_resume.
        n_skills: number of key phrases to keep per resume.
        ngram_range: n-gram sizes used when extracting key phrases.

    Returns:
        pd.DataFrame: one row per readable resume with the columns ID,
        File Name, Candidate Name, Resume, cleaned_resume, key skills,
        Expertise Area and Category Scores. The columns are present even when
        no resume was found.
    """
    columns = [
        "ID", "File Name", "Candidate Name", "Resume", "cleaned_resume",
        "key skills", "Expertise Area", "Category Scores",
    ]
    # Extension -> extractor; the lookup is case-insensitive so 'CV.PDF' is processed too.
    extractors = {'.pdf': extract_text_from_pdf, '.docx': extract_text_from_word}
    resume_data = []

    # Sort the listing so IDs are assigned in a reproducible order.
    for file_name in sorted(os.listdir(directory_path)):
        extractor = extractors.get(os.path.splitext(file_name)[1].lower())
        if extractor is None:
            continue

        file_path = os.path.join(directory_path, file_name)
        try:
            resume_text = extractor(file_path)
        except Exception as exc:
            # One corrupt or locked file should not abort the whole batch;
            # report it and carry on with the remaining resumes.
            print(f"Skipping '{file_name}': could not read file ({exc})")
            continue

        cleaned_text = clean_text(resume_text)
        candidate_name = extract_candidate_name(file_name)
        key_skills = extract_key_skills_with_freq(cleaned_text, n=n_skills, ngram_range=ngram_range)

        category_scores = categorize_resume(cleaned_text, terms)
        # max() on an all-zero dict would silently pick the first category and
        # fails outright on an empty one, so only name a category that
        # actually matched something.
        if category_scores and max(category_scores.values()) > 0:
            expertise_area = max(category_scores, key=category_scores.get)
        else:
            expertise_area = "Uncategorized"

        resume_data.append({
            # Consecutive IDs over processed resumes (not the directory index).
            "ID": len(resume_data) + 1,
            "File Name": file_name,
            "Candidate Name": candidate_name,
            "Resume": resume_text,
            "cleaned_resume": cleaned_text,
            "key skills": key_skills,
            "Expertise Area": expertise_area,
            "Category Scores": category_scores
        })

    return pd.DataFrame(resume_data, columns=columns)

In [63]:
# Keyword dictionary used to score resumes: category name -> list of lowercase
# keywords/phrases. categorize_resume counts whole-word occurrences of each
# keyword, so:
#   * a keyword only matches the exact word ('logistic' does not match
#     'logistics', 'rout' does not match 'route' or 'routing');
#   * 'gage r&r' cannot match text that went through clean_text, because
#     clean_text replaces '&' with a space;
#   * a keyword listed under several categories (e.g. 'aws', 'sql', 'agile')
#     adds to the score of each of them.
terms = {
    'Quality/Six Sigma': [
        'black belt', 'capability analysis', 'control charts', 'doe', 'dmaic', 'fishbone',
        'gage r&r', 'green belt', 'ishikawa', 'iso', 'kaizen', 'kpi', 'lean', 'metrics',
        'pdsa', 'performance improvement', 'process improvement', 'quality',
        'quality circles', 'quality tools', 'root cause', 'six sigma', 'stability analysis',
        'statistical analysis', 'tqm'
    ],
    'Operations Management': [
        'automation', 'bottleneck', 'constraints', 'cycle time', 'efficiency', 'fmea',
        'machinery', 'maintenance', 'manufacture', 'line balancing', 'oee', 'operations',
        'operations research', 'optimization', 'overall equipment effectiveness', 'pfmea',
        'process', 'process mapping', 'production', 'resources', 'safety', 'stoppage',
        'value stream mapping', 'utilization'
    ],
    'Supply Chain': [
        'abc analysis', 'apics', 'customer', 'customs', 'delivery', 'distribution', 'eoq',
        'epq', 'fleet', 'forecast', 'inventory', 'logistic', 'materials', 'outsourcing',
        'procurement', 'reorder point', 'rout', 'safety stock', 'scheduling', 'shipping',
        'stock', 'suppliers', 'third party logistics', 'transport', 'transportation',
        'traffic', 'supply chain', 'vendor', 'warehouse', 'wip', 'work in progress'
    ],
    'Project Management': [
        'administration', 'agile', 'budget', 'cost', 'direction', 'feasibility analysis',
        'finance', 'kanban', 'leader', 'leadership', 'management', 'milestones', 'planning',
        'pmi', 'pmp', 'problem', 'project', 'risk', 'schedule', 'scrum', 'stakeholders'
    ],
    'Data Analytics': [
        'analytics', 'api', 'aws', 'big data', 'business intelligence', 'clustering', 'code',
        'coding', 'data', 'database', 'data mining', 'data science', 'deep learning', 'hadoop',
        'hypothesis test', 'iot', 'internet', 'machine learning', 'modeling', 'nosql', 'nlp',
        'predictive', 'programming', 'python', 'r', 'sql', 'tableau', 'text mining',
        'visualization'
    ],
    'Healthcare': [
        'adverse events', 'care', 'clinic', 'cphq', 'ergonomics', 'healthcare',
        'health care', 'health', 'hospital', 'human factors', 'medical', 'near misses',
        'patient', 'reporting system'
    ],
    'Cloud': [
        'aws', 'azure', 'gcp', 'cloud computing', 'cloud architecture', 'cloud deployment', 'cloud services'
    ],
    'Software Development': [
        'programming', 'coding', 'software engineering', 'software design', 'agile', 'scrum', 
        'kanban', 'development lifecycle', 'sdlc', 'api integration', 'debugging'
    ],
    'Visualization Board': [
        'tableau', 'power bi', 'visualization', 'dashboards', 'data visualization', 
        'kpi dashboards', 'reporting tools', 'data charts'
    ],
    'Process or Flow Automation': [
        'process mapping', 'process automation', 'workflow automation', 'rpa', 
        'bottleneck analysis', 'value stream mapping', 'lean automation', 'optimization', 
        'cycle time reduction'
    ],
    'Database': [
        'sql', 'nosql', 'mysql', 'postgresql', 'oracle', 'database design', 
        'database optimization', 'mongodb', 'data storage', 'data warehousing'
    ],
    'Machine Learning and Modelling': [
        'machine learning', 'deep learning', 'predictive modeling', 'clustering', 
        'supervised learning', 'unsupervised learning', 'neural networks', 'regression analysis', 
        'model validation', 'feature engineering', 'reinforcement learning', 'time series modeling'
    ]
}

In [64]:
### Button
process_button = Button(description="Process Files", button_style='info')

def on_process_click(b):
    """Run the resume pipeline over ``directory_path`` and show the result.

    The processed table replaces the global ``uploaded_df``; progress messages
    and the table are rendered inside ``output``.
    """
    global uploaded_df
    with output:
        clear_output()
        print("Processing files...")
        processed = process_resumes(directory_path, terms, n_skills=10, ngram_range=(2, 3))
        uploaded_df = processed
        print("Uploaded files have been processed. Here's the extracted data:")
        display(processed)

process_button.on_click(on_process_click)

In [65]:
# app = VBox([
#     widgets.HTML("<h3>Process and Analyze Uploaded Resumes</h3>"),
#     process_button,
#     output
# ])

# display(app)

In [66]:
# Settings per visualization type: (DataFrame column holding the
# {label: value} dict, x-axis label, y-axis label, seaborn palette name).
_VISUALIZATION_SPECS = {
    'Key Skills': ('key skills', 'Frequency', 'Skills', 'viridis'),
    'Category Scores': ('Category Scores', 'Scores', 'Categories', 'magma'),
}


def _plot_bar_and_pie(data, title_prefix, x_label, y_label, palette):
    """Draw a horizontal bar chart and a pie chart for one {label: value} mapping."""
    labels = list(data.keys())
    values = list(data.values())
    if not labels:
        print(f"No data available for {title_prefix}.")
        return

    # Bar chart
    plt.figure(figsize=(10, 6))
    sns.barplot(x=values, y=labels, palette=palette)
    plt.title(f"{title_prefix} (Bar Chart)", fontsize=16)
    plt.xlabel(x_label, fontsize=12)
    plt.ylabel(y_label, fontsize=12)
    plt.show()

    # A pie chart needs a positive total to compute the wedge fractions.
    if sum(values) <= 0:
        print(f"All values are zero for {title_prefix}; pie chart skipped.")
        return

    # Pie chart
    plt.figure(figsize=(8, 8))
    plt.pie(
        values,
        labels=labels,
        autopct='%1.1f%%',
        startangle=140,
        colors=sns.color_palette(palette, len(labels))
    )
    plt.title(f"{title_prefix} (Pie Chart)", fontsize=16)
    plt.axis('equal')  # Equal aspect ratio ensures the pie chart is circular
    plt.show()


def visualize_data(df, id_value=None, name_value=None, visualization_type='Key Skills'):
    """Plot one candidate's key skills or category scores as a bar and a pie chart.

    Parameters:
        df: processed resume table (needs the columns ID, Candidate Name,
            'key skills' and 'Category Scores').
        id_value: candidate ID to look up; takes precedence over ``name_value``.
        name_value: candidate name to look up (case-insensitive) when no ID is given.
        visualization_type: 'Key Skills' or 'Category Scores'.

    Returns:
        None. Problems (no selector, unprocessed table, unknown candidate,
        unsupported type, nothing to plot) are reported with a printed message.
    """
    if id_value is None and name_value is None:
        print("Please provide either an ID or a Candidate Name.")
        return

    # Before the resumes are processed the table only lists file names.
    required_columns = {'ID', 'Candidate Name', 'key skills', 'Category Scores'}
    if not required_columns.issubset(df.columns):
        print("No processed resume data available. Please process the files first.")
        return

    if id_value is not None:
        row = df[df['ID'] == id_value]
    else:
        row = df[df['Candidate Name'].str.lower() == name_value.strip().lower()]

    if row.empty:
        if id_value is not None:
            print(f"No record found for ID {id_value}")
        else:
            print(f"No record found for Candidate Name '{name_value}'")
        return

    spec = _VISUALIZATION_SPECS.get(visualization_type)
    if spec is None:
        print(f"Visualization type '{visualization_type}' is not supported.")
        return

    column, x_label, y_label, palette = spec
    candidate_name = row['Candidate Name'].iloc[0]
    _plot_bar_and_pie(
        row[column].iloc[0],
        f"{visualization_type} for {candidate_name}",
        x_label,
        y_label,
        palette,
    )

In [67]:
id_input = widgets.IntText(description="Enter ID:")
name_input = widgets.Text(description="Enter Name:")
visualization_dropdown = widgets.Dropdown(
    options=['Key Skills', 'Category Scores'],
    description="Visualize:",
    value='Key Skills'
)

### Button
visualization_button = widgets.Button(description="Visualize", button_style='warning')


def on_visualize_click(b):
    """Redraw the query controls and plot the candidate selected by ID or name.

    A zero ID and an empty name are both treated as "not provided".
    """
    clear_output(wait=True)
    display(id_input, name_input, visualization_dropdown, visualization_button)

    visualize_data(
        uploaded_df,
        id_value=id_input.value or None,
        name_value=name_input.value or None,
        visualization_type=visualization_dropdown.value,
    )

visualization_button.on_click(on_visualize_click)

In [68]:
# app = VBox([
#     widgets.HTML("<h3>Query people's skills by candidate ID or name</h3>"),
#     id_input,
#     name_input,
#     visualization_dropdown,
#     visualization_button
# ])

# display(app)

In [69]:
# Function to count the frequency of a skill in the cleaned_resume column

def count_skill_frequency(df, skill):
    """Add a column counting whole-word occurrences of ``skill`` in each resume.

    Parameters:
        df: table with a 'cleaned_resume' text column. It is modified in place.
        skill: word or phrase to count; matching is case-insensitive.

    Returns:
        The same ``df`` object, with the new column "Frequency of '<skill>'".
        Rows whose 'cleaned_resume' value is not a string count as 0.
    """
    # Build the pattern once instead of once per row.
    pattern = re.compile(r'\b' + re.escape(skill.lower()) + r'\b')

    def _occurrences(resume_text):
        # Missing text (None / NaN) has no .lower(); it simply has no matches.
        if not isinstance(resume_text, str):
            return 0
        return len(pattern.findall(resume_text.lower()))

    df[f"Frequency of '{skill}'"] = df['cleaned_resume'].apply(_occurrences)
    return df

### Button
# Create input box and button for user interaction
# (button_style is a Button trait, not a Text trait, so it is only set on the button)
skill_input = widgets.Text(description="Enter Skill:")
skill_button = widgets.Button(description="Count Skill", button_style='warning')

# Button click event handler
def on_skill_click(b):
    """Count the entered skill in every processed resume and show the updated table."""
    clear_output(wait=True)
    display(skill_input, skill_button)
    
    # Get user input skill
    skill = skill_input.value.strip()
    if not skill:
        print("Please enter a skill!")
        return

    # The cleaned text only exists once the resumes have been processed.
    if 'cleaned_resume' not in uploaded_df.columns:
        print("No processed resume data available. Please process the files first.")
        return
    
    # Count frequency of the skill (adds the column to uploaded_df in place)
    updated_df = count_skill_frequency(uploaded_df, skill)
    display(updated_df)
    
# Bind button click event
skill_button.on_click(on_skill_click)

In [70]:
# Stand-alone skill query form: heading, skill text box and the "Count Skill" button.
app = VBox([
    widgets.HTML("<h3>Input a skill you are looking for</h3>"),
    skill_input,
    skill_button
])

# Render the form below this cell.
display(app)

############################3

VBox(children=(HTML(value='<h3>Input a skill you are looking for</h3>'), Text(value='', description='Enter Ski…

In [71]:
# sorting by one selected skill

def get_frequency_columns(df):
    """Return the names of the columns of ``df`` that start with 'Frequency of '."""
    matching = []
    for column_name in df.columns:
        if column_name.startswith("Frequency of "):
            matching.append(column_name)
    return matching

def sort_and_display(df, selected_skill_column):
    """Display ``df`` sorted by the given column, highest values first.

    Prints a message instead when the column does not exist.
    """
    if selected_skill_column not in df.columns:
        print(f"No column found for: {selected_skill_column}")
        return
    display(df.sort_values(selected_skill_column, ascending=False))

frequency_columns = get_frequency_columns(uploaded_df)

if not frequency_columns:
    print("No frequency columns found in the DataFrame!")

# Always create the dropdown (possibly with no options) so the click handler
# below can rely on it; its options are refreshed on every click.
dropdown = widgets.Dropdown(
    options=frequency_columns,
    description="Select Skill:",
    disabled=False
)

### Button
sort_button = widgets.Button(description="Sort by Skill", button_style='success')

def on_sort_click(b):
    """Sort the candidates by the skill-frequency column chosen in the dropdown."""
    clear_output(wait=True)

    # Skills may have been counted after this cell ran, so re-read the
    # available columns and keep the current selection when it is still valid.
    current_columns = get_frequency_columns(uploaded_df)
    if tuple(dropdown.options) != tuple(current_columns):
        previous_choice = dropdown.value
        dropdown.options = current_columns
        if previous_choice in current_columns:
            dropdown.value = previous_choice

    display(dropdown, sort_button)

    if not current_columns:
        print("No frequency columns found in the DataFrame!")
        return

    sort_and_display(uploaded_df, dropdown.value)

sort_button.on_click(on_sort_click)

No frequency columns found in the DataFrame!


In [72]:

# app = VBox([
#     widgets.HTML("<h3>Sort all candidates by the proficiency in one selected skill</h3>"),
#     dropdown,
#     sort_button
# ])

# display(app)


In [73]:
# 获取技能列
def get_skill_columns(df):
    """
    Return the names of all skill columns, i.e. those starting with 'Frequency of '.
    """
    return list(filter(lambda column_name: column_name.startswith("Frequency of "), df.columns))

# 生成权重输入表单
def input_skills_and_weights(df):
    """
    Build one weight input box per skill-frequency column of ``df``.

    Parameters:
        df (pd.DataFrame): table whose 'Frequency of ...' columns are the skills.

    Returns:
        dict: skill column name -> weight input widget (empty when there are no skill columns).
        list: the skill column names.
    """
    skill_columns = get_skill_columns(df)
    if not skill_columns:
        print("No skill frequency columns found!")
        return {}, []

    # Create the weight inputs dynamically, one per skill column.
    skill_inputs = {}
    for skill in skill_columns:
        skill_name = skill.replace("Frequency of ", "").strip("'")
        # min/max are traits of BoundedFloatText; a plain FloatText has no
        # such bounds and would not enforce the 0-100 range.
        skill_inputs[skill] = widgets.BoundedFloatText(
            description=f"{skill_name} Weight:",
            min=0,
            max=100,
            step=1,
            # Let the label take the width it needs instead of being cut off.
            style={'description_width': 'initial'}
        )

    return skill_inputs, skill_columns

# 提交按钮点击事件
def on_submit_click(b):
    """
    Handle the "Submit Weights" button: compute and show weighted skill scores.

    The weight form is rebuilt from the skill columns that exist in
    ``uploaded_df`` at click time. When the weights add up to 100 a
    'Weighted Score' column is written to ``uploaded_df`` and the table is
    shown sorted by it; otherwise a message asks the user to correct the weights.
    """
    import math  # only needed for the tolerance check on the weight total

    with output:
        clear_output()

        # Show the heading and the current table.
        display(HTML("<h2>Weighted Scores Calculation</h2>"))
        display(uploaded_df)

        # Skills may have been counted after the form was first built, so
        # create inputs for new skill columns while keeping the widgets (and
        # the weights already typed in) for known ones.
        fresh_inputs, current_columns = input_skills_and_weights(uploaded_df)
        for column, input_box in fresh_inputs.items():
            skill_inputs.setdefault(column, input_box)
        active_inputs = {column: skill_inputs[column] for column in current_columns}
        display_form(active_inputs, submit_button)

        if not active_inputs:
            # input_skills_and_weights has already reported the missing columns.
            return

        # Read the weights entered by the user.
        weights = {column: input_box.value for column, input_box in active_inputs.items()}

        # The weights must add up to 100%; compare with a tolerance because
        # they are floats.
        total_weight = sum(weights.values())
        if not math.isclose(total_weight, 100, rel_tol=0, abs_tol=1e-6):
            print(f"Total weight must sum to 100%, but got {total_weight:.2f}%. Please try again.")
            return

        # Weighted score = sum over skills of frequency * weight share.
        uploaded_df['Weighted Score'] = sum(
            uploaded_df[column] * (weight / 100) for column, weight in weights.items()
        )

        # Show the candidates ordered by weighted score.
        sorted_df = uploaded_df.sort_values('Weighted Score', ascending=False)
        print("Candidates sorted by weighted score:")
        display(sorted_df)

# 显示输入框和提交按钮
def display_form(skill_inputs, submit_button):
    """
    Show the skill weight input boxes followed by the submit button.
    """
    wrapped_inputs = []
    for input_box in skill_inputs.values():
        wrapped_inputs.append(VBox([input_box]))
    form = VBox([*wrapped_inputs, submit_button], layout=widgets.Layout(margin="20px 0px"))
    display(form)

In [74]:
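# Module 1, Candidate Pool and File Upload: shows the DataFrame and adds upload_button and process_button.
# Module 2, Key Skills Visualization: still shows the DataFrame and adds visualization_button so the user can visualize each candidate individually.
# Module 3, Query Skills: still shows the DataFrame and adds skill_button, sort_button and submit_button. The user enters skill keywords to compare candidates, and submit_button lets the user assign a weight to each skill entered so far to rank the candidates overall.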

In [75]:
from ipywidgets import widgets, VBox, HBox, Output, HTML, Button, Dropdown, FloatText, Layout
from IPython.display import display, clear_output

In [76]:
# Dashboard title shown above the page content.
header = HTML("<h1 style='text-align: center; margin-bottom: 20px;'>Resume Processing Dashboard</h1>")

# Page selector; its options are the three dashboard pages.
navigation_bar = widgets.Select(
    options=["Candidate Pool and File Upload", "Key Skills Visualization", "Query Skills"],
    description="Navigate:",
    layout=widgets.Layout(width="250px", height="300px", margin="20px 0px")
)

# New widget instances for the dashboard; these rebind the names created in the
# earlier upload/process cells. The click handlers are attached to these
# instances inside upload_and_process_files.
upload_widget = widgets.FileUpload(accept=".pdf, .docx", multiple=True)
username_input = widgets.Text(description="Uploaded By:", placeholder="Enter your name")
upload_button = Button(description="Upload Files", button_style='success')
process_button = Button(description="Process Files", button_style='info')


def upload_and_process_files():
    """
    Render the "Candidate Pool and File Upload" page inside ``output``.

    Shows the current table, wires the upload/process buttons to their
    handlers and displays the upload controls.
    """
    section_title = HTML("<h3 style='margin-top: 20px;'>Upload and Process Files</h3>")
    controls = VBox(
        [section_title, username_input, upload_widget, upload_button, process_button],
        layout=widgets.Layout(margin="20px 0px"),
    )

    with output:
        clear_output()
        display(HTML("<h2>Candidate Pool</h2>"))
        display(uploaded_df)

        upload_button.on_click(on_upload_click)
        process_button.on_click(on_process_click)

        display(controls)

In [77]:
visualization_button = Button(description="Visualize Key Skills", button_style='warning')


def visualize_key_skills():
    """
    Render the "Key Skills Visualization" page inside ``output``.

    Shows the current table together with the ID / name / chart-type inputs
    that on_visualize_click reads, and the button that triggers it.
    """
    with output:
        clear_output()
        display(HTML("<h2>Key Skills Visualization</h2>"))
        display(uploaded_df)

        visualization_button.on_click(on_visualize_click)
        display(VBox([
            HTML("<h3 style='margin-top: 20px;'>Visualize Key Skills</h3>"),
            # The handler reads these inputs, so they must be on the page
            # before the button is clicked.
            id_input,
            name_input,
            visualization_dropdown,
            visualization_button
        ], layout=widgets.Layout(margin="20px 0px")))

In [78]:
def query_skills():
    """Render the "Query Skills" page inside ``output``.

    Shows the current table, attaches the skill / sort / submit handlers and
    displays the skill query controls.
    """
    controls = VBox(
        [
            HTML("<h3 style='margin-top: 20px;'>Query Skills</h3>"),
            skill_input,
            skill_button,
            sort_button,
            submit_button,
        ],
        layout=widgets.Layout(margin="20px 0px"),
    )

    with output:
        clear_output()
        display(HTML("<h2>Query Skills</h2>"))
        display(uploaded_df)

        # Attach each button to its click handler.
        for button, handler in (
            (skill_button, on_skill_click),
            (sort_button, on_sort_click),
            (submit_button, on_submit_click),
        ):
            button.on_click(handler)

        # Show the skill query controls.
        display(controls)

# Weight inputs and skill columns derived from uploaded_df as it is when this
# cell runs; both are empty when no "Frequency of ..." column exists yet.
skill_inputs, skill_columns = input_skills_and_weights(uploaded_df)

# Widgets for the Query Skills page. These rebind skill_input, skill_button and
# sort_button from the earlier cells; query_skills attaches the handlers to
# these instances.
skill_input = widgets.Text(description="Enter Skill:")
skill_button = widgets.Button(description="Count Skill", button_style='primary')
sort_button = widgets.Button(description="Sort by Skill", button_style='success')
submit_button = widgets.Button(description="Submit Weights", button_style='success')

# New Output widget for the dashboard; it rebinds the global `output` that the
# page functions and handlers write to.
output = widgets.Output()

No skill frequency columns found!


In [79]:
def on_navigation_change(change):
    """Redraw the dashboard and render the page selected in the navigation bar.

    ``change['new']`` is the newly selected page title; an unknown title
    raises KeyError.
    """
    clear_output(wait=True)
    display(app_layout)

    selected_page = change['new']
    if selected_page == "Candidate Pool and File Upload":
        upload_and_process_files()
    elif selected_page == "Key Skills Visualization":
        visualize_key_skills()
    elif selected_page == "Query Skills":
        query_skills()
    else:
        raise KeyError(selected_page)

navigation_bar.observe(on_navigation_change, names='value')

In [80]:
# Two-column dashboard: navigation panel on the left, header plus the shared
# output area (where each page renders) on the right.
app_layout = HBox([
    VBox([
        HTML("<h2 style='margin-bottom: 20px;'>Navigation</h2>"),
        navigation_bar
    ], layout=widgets.Layout(width="250px", padding="10px", border="1px solid #ccc")),
    VBox([header, output], layout=widgets.Layout(flex="1", padding="20px"))
])

# Show the dashboard and render the first page.
display(app_layout)
upload_and_process_files()

HBox(children=(VBox(children=(HTML(value="<h2 style='margin-bottom: 20px;'>Navigation</h2>"), Select(descripti…

In [81]:
#!pip3 freeze > requirements.txt