In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import ast

# Load datasets
candidates_df = pd.read_csv('/Users/harshrajmishra/Documents/SIH 24/Final_Candidates_Dataset.csv')
print("candidates dataset")
print(candidates_df.head(10))

experts_df = pd.read_csv('/Users/harshrajmishra/Documents/SIH 24/Final_Expert_Dataset.csv')
print("Expert dataset")
print(experts_df.head(10))

jobs_df = pd.read_csv('/Users/harshrajmishra/Documents/SIH 24/Job_dataset.csv')
print("Jobs dataset")
print(jobs_df.head(10))

# Data Cleaning
# Handling potential inconsistencies in the Jobs Dataset (Skills column)
def clean_skills(skills):
    if isinstance(skills, str) and skills.startswith('['):
        return ','.join(ast.literal_eval(skills))
    return skills

jobs_df['Skills Required'] = jobs_df['Skills Required'].apply(clean_skills)

# Data Cleaning completed
print("Data Cleaning completed.\n")

# Feature Engineering
label_encoder = LabelEncoder()

# Skip label encoding for Domains
candidates_df['Education'] = label_encoder.fit_transform(candidates_df['Education'])
candidates_df['Domains'] = candidates_df['Domains'].apply(lambda x: ','.join(x.split(',')))

experts_df['Domain'] = experts_df['Domain'].apply(lambda x: ','.join(x.split(',')))
experts_df['Qualifications'] = label_encoder.fit_transform(experts_df['Qualifications'])

# Skip label encoding for Domains in jobs dataset
jobs_df['Education Level'] = label_encoder.fit_transform(jobs_df['Education Level'])

# Standardizing numerical features (e.g., experience, number of skills)
scaler = StandardScaler()

candidates_df['Experience'] = scaler.fit_transform(candidates_df[['Experience']])
candidates_df['Number of Skills'] = scaler.fit_transform(candidates_df[['Number of Skills']])

experts_df['Years of experience'] = scaler.fit_transform(experts_df[['Years of experience']])
experts_df['Expert Number of Skills'] = scaler.fit_transform(experts_df[['Expert Number of Skills']])

jobs_df['Year of Experience'] = scaler.fit_transform(jobs_df[['Year of Experience']])

print("Feature Engineering completed.\n")

# Combine domains back into strings if they are still lists
candidates_df['Domains'] = candidates_df['Domains'].apply(lambda x: ','.join(x) if isinstance(x, list) else x)
experts_df['Domain'] = experts_df['Domain'].apply(lambda x: ','.join(x) if isinstance(x, list) else x)

print("Feature Engineering and Domain Correction completed.\n")






candidates dataset
   ID                                     Candidate Name      Education  \
0   1                                Ananta Prasad Sethi  Undergraduate   
1   2                 Nayakpara Hiteshbhai Bhagvanjibhai  Undergraduate   
2   3                                         Raj Kishor  Undergraduate   
3   4              Mahender Singh Alias Swami Chetnanand  Undergraduate   
4   5                             Kambale Devadas Govind  Undergraduate   
5   6                          Kumar Shantilal Virjibhai  Undergraduate   
6   7                            Konda Vishweshwar Reddy  Undergraduate   
7   8                                    Dr. Ramji Singh         Master   
8   9                                             Ialiya  Undergraduate   
9  10  Koradia Chandreshbhai Valjibhai (Chandresh Patel)  Undergraduate   

   Experience                                             Skills  \
0           7  Cryptography,Cyber Defense,Simulation & Modeli...   
1           2  Guid

In [2]:
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import jaccard_score
from scipy.stats import pearsonr
import pandas as pd
import numpy as np

# Function to preprocess text
def preprocess_text(text):
    if isinstance(text, (list, tuple)):
        text = ','.join(text)
    if isinstance(text, str):
        return text.lower().replace(",", " ").replace("&", "and").replace("-", " ")
    return str(text)

# Preprocess the text fields in all datasets
candidates_df['Skills'] = candidates_df['Skills'].apply(preprocess_text)
candidates_df['Domains'] = candidates_df['Domains'].apply(preprocess_text)
experts_df['Domain skills'] = experts_df['Domain skills'].apply(preprocess_text)
experts_df['Domain'] = experts_df['Domain'].apply(preprocess_text)
jobs_df['Skills Required'] = jobs_df['Skills Required'].apply(preprocess_text)
jobs_df['Domain'] = jobs_df['Domain'].apply(preprocess_text)

# Combine all skills and domain data for fitting the vectorizers
all_skills = pd.concat([candidates_df['Skills'], experts_df['Domain skills'], jobs_df['Skills Required']])
all_domains = pd.concat([candidates_df['Domains'], experts_df['Domain'], jobs_df['Domain']])

# Ensure all entries in all_domains are strings
all_domains = all_domains.astype(str)

# Fit vectorizers on the combined data
vectorizer = TfidfVectorizer().fit(all_skills)
domain_vectorizer = TfidfVectorizer().fit(all_domains)

# Transform the relevant fields using the fitted vectorizers
candidate_skills_tfidf = vectorizer.transform(candidates_df['Skills'])
expert_skills_tfidf = vectorizer.transform(experts_df['Domain skills'])
job_skills_tfidf = vectorizer.transform(jobs_df['Skills Required'])

candidate_domains_tfidf = domain_vectorizer.transform(candidates_df['Domains'])
expert_domains_tfidf = domain_vectorizer.transform(experts_df['Domain'])
job_domains_tfidf = domain_vectorizer.transform(jobs_df['Domain'])

# Normalize experience and education features to a [0, 1] range
scaler = MinMaxScaler()

# Combine the experience columns for fitting the scaler
combined_experience = pd.concat([candidates_df['Experience'], experts_df['Years of experience']])
scaler.fit(combined_experience.values.reshape(-1, 1))

# Transform the experience data for both candidates and experts
candidates_df['Experience'] = scaler.transform(candidates_df[['Experience']])
experts_df['Years of experience'] = scaler.transform(experts_df[['Years of experience']])

# Mapping qualifications to numerical values
def map_qualifications(qual):
    if qual == 'PhD':
        return 1
    elif qual == 'Masters':
        return 0.8
    else:
        return 0.5  # Assign a default lower value for any other qualifications if they exist

# Apply the mapping to the Qualifications column
experts_df['Qualifications'] = experts_df['Qualifications'].apply(map_qualifications)

# Function to calculate Jaccard Similarity
def jaccard_similarity(vector1, vector2):
    return jaccard_score(vector1.toarray().flatten() > 0, vector2.toarray().flatten() > 0)

# Function to calculate Pearson Correlation Similarity
def pearson_correlation(vector1, vector2):
    return pearsonr(vector1.toarray().flatten(), vector2.toarray().flatten())[0]

# Custom similarity function
def custom_similarity(vector1, vector2, job_vector):
    cosine_sim = cosine_similarity(vector1, vector2).flatten()[0]
    jaccard_sim = jaccard_similarity(vector1, vector2)
    skill_overlap = np.dot(vector1.toarray(), job_vector.toarray().T)[0][0]
    custom_sim = 0.2 * skill_overlap + 0.6 * cosine_sim + 0.2 * jaccard_sim
    return custom_sim

# Function to calculate ensemble similarity
def ensemble_similarity(candidate_vector, expert_vector, job_vector):
    cosine_sim = cosine_similarity(candidate_vector, expert_vector).flatten()[0]
    jaccard_sim = jaccard_similarity(candidate_vector, expert_vector)
    pearson_sim = pearson_correlation(candidate_vector, expert_vector)
    custom_sim = custom_similarity(candidate_vector, expert_vector, job_vector)

    ensemble_sim = (0.4 * cosine_sim + 0.2 * jaccard_sim + 0.2 * pearson_sim + 0.2 * custom_sim)
    return ensemble_sim

# Function to calculate relevancy score with proper normalization
def calculate_relevancy(expert_row):
    composite_similarity_weight = 0.4
    previous_interviews_weight = 0.2
    qualifications_weight = 0.1
    publications_weight = 0.1
    communication_weight = 0.1
    professionality_weight = 0.1
    
    relevancy_score = (
        composite_similarity_weight * expert_row['Composite Similarity'] +
        previous_interviews_weight * (expert_row['Previous interviews'] / 5) +  # Normalized 
        qualifications_weight * expert_row['Qualifications'] +  # Already 1 for PhD
        publications_weight * (expert_row['Publications'] / 25) +  # Normalized by max value 25
        communication_weight * (expert_row['Communication'] / 10) +  # Normalized by max value 10
        professionality_weight * (expert_row['Professionality'] / 10)  # Normalized by max value 10
    )
    
    return relevancy_score * 10  # Scaling to bring the score to a more prominent range

# Adjust display settings to avoid truncation
pd.set_option('display.max_colwidth', None)

# Match candidates with a board of experts based on a selected job
job_id = 1
job = jobs_df.iloc[job_id]
job_domain = job['Domain']
job_domains_set = set(job_domain.split())

best_boards = []
for index, candidate in candidates_df.iterrows():
    candidate_domains_set = set(candidate['Domains'].split())
    candidate_skills_set = set(candidate['Skills'].split())
    
    if job_domains_set.issubset(candidate_domains_set):
        candidate_skills = vectorizer.transform([candidate['Skills']])
        candidate_domains = domain_vectorizer.transform([candidate['Domains']])
        job_skills = vectorizer.transform([job['Skills Required']])
        job_domains = domain_vectorizer.transform([job['Domain']])

        candidate_expert_similarities = []
        for i in range(experts_df.shape[0]):
            # Ensemble similarity for both job and candidate similarities
            skill_sim_candidate = ensemble_similarity(candidate_skills, expert_skills_tfidf[i], job_skills)
            domain_sim_candidate = ensemble_similarity(candidate_domains, expert_domains_tfidf[i], job_domains)
            skill_sim_job = ensemble_similarity(job_skills, expert_skills_tfidf[i], job_skills)
            domain_sim_job = ensemble_similarity(job_domains, expert_domains_tfidf[i], job_domains)
            
            similarity_dict = {
                'Expert Name': experts_df.iloc[i]['Expert name'],
                'Expert-Job Domain Similarity': domain_sim_job,
                'Expert-Job Skill Similarity': skill_sim_job,
                'Candidate-Expert Domain Similarity': domain_sim_candidate,
                'Candidate-Expert Skill Similarity': skill_sim_candidate,
                'Composite Similarity': 0.25 * skill_sim_job + 0.25 * domain_sim_job + 0.25 * skill_sim_candidate + 0.25 * domain_sim_candidate,
                'Previous interviews': experts_df.iloc[i]['Previous interviews'],
                'Qualifications': experts_df.iloc[i]['Qualifications'],
                'Publications': experts_df.iloc[i]['Publications'],
                'Communication': experts_df.iloc[i]['Communication'],
                'Professionality': experts_df.iloc[i]['Professionality'],
            }
            similarity_dict['Relevancy Score'] = calculate_relevancy(similarity_dict)
            candidate_expert_similarities.append(similarity_dict)

        expert_df = pd.DataFrame(candidate_expert_similarities)
        expert_df = expert_df.sort_values(by='Composite Similarity', ascending=False)

        for top_n in range(20, 41, 10):  # Start with top 20 and go up to 40, incrementing by 10
            top_experts_indices = expert_df.index[:top_n].tolist()

            print(f"\nTop {top_n} Experts based on similarities:")
            print(expert_df.head(top_n)[['Expert Name', 'Composite Similarity', 'Relevancy Score']])

            for expert_combination in combinations(top_experts_indices, 3):
                combined_skills = " ".join([experts_df.iloc[expert_idx]['Domain skills'] for expert_idx in expert_combination])
                combined_domains = " ".join([experts_df.iloc[expert_idx]['Domain'] for expert_idx in expert_combination])

                if candidate_skills_set.issubset(set(combined_skills.split())) and candidate_domains_set.issubset(set(combined_domains.split())):
                    skill_sim = ensemble_similarity(candidate_skills, vectorizer.transform([combined_skills]), job_skills)
                    domain_sim = ensemble_similarity(candidate_domains, domain_vectorizer.transform([combined_domains]), job_domains)

                    if skill_sim > 0.9 and domain_sim > 0.9:
                        board_relevancy_score = expert_df.loc[list(expert_combination), 'Relevancy Score'].mean()
                        best_boards.append((expert_combination, skill_sim, domain_sim, combined_skills, combined_domains, board_relevancy_score))
            
            if best_boards:
                break  # Break out of loop if a valid group is found

        if not best_boards:
            print(f"No valid expert group could be formed for job ID {job_id} with the current top expert limits.")

        best_boards = sorted(best_boards, key=lambda x: (x[1] + x[2]) / 2, reverse=True)[:3]

        result_df = pd.DataFrame([{
            'Expert Names': ", ".join(experts_df.iloc[list(board)]['Expert name'].values),
            'Candidate Name': candidate['Candidate Name'],
            'Job Type': job['Job Type'],
            'Skill Similarity': skill_sim,
            'Domain Similarity': domain_sim,
            'Composite Similarity': (skill_sim + domain_sim) / 2,
            'Relevance Score': board_relevancy_score,  # Relevance score is directly included
            'Group Skills': combined_skills,
            'Group Domains': combined_domains,
            'Candidate Skills': candidate['Skills'],
            'Candidate Domains': candidate['Domains']
        } for board, skill_sim, domain_sim, combined_skills, combined_domains, board_relevancy_score in best_boards])

        break
    else:
        print(f"Candidate '{candidate['Candidate Name']}' does not match the job domain requirements.")
        missing_domains = job_domains_set - candidate_domains_set
        print(f"Missing domains: {', '.join(missing_domains)}\n")

if best_boards:
    print("\nTop 3 Boards of Experts:")
    print(result_df)
else:
    print("No candidates matched the job domain requirements.")




Candidate 'Ananta Prasad Sethi' does not match the job domain requirements.
Missing domains: missile, systems, strategic


Top 20 Experts based on similarities:
               Expert Name  Composite Similarity  Relevancy Score
19    Siddharth Srivastava              0.817147         8.268589
270         Dr. Bina Mehta              0.815034         9.060135
111     Dr. Sakshi Agarwal              0.815034         8.060135
120       Dr. Himanshu Roy              0.814064         9.256254
10          Dr. Kavya Iyer              0.788190         8.752758
98             Yash Chopra              0.742433         6.529733
90      Dr. Rohan Aggarwal              0.742433        10.969733
161          Srishti Tomar              0.738727         7.594907
268          Pallavi Gupta              0.680703         7.602813
54          Ananya Dwivedi              0.653383         7.433531
255       Chaitanya Mittal              0.645262         6.541048
44     Dr. Nikhil Aggarwal              0.64191