### Candidate Details DataFrame

In [43]:
from sre_parse import CATEGORY_NOT_DIGIT
import pandas as pd
import json
import os
def create_candidate_df(json_data,filename,candidate_id):
    row = {
        "candidate_id" : candidate_id,
        "filename": filename,            
        'full_name': json_data['basic_info']['full_name'],              
        'current_title': json_data['basic_info']['current_title'],          
        'email': json_data['contact_info']['email'],                  
        'phone': json_data['contact_info']['phone'],                  
        'city': json_data['contact_info']['address']['city'] if json_data['contact_info']['address'] else None,
        'country': json_data['contact_info']['address']['country'] if json_data['contact_info']['address'] else None,
        'linkedin': json_data['contact_info']['linkedin'],
        'github': json_data['contact_info']['github'],
        'summary': json_data['summary'],
        'total_skills': len(json_data['skills']) if json_data['skills'] is not None else 0,
        'total_experience_years': 0, 
        'education_level': 'Unknown',  
        'created_date': pd.Timestamp.now()
    }
    
    df = pd.DataFrame([row])
    return df

  from sre_parse import CATEGORY_NOT_DIGIT


### Skills DF

In [44]:
def create_skills(json_data,candidate_id):
    skills_data = []
    
    for skill in json_data['skills']:
        skill_row = {
            "candidate_id": candidate_id,
            "skill_name": skill["skill_name"],
            "proficiency": skill["proficiency"],
            "category": skill["category"],
            "years_of_experience": skill["years_of_experience"]
        }
        skills_data.append(skill_row)
    return pd.DataFrame(skills_data)

### Experience DataFrame

In [45]:
def create_experience(json_data,candidate_id):
    experience_data = []
    for experience in json_data['experience']:
        experience_row = {
            "candidate_id": candidate_id,
            "job_title": experience["job_title"],
            "company": experience["company"],
            "start_date": experience["start_date"],
            "end_date": experience["end_date"],
            "employment_type": experience["employment_type"],
            "location": experience["location"],
            "is_current_job" : experience["end_date"] is None,
            "responsibilities_count": len(experience["responsibilities"]) if experience["responsibilities"] is not None else 0,
            "skills_used_count": len(experience["skills_used"]) if experience["skills_used"] is not None else 0,
        }
        experience_data.append(experience_row)
    return pd.DataFrame(experience_data)


### Education DF

In [46]:
def create_education_df(json_data, candidate_id):
    education_data = []
    
    for edu in json_data['education']:
        edu_row = {
            'candidate_id': candidate_id,
            'degree': edu['degree'],
            'field': edu['field'],
            'institution': edu['institution'],
            'start_date': edu['start_date'],
            'end_date': edu['end_date'],
            'grade': edu['grade'],
            'location': edu['location'],
            'courses_count': len(edu['courses']) if edu['courses'] is not None else 0
        }
        education_data.append(edu_row)
    
    return pd.DataFrame(education_data)

In [47]:
def creating_table(file_path):
    files = os.listdir(file_path)
    index = 1
    
    candidate_df = pd.DataFrame()
    skills_df = pd.DataFrame()
    experience_df = pd.DataFrame()
    education_df = pd.DataFrame()
    
    for file in files:
        if file.endswith(".json"):
            with open(os.path.join(file_path,file),'r') as f:
                data = json.load(f)
            candidate_id = index
            filename = file
            current_candidate_df = create_candidate_df(data,filename,candidate_id)
            current_skills_df = create_skills(data,candidate_id)
            current_experience_df = create_experience(data,candidate_id)
            current_education_df = create_education_df(data,candidate_id)
            
            if index == 1:
                candidate_df = current_candidate_df
                skills_df = current_skills_df
                experience_df = current_experience_df
                education_df = current_education_df
            else:
                candidate_df = pd.concat([candidate_df, current_candidate_df], ignore_index=True)
                skills_df = pd.concat([skills_df, current_skills_df], ignore_index=True)
                experience_df = pd.concat([experience_df, current_experience_df], ignore_index=True)
                education_df = pd.concat([education_df, current_education_df], ignore_index=True)
            print(f"Processed resume {index}: {filename}")
            index += 1
    return candidate_df,skills_df,experience_df,education_df

In [51]:
basic,skills,experience,education = creating_table("../data/processed")

Processed resume 1: ADNAN AHMED - CV for Research Assistants - Lab Instructors.json
Processed resume 2: Awais Anwar - CV for Lab Instructor.json
Processed resume 3: Ebad Ali - CV for Research Assistant.json
Processed resume 4: Faisal Nisar - CV for for Research Assistant.json
Processed resume 5: Faisal Shahzad - CV for Lab Instructor.json
Processed resume 6: Faizan Asghar - CV for Research Assistant - Lab Instructor.json
Processed resume 7: Ghulam Jaffar - CV for Research Assistant.json
Processed resume 8: Habib Ur Rahman - CV for Lab Instructor - Research Assistant.json
Processed resume 9: Hafiz Ali Raja - CV for Lab Instructor.json
Processed resume 10: Haris Ahmed - CV for Research Assistant.json
Processed resume 11: Hitesh Kumar - Resume for Research Assistants - Lab Instructors.json
Processed resume 12: Jawad Ahmed Bhutta - CV for Lab Instructor - Research Assistant.json
Processed resume 13: Muhammad Azmi Umer - CV for Lab Instructor.json
Processed resume 14: Muhammad Omaid Sheikh 

  skills_df = pd.concat([skills_df, current_skills_df], ignore_index=True)
  skills_df = pd.concat([skills_df, current_skills_df], ignore_index=True)
  skills_df = pd.concat([skills_df, current_skills_df], ignore_index=True)


In [56]:
print(basic.info())
print(skills.info())
print(experience.info())
print(education.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   candidate_id            23 non-null     int64         
 1   filename                23 non-null     object        
 2   full_name               23 non-null     object        
 3   current_title           18 non-null     object        
 4   email                   23 non-null     object        
 5   phone                   23 non-null     object        
 6   city                    22 non-null     object        
 7   country                 20 non-null     object        
 8   linkedin                8 non-null      object        
 9   github                  8 non-null      object        
 10  summary                 22 non-null     object        
 11  total_skills            23 non-null     int64         
 12  total_experience_years  23 non-null     int64       

In [67]:
basic

Unnamed: 0,candidate_id,filename,full_name,current_title,email,phone,city,country,linkedin,github,summary,total_skills,total_experience_years,education_level,created_date
0,1,ADNAN AHMED - CV for Research Assistants - Lab...,ADNAN AHMED,,dani_a4a@yahoo.com,+92 (345) 8365922,,,,,To join a dedicated and professional team in t...,7,0,Unknown,2025-07-26 06:15:32.320824
1,2,Awais Anwar - CV for Lab Instructor.json,Awaiz Anwar,,,,,Pakistan,,,"Being a committed team player, want to be a pa...",4,0,Unknown,2025-07-26 06:15:32.323918
2,3,Ebad Ali - CV for Research Assistant.json,Ebad Ali,,ebadalie@gmail.com,+923022689940,Karachi,Pakistan,,,,10,0,Unknown,2025-07-26 06:15:32.327642
3,4,Faisal Nisar - CV for for Research Assistant.json,Hafiz Muhammad Ali Raja,Lab Assistant/ Network Assistant,alisherhaidri1984@gmail.com,+923216839854,Bahawalpur,Pakistan,,,To become a better & better IT resource with a...,6,0,Unknown,2025-07-26 06:15:32.331307
4,5,Faisal Shahzad - CV for Lab Instructor.json,Faisal Shahzad,,faisalkhan00668@yahoo.com,03137144006,Sargodha,,,,I am young mature & enthusiastic individual be...,17,0,Unknown,2025-07-26 06:15:32.336297
5,6,Faizan Asghar - CV for Research Assistant - La...,Faizan Asghar,,Faizanasghar_sm1@hotmail.com,03462662309,Karachi,Pakistan,,,To acquire a career oriented position in a rep...,5,0,Unknown,2025-07-26 06:15:32.339571
6,7,Ghulam Jaffar - CV for Research Assistant.json,,,ghulamjaffar110@gmail.com,+92.344.8534083,Rawalpindi,Pakistan,,,To seek a challenging and motivating career th...,16,0,Unknown,2025-07-26 06:15:32.341589
7,8,Habib Ur Rahman - CV for Lab Instructor - Rese...,Habib Ur Rahman,,Habib_tuf@hotmail.com,+923338967267,Sahiwal,Pakistan,,,To get the challenging position within the org...,6,0,Unknown,2025-07-26 06:15:32.341589
8,9,Hafiz Ali Raja - CV for Lab Instructor.json,Hafiz Muhammad Ali Raja,Lab Assistant/ Network Assistant,alisherhaidri1984@gmail.com,+923216839854,Bahawalpur,Pakistan,,,To become a better & better IT resource with a...,6,0,Unknown,2025-07-26 06:15:32.351920
9,10,Haris Ahmed - CV for Research Assistant.json,Haris Ahmed,,Engrharris.iu@gmail.com,(+92-345) 278 1022,Karachi,Pakistan,,,Seeking a position in a Progressive and Reputa...,4,0,Unknown,2025-07-26 06:15:32.358613


#### Analysis:

Row 6 has no name
15+ Ppl have empty current title
phone format inconsistency
some candidates have missing city fields

will now do extensive cleaning

In [72]:
import numpy as np
from datetime import datetime
import re

#Fixing empty names

def fix_empty_names(filename):
    name = filename.replace(".json","").replace(' - CV for Research Assistant', '').replace('Research Assistant','').replace(' - CV for Lab Instructor', '').replace(' - CV', '')
    return name.strip()


mask = (basic['full_name'].isna()) | (basic['full_name'] == "")
basic.loc[mask,'full_name'] = basic.loc[mask,'filename'].apply(fix_empty_names)
    

In [75]:
#Standardizing phone numbers

def clean_phone(phone):
    if pd.isna(phone) or phone == "":
        return None
    cleaned = re.sub(r'[^\d+]', '', str(phone))
    if cleaned.startswith('92') and not cleaned.startswith('+92'):
        cleaned = '+' + cleaned
    elif cleaned.startswith('0'):
        cleaned = '+92' + cleaned[1:]
    return cleaned

basic['phone'] = basic['phone'].apply(clean_phone)

In [76]:
#Standardizing cities

def clean_city(city):
    if pd.isna(city) or city == "":
        return None
    return city.strip().title()

basic['city'] = basic['city'].apply(clean_city)

### Giving each profile a profile completeness score

In [None]:
important_fields = ['full_name','email','phone','current_title']

basic['completeness_score'] = basic[important_fields].notna().sum(axis=1) * 25

basic

Unnamed: 0,candidate_id,filename,full_name,current_title,email,phone,city,country,linkedin,github,summary,total_skills,total_experience_years,education_level,created_date,completeness_score
0,1,ADNAN AHMED - CV for Research Assistants - Lab...,ADNAN AHMED,,dani_a4a@yahoo.com,+923458365922,,,,,To join a dedicated and professional team in t...,7,0,Unknown,2025-07-26 06:15:32.320824,100
1,2,Awais Anwar - CV for Lab Instructor.json,Awaiz Anwar,,,,,Pakistan,,,"Being a committed team player, want to be a pa...",4,0,Unknown,2025-07-26 06:15:32.323918,75
2,3,Ebad Ali - CV for Research Assistant.json,Ebad Ali,,ebadalie@gmail.com,+923022689940,Karachi,Pakistan,,,,10,0,Unknown,2025-07-26 06:15:32.327642,100
3,4,Faisal Nisar - CV for for Research Assistant.json,Hafiz Muhammad Ali Raja,Lab Assistant/ Network Assistant,alisherhaidri1984@gmail.com,+923216839854,Bahawalpur,Pakistan,,,To become a better & better IT resource with a...,6,0,Unknown,2025-07-26 06:15:32.331307,100
4,5,Faisal Shahzad - CV for Lab Instructor.json,Faisal Shahzad,,faisalkhan00668@yahoo.com,+923137144006,Sargodha,,,,I am young mature & enthusiastic individual be...,17,0,Unknown,2025-07-26 06:15:32.336297,100
5,6,Faizan Asghar - CV for Research Assistant - La...,Faizan Asghar,,Faizanasghar_sm1@hotmail.com,+923462662309,Karachi,Pakistan,,,To acquire a career oriented position in a rep...,5,0,Unknown,2025-07-26 06:15:32.339571,100
6,7,Ghulam Jaffar - CV for Research Assistant.json,Ghulam Jaffar Research Assistant,,ghulamjaffar110@gmail.com,+923448534083,Rawalpindi,Pakistan,,,To seek a challenging and motivating career th...,16,0,Unknown,2025-07-26 06:15:32.341589,100
7,8,Habib Ur Rahman - CV for Lab Instructor - Rese...,Habib Ur Rahman,,Habib_tuf@hotmail.com,+923338967267,Sahiwal,Pakistan,,,To get the challenging position within the org...,6,0,Unknown,2025-07-26 06:15:32.341589,100
8,9,Hafiz Ali Raja - CV for Lab Instructor.json,Hafiz Muhammad Ali Raja,Lab Assistant/ Network Assistant,alisherhaidri1984@gmail.com,+923216839854,Bahawalpur,Pakistan,,,To become a better & better IT resource with a...,6,0,Unknown,2025-07-26 06:15:32.351920,100
9,10,Haris Ahmed - CV for Research Assistant.json,Haris Ahmed,,Engrharris.iu@gmail.com,+923452781022,Karachi,Pakistan,,,Seeking a position in a Progressive and Reputa...,4,0,Unknown,2025-07-26 06:15:32.358613,75


#### Transforming Skills DataFrame

In [78]:
skills

Unnamed: 0,candidate_id,skill_name,proficiency,category,years_of_experience
0,1,Anritsu Site Master,,Technical,
1,1,Huwaie DBS_3900 and BTS-3900,profound knowledge,Technical,
2,1,Commissioning of BTS,,Technical,
3,1,3D Simulation and Design of Antennas,,Technical,
4,1,"Microsoft Word, Excel, PowerPoint, Access, InPage",,Software,
...,...,...,...,...,...
217,23,Programming Languages,,Technical,
218,23,Database Knowledge,,Technical,
219,23,Web technology,,Technical,
220,23,Software,,Technical,


In [81]:
def split_skills(skill_name):
    if ',' in skill_name:
        return [s.strip() for s in skill_name.split(',')]
    return [skill_name]

#now creating new rows for split skills
new_skills = []
for idx, row in skills.iterrows():
    skill_list = split_skills(row['skill_name'])
    for skill in skill_list:
        new_row = row.copy()
        new_row['skill_name'] = skill
        new_skills.append(new_row)
        
        
skills_cleaned = pd.DataFrame(new_skills)
skills_cleaned.tail(20)
    
        

Unnamed: 0,candidate_id,skill_name,proficiency,category,years_of_experience
202,20,Email,,Internet,
203,21,MATLAB,medium,Technical,
204,21,ETAP,basic,Technical,
205,21,Minitab,basic,Technical,
206,21,CALCULUX /DIALUX,medium,Technical,
207,21,MS Office,,Technical,
208,22,Circuit Design,,Electrical Engineering,
209,22,Microcontroller Programming,,Electrical Engineering,
210,22,Software,,Electrical Engineering,
211,22,PLC Programming,,Electrical Engineering,


In [82]:
#skill mappings

skill_mapping = {
        'Microsoft Word': 'MS Word',
        'Microsoft Excel': 'MS Excel', 
        'Microsoft PowerPoint': 'MS PowerPoint',
        'Microsoft Access': 'MS Access',
}

skills_cleaned['skill_name'] = skills_cleaned['skill_name'].map(skill_mapping).fillna(skills_cleaned['skill_name'])

In [83]:
#Categorizing skills

def categorize_skills(skill_name):
    skill_lower = skill_name.lower()
    if any(lang in skill_lower for lang in ['python','java','c++','c#','javascript']):
        return 'Programming Language'
    elif any(db in skill_lower for db in ['sql','mysql','database']):
            return 'Database'
    elif any(ms in skill_lower for ms in ['word','excel','powerpoint','access']):
        return 'Office Tools'
    elif 'unity' in skill_lower:
        return 'Game Development'
    elif any(web in skill_lower for web in ['asp.net','react','node.js']):
        return 'Web Development'
    else:
        return 'Technical'
    
skills_cleaned['category'] = skills_cleaned['skill_name'].apply(categorize_skills)

skills_cleaned

Unnamed: 0,candidate_id,skill_name,proficiency,category,years_of_experience
0,1,Anritsu Site Master,,Technical,
1,1,Huwaie DBS_3900 and BTS-3900,profound knowledge,Technical,
2,1,Commissioning of BTS,,Technical,
3,1,3D Simulation and Design of Antennas,,Technical,
4,1,MS Word,,Office Tools,
...,...,...,...,...,...
217,23,Programming Languages,,Technical,
218,23,Database Knowledge,,Database,
219,23,Web technology,,Technical,
220,23,Software,,Technical,


#### Cleaning experience DF

In [88]:
experience['start_date'] = pd.to_datetime(experience['start_date'],errors='coerce')
experience['end_date'] = pd.to_datetime(experience['end_date'],errors = 'coerce')

In [87]:
def calculate_duration(start,end):
    if pd.isna(start):
        return None
    end_date = end if not pd.isna(end) else pd.Timestamp.now()
    duration = (end_date - start).days/365.25
    return round(duration,2)

experience['duration_yrs'] = experience.apply(lambda x:calculate_duration(x['start_date'],x['end_date']),axis=1)

experience.head(5)


Unnamed: 0,candidate_id,job_title,company,start_date,end_date,employment_type,location,is_current_job,responsibilities_count,skills_used_count,duration_yrs
0,2,Intern,TECHLOGIX,2012-07-23,2012-08-31,Internship,,False,1,0,0.11
1,3,Junior Android Developer,Salsoft Technologies Pvt.Ltd,2013-01-01,2013-12-31,Full-time,Pakistan,False,1,1,1.0
2,3,Internee,PIA-Computer Center,2013-01-01,2013-12-31,Internship,Pakistan,False,2,1,1.0
3,4,Lab Assistant/ Network Assistant,"Air University, Multan",NaT,NaT,Full-time,Multan,True,0,0,
4,4,Lab Supervisor,"COMSATS Institute of Information Technology, S...",NaT,NaT,Full-time,Sahiwal,True,0,0,


In [89]:
employment_mapping = {
        'Full-time': 'Full-time',
        'Part-time': 'Part-time', 
        'Internship': 'Internship',
        'Intern': 'Internship',
        'Current': 'Full-time',
        'Program': 'Internship',
        None: 'Unknown'
    }

experience['employment_type'] = experience['employment_type'].map(employment_mapping).fillna('Unknown')

def clean_company(company):
    if pd.isna(company) or company == "":
        return 'Unknown'
    return company.strip()

experience['company'] = experience['company'].apply(clean_company)
 

#### Cleaning education df

In [90]:
education

Unnamed: 0,candidate_id,degree,field,institution,start_date,end_date,grade,location,courses_count
0,1,B.E,Telecommunication Engineering,"Hamdard University, Karachi",2010-01-01,2014-12-31,3.1,Karachi,0
1,1,F.Sc.,Pre Engineering,"Tameer-e-Nau Public College, Quetta",2008-01-01,2009-12-31,74%,Quetta,0
2,1,Matriculation,,"Government high School Killi Sheikhan, Quetta",2006-01-01,2007-12-31,75%,Quetta,0
3,2,B. Sc.,Computer System Engineering,"The Islamia University, Bahawalpur, Pakistan",2010-01-01,2014-12-31,CGPA 3.50/4.0,"Bahawalpur, Pakistan",0
4,2,F. Sc.,Pre-Engineering,"Superior College, Muzaffargarh, Pakistan",2006-08-01,2009-10-31,%AGE 76.00,"Muzaffargarh, Pakistan",0
...,...,...,...,...,...,...,...,...,...
68,22,D.A.E (Electrical),Electrical,Punjab Board of Technical Education,2007-01-01,2009-12-31,,,0
69,22,Matriculation (Science),Science,Rawalpindi Board,2003-01-01,2006-12-31,,Rawalpindi,0
70,23,Bachelor,Computer Science,"Federal Urdu University of Arts, Science and T...",2012-01-01,2016-12-31,3.79/4.0,"Karachi, Pakistan",10
71,23,Intermediate,Pre-Engineering,F.G Boys Inter College Karachi Cantt,2009-01-01,2011-12-31,,"Karachi, Pakistan",0


In [92]:
degree_mapping = {
        'B.E': 'Bachelor',
        'B. Sc.': 'Bachelor', 
        'Bachelor': 'Bachelor',
        'MS': 'Master',
        'Master': 'Master',
        'Masters': 'Master',
        'PhD': 'PhD',
        'Ph.D': 'PhD',
        'F.Sc.': 'Intermediate',
        'F. Sc.': 'Intermediate',
        'Intermediate': 'Intermediate',
        'Matriculation': 'Matric',
        'Matric': 'Matric',
        'D.A.E': 'Diploma'
    }

education['degree'] = education['degree'].map(degree_mapping).fillna(education['degree'])

education

Unnamed: 0,candidate_id,degree,field,institution,start_date,end_date,grade,location,courses_count
0,1,Bachelor,Telecommunication Engineering,"Hamdard University, Karachi",2010-01-01,2014-12-31,3.1,Karachi,0
1,1,Intermediate,Pre Engineering,"Tameer-e-Nau Public College, Quetta",2008-01-01,2009-12-31,74%,Quetta,0
2,1,Matric,,"Government high School Killi Sheikhan, Quetta",2006-01-01,2007-12-31,75%,Quetta,0
3,2,Bachelor,Computer System Engineering,"The Islamia University, Bahawalpur, Pakistan",2010-01-01,2014-12-31,CGPA 3.50/4.0,"Bahawalpur, Pakistan",0
4,2,Intermediate,Pre-Engineering,"Superior College, Muzaffargarh, Pakistan",2006-08-01,2009-10-31,%AGE 76.00,"Muzaffargarh, Pakistan",0
...,...,...,...,...,...,...,...,...,...
68,22,D.A.E (Electrical),Electrical,Punjab Board of Technical Education,2007-01-01,2009-12-31,,,0
69,22,Matriculation (Science),Science,Rawalpindi Board,2003-01-01,2006-12-31,,Rawalpindi,0
70,23,Bachelor,Computer Science,"Federal Urdu University of Arts, Science and T...",2012-01-01,2016-12-31,3.79/4.0,"Karachi, Pakistan",10
71,23,Intermediate,Pre-Engineering,F.G Boys Inter College Karachi Cantt,2009-01-01,2011-12-31,,"Karachi, Pakistan",0


In [93]:
def clean_grade(grade):
    if pd.isna(grade) or grade == "":
        return None
    grade_str = str(grade).lower()
    if 'cgpa' in grade_str:
        cgpa_match = re.search(r'(\d+\.?\d*)', grade_str)
        if cgpa_match:
            return float(cgpa_match.group(1))
    elif "%" in grade_str:
        percent_match = re.search(r'(\d+)', grade_str)
        if percent_match:
            return float(percent_match.group(1))
    return grade

education['grade'] = education['grade'].apply(clean_grade)
education

Unnamed: 0,candidate_id,degree,field,institution,start_date,end_date,grade,location,courses_count
0,1,Bachelor,Telecommunication Engineering,"Hamdard University, Karachi",2010-01-01,2014-12-31,3.1,Karachi,0
1,1,Intermediate,Pre Engineering,"Tameer-e-Nau Public College, Quetta",2008-01-01,2009-12-31,74.0,Quetta,0
2,1,Matric,,"Government high School Killi Sheikhan, Quetta",2006-01-01,2007-12-31,75.0,Quetta,0
3,2,Bachelor,Computer System Engineering,"The Islamia University, Bahawalpur, Pakistan",2010-01-01,2014-12-31,3.5,"Bahawalpur, Pakistan",0
4,2,Intermediate,Pre-Engineering,"Superior College, Muzaffargarh, Pakistan",2006-08-01,2009-10-31,76.0,"Muzaffargarh, Pakistan",0
...,...,...,...,...,...,...,...,...,...
68,22,D.A.E (Electrical),Electrical,Punjab Board of Technical Education,2007-01-01,2009-12-31,,,0
69,22,Matriculation (Science),Science,Rawalpindi Board,2003-01-01,2006-12-31,,Rawalpindi,0
70,23,Bachelor,Computer Science,"Federal Urdu University of Arts, Science and T...",2012-01-01,2016-12-31,3.79/4.0,"Karachi, Pakistan",10
71,23,Intermediate,Pre-Engineering,F.G Boys Inter College Karachi Cantt,2009-01-01,2011-12-31,,"Karachi, Pakistan",0


In [94]:
#level_mappping

level_mapping = {
    'Phd' : 5,
    'Master': 4,
    'Bachelor': 3, 
    'Diploma': 2,
    'Intermediate': 1,
    'Matric': 0
}

education['level'] = education['degree'].map(level_mapping).fillna(0)


#### Data Cleaning done, Now finding derived calculations

In [96]:
def derived_fields(basic,skills,experience,education):
    total_exp = experience.groupby('candidate_id')['duration_yrs'].sum().reset_index()
    total_exp.columns = ['candidate_id','total_exp_yrs']
    
    # getting highest education level
    highest_edu = education.loc[education.groupby('candidate_id')['level'].idxmax()]
    highest_edu = highest_edu[['candidate_id','degree','level']]
    highest_edu.columns = ['candidate_id','highest_degree','level']
    
    #finding skill diversity per candidate
    skill_diversity = skills.groupby('candidate_id')['category'].nunique().reset_index()
    skill_diversity.columns = ['candidate_id','skill_diversity_score']
    
    #Merging all derived fields back to basic info df
    
    basic_enhanced = basic.merge(total_exp,on='candidate_id',how='left')
    basic_enhanced = basic_enhanced.merge(highest_edu,on='candidate_id',how='left')
    basic_enhanced = basic_enhanced.merge(skill_diversity,on='candidate_id',how='left')
    
    basic_enhanced['total_exp_yrs'] = basic_enhanced['total_exp_yrs'].fillna(0)
    basic_enhanced['skill_diversity_score'] = basic_enhanced['skill_diversity_score'].fillna(0)
    
    return basic_enhanced

basic_final = derived_fields(basic,skills_cleaned,experience,education)
basic_final


    

Unnamed: 0,candidate_id,filename,full_name,current_title,email,phone,city,country,linkedin,github,summary,total_skills,total_experience_years,education_level,created_date,completeness_score,total_exp_yrs,highest_degree,level,skill_diversity_score
0,1,ADNAN AHMED - CV for Research Assistants - Lab...,ADNAN AHMED,,dani_a4a@yahoo.com,+923458365922,,,,,To join a dedicated and professional team in t...,7,0,Unknown,2025-07-26 06:15:32.320824,100,0.0,Bachelor,3.0,2
1,2,Awais Anwar - CV for Lab Instructor.json,Awaiz Anwar,,,,,Pakistan,,,"Being a committed team player, want to be a pa...",4,0,Unknown,2025-07-26 06:15:32.323918,75,0.11,Bachelor,3.0,3
2,3,Ebad Ali - CV for Research Assistant.json,Ebad Ali,,ebadalie@gmail.com,+923022689940,Karachi,Pakistan,,,,10,0,Unknown,2025-07-26 06:15:32.327642,100,2.0,,,3
3,4,Faisal Nisar - CV for for Research Assistant.json,Hafiz Muhammad Ali Raja,Lab Assistant/ Network Assistant,alisherhaidri1984@gmail.com,+923216839854,Bahawalpur,Pakistan,,,To become a better & better IT resource with a...,6,0,Unknown,2025-07-26 06:15:32.331307,100,0.0,BS (Computer Science),0.0,4
4,5,Faisal Shahzad - CV for Lab Instructor.json,Faisal Shahzad,,faisalkhan00668@yahoo.com,+923137144006,Sargodha,,,,I am young mature & enthusiastic individual be...,17,0,Unknown,2025-07-26 06:15:32.336297,100,0.0,MATRIC,0.0,3
5,6,Faizan Asghar - CV for Research Assistant - La...,Faizan Asghar,,Faizanasghar_sm1@hotmail.com,+923462662309,Karachi,Pakistan,,,To acquire a career oriented position in a rep...,5,0,Unknown,2025-07-26 06:15:32.339571,100,1.0,Intermediate,1.0,1
6,7,Ghulam Jaffar - CV for Research Assistant.json,Ghulam Jaffar Research Assistant,,ghulamjaffar110@gmail.com,+923448534083,Rawalpindi,Pakistan,,,To seek a challenging and motivating career th...,16,0,Unknown,2025-07-26 06:15:32.341589,100,15.31,Master,4.0,6
7,8,Habib Ur Rahman - CV for Lab Instructor - Rese...,Habib Ur Rahman,,Habib_tuf@hotmail.com,+923338967267,Sahiwal,Pakistan,,,To get the challenging position within the org...,6,0,Unknown,2025-07-26 06:15:32.341589,100,14.48,MS(CS),0.0,1
8,9,Hafiz Ali Raja - CV for Lab Instructor.json,Hafiz Muhammad Ali Raja,Lab Assistant/ Network Assistant,alisherhaidri1984@gmail.com,+923216839854,Bahawalpur,Pakistan,,,To become a better & better IT resource with a...,6,0,Unknown,2025-07-26 06:15:32.351920,100,0.0,BS (Computer Science),0.0,4
9,10,Haris Ahmed - CV for Research Assistant.json,Haris Ahmed,,Engrharris.iu@gmail.com,+923452781022,Karachi,Pakistan,,,Seeking a position in a Progressive and Reputa...,4,0,Unknown,2025-07-26 06:15:32.358613,75,11.64,BE-EE (Electronics Engineering),0.0,1


### Data Cleaning has been done


In [99]:
basic_final.to_csv("../data/cleaned/basic_info_cleaned.csv",index=False)
skills_cleaned.to_csv("../data/cleaned/skills_cleaned.csv",index=False)
experience.to_csv("../data/cleaned/experience_cleaned.csv",index=False)
education.to_csv("../data/cleaned/education_cleaned.csv",index=False)