In [1]:
import pandas as pd
import json
import numpy as np
import spacy
import nltk
import tqdm  #Shows Progress
import os
import srsly
from spacy import displacy
import gensim
from sklearn.metrics.pairwise import cosine_similarity
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Loading the mapping dictionary file
#Key = ID and Value = resume file name
with open("E:\\Arpitha Swamydas\\Ev Project\\fileId.json") as df:   #fileId file as mapping of id and resume file name
    resumeMap = json.load(df)

In [4]:
path = "E:/Arpitha Swamydas/Ev Project/all_JSON/"
path

'E:/Arpitha Swamydas/Ev Project/all_JSON/'

In [106]:
# Loading resume files in loop
data = dict()   # data is a dict that as the mapping of fileID and their parsed resume
for id_ in tqdm.tqdm(resumeMap):
    try:  
        with open(os.path.join(path, resumeMap[id_]), 'r', encoding = "utf- 8") as f:
            data[id_] = json.load(f)
        id_maps[int(id_)] = int(id_)
    except json.JSONDecodeError:
        print(id_)

100%|█████████████████████████████████████████████████████████████████████████████████| 68/68 [00:00<00:00, 152.27it/s]

4
33
51
54





In [6]:
skillset = []   # skill set list 
dataSkills = dict()   #dataSkill dict

# For every id in data, append the skillset list with the Formatted skillname
# and map dataSkills dict i.e. alias = Formatted name

for id_ in tqdm.tqdm(data):  
    for skill in data[id_]['ResumeParserData']["SegregatedSkill"]:
        skillset.append(skill["FormattedName"].strip().lower())
        for alias in skill["Alias"].split(","):
            dataSkills[alias.strip().lower()] = skill["FormattedName"].strip().lower()
            skillset.append(alias.strip().lower())
skillset = set(skillset)   #List to Set

100%|████████████████████████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 1090.12it/s]


# SPAN RULER for Listing the Skills

In [7]:
#Writing Pattern File

patterns = []
for alias,skill in dataSkills.items():   #returns key:value as tuple
        patterns.append({"label" : skill , "pattern" : [{"LOWER" :i} for i in alias.split(' ')]})
srsly.write_jsonl('E:\\Arpitha Swamydas\\Ev Project\\patterns.jsonl' , patterns)

In [8]:
patterns = list(srsly.read_jsonl("patterns.jsonl"))    

In [9]:
nlp_skills = spacy.blank("en")
ruler = nlp_skills.add_pipe("span_ruler")
ruler.add_patterns(patterns)

In [10]:
def skills(file, nlp=nlp_skills):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    doc = nlp(data["ResumeParserData"]["DetailResume"])
    return{"skills":[span.text for span in doc.spans["ruler"]]}   #Returns a dict where key = skills and values=span.text(Formatted Names)

In [11]:
for id_ in data:
    print("\n\n", data[id_]["ResumeParserData"]["Name"]["FullName"], "\n")
    result = skills(os.path.join(path,resumeMap[id_]))
    result = list(sorted({ele for val in result.values() for ele in val}))  #Listing unique values from dictionary
    print(result) #FINAL OUTPUT



 Prasanth Kumar Katta 

['API', 'API Integration', 'API integration', 'Android Architecture Components', 'Core Java', 'GUI', 'Html', 'Kotlin', 'Mobile Programming', 'Model', 'Programming Skills', 'Quick learner', 'Real Time Database', 'Technical Skills', 'UI', 'View', 'Visual Studio', 'Web Services', 'adaptable', 'responsive design', 'version control tools']


 Imran Iqbal Shaikh 

['Agile', 'Agile Scrum', 'Agile Scrum framework', 'Angular', 'B2B', 'C', 'CSS', 'CXM', 'Case Management', 'Coaching', 'Code reviews', 'Confluence', 'DevOps', 'Dynamics 365', 'HTML', 'Image editing software', 'JSON', 'Meeting Scheduling', 'Performance evaluation', 'Planning', 'Programming Languages', 'Proof of Concepts', 'QA', 'Scrum', 'Scrum framework', 'View', 'VoIP', 'Web Services', 'app development', 'code base', 'code reviews', 'control tests', 'iOS app development', 'macOS', 'management application', 'project objectives', 'requirement gathering', 'system development']


 Muralidhara V 

['2003 Server'

# Luhn Summarization

In [12]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer

In [13]:
def clean_(summary):
    stop_words = nltk.corpus.stopwords.words('english')
    return [w for w in gensim.utils.simple_preprocess(str(summary)) if w not in stop_words]

In [14]:
sum_corpus = []
summary_dict = []  
for id_ in data:
    text = data[id_]['ResumeParserData']["DetailResume"]
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    
    summarizer_luhn = LuhnSummarizer()
    summary =summarizer_luhn(parser.document,5)  #Summarizes to 5 Sentences

    summary_dict.append({"fileID": id_, "summary": clean_(summary)})

srsly.write_jsonl("E:\\Arpitha Swamydas\\Ev Project\\summary file.jsonl", summary_dict)

In [15]:
sumFile = list(srsly.read_jsonl("E:\\Arpitha Swamydas\\Ev Project\\summary file.jsonl"))    

# Word2Vec

In [16]:
sentences = [i['summary'] for i in sumFile]  #reading all the sentences from the file

In [17]:
#Vocab Creation on corpus
model = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=1, workers=4)
#corpus = model.wv.vectors #vectors of all summary

8250

In [118]:
model.wv.vectors.shape, all_vec.shape

((2273, 300), (64, 300))

In [18]:
for i in sumFile:
    i["docVec"] = model.wv[i['summary']].mean(axis=0)

In [19]:
all_vec = np.array([i['docVec'] for i in sumFile])
def most_sim(vec):
    i = (-model.wv.cosine_similarities(vec, all_vec)).argsort()[1:6]
    return [sumFile[j]['fileID'] for j in i]

In [129]:
print(data['0']["ResumeParserData"]["DetailResume"])

Prasanth Kumar Katta
Phone No : 8790296753
Email Id : kpcode4u@gmail.com

Profile Summary

	  Udacity Certified Co-Created With Google - Android Developer Nanodegree Graduate.
	  Having 3.7+ year of Experience in developing Client/Server Applications using Java in developing Mobile
	  Applications using Android Technologies.
	  Good Experience on Jet Pack Libraries (Architecture components like Room, LiveData and View Model,
	  Navigation Components and Binding Views .etc) 
	  Worked as one of Coordinator cum Trainer in Google Code Lab.
	  Strong Programming Skills in Java, Android and Firebase.
	  Worked as one of the Self Learning Coordinator Lead in APSSDC.
	  Good knowledge on developing Android applications by using Eclipse and Android Studio.
	  Good experience in working with XML and UI.
	  Good experience Google API Console, Android Architecture Components, Firebase and Sockets.
	  Strong hands on experience in Google maps (GPS) , version control tools like git and bitbucket.
	

In [137]:
print(data['46']["ResumeParserData"]["DetailResume"])

JITENDRA KUMAR
#S02 Bellathur Kadugodi, Near- SBI Kadugodi Branch Whitefield Bangalore-560067	9980880469
Ji7endra.kumar@gmail.com

. Seeking a responsible job mostly in a StartUp Organization with an opportunity for professional challenges, career growth and which allows me to contribute towards achieving the organizational goals.
. Full Stack/Web Developer with over 1 year of professional experience in Web Designing and User Interface
Development using recent Web Technologies and frameworks.
. Experienced in MERN Stack Developer.
. Experienced in JavaScript frameworks and libraries like ReactJS, NodeJS.
. Development of Applications based on approved designs.
. Building user-friendly interfaces.
. Knowledge of how to interact with RESTful APIs and formats (basic JSON, XML) .
. Database Design.
. Good understanding of Frontend Development using React.js and Backend using Node.js and MySQL/MongoDB.

EXPERIENCE

JULY 2019 - PRESENT
FULLSTACK DEVELOPER, TATA CONSULTANCY SERVICES, BANGALOR

In [20]:
most_sim(sumFile[0]['docVec'])

['14', '9', '45', '20', '47']

In [21]:
for i in sumFile:
    i['most_similar'] = most_sim(i['docVec'])

In [22]:
sumFile[0]['most_similar']

['14', '9', '45', '20', '47']

In [23]:
sumFile[0]

{'fileID': '0',
 'summary': ['sentence',
  'project',
  'brentwood',
  'carpool',
  'team',
  'size',
  'environment',
  'java',
  'kotlin',
  'android',
  'firebase',
  'role',
  'team',
  'member',
  'worked',
  'chat',
  'push',
  'notifications',
  'using',
  'firebase',
  'sentence',
  'project',
  'view',
  'box',
  'able',
  'play',
  'videos',
  'like',
  'youtube',
  'displaying',
  'documents',
  'pdfs',
  'audios',
  'team',
  'size',
  'environment',
  'core',
  'java',
  'android',
  'dashboard',
  'add',
  'videos',
  'audios',
  'documents',
  'role',
  'team',
  'member',
  'description',
  'available',
  'add',
  'videos',
  'documents',
  'audios',
  'etc',
  'sentence',
  'project',
  'agri',
  'aiuto',
  'multiple',
  'language',
  'database',
  'support',
  'crop',
  'merchant',
  'pesticides',
  'weather',
  'report',
  'expert',
  'chat',
  'box',
  'functionality',
  'team',
  'size',
  'environment',
  'core',
  'java',
  'android',
  'third',
  'party',
  'lib

# KNN

In [134]:
#Model building for all_vec array
nbrs = NearestNeighbors(n_neighbors=6).fit(all_vec)

In [135]:
#nbrs.kneighbors_graph(all_vec).toarray()

In [136]:
nbrs.kneighbors(X = sumFile[0]['docVec'].reshape(1, -1))

(array([[4.65661287e-10, 5.81214322e-03, 5.98599543e-03, 6.06947736e-03,
         6.08471658e-03, 6.14214179e-03]]),
 array([[ 0, 27, 17, 30, 31, 14]], dtype=int64))

In [114]:
map_table = {p:q for p, q in enumerate([i['fileID'] for i in sumFile])}
def knn_similar(vec):
    distances, indices = nbrs.kneighbors(X = vec)
    #print(list(sumFile[j]['fileID'] for j in indices[:, 1:].flatten()))
    return [map_table[j] for j in indices[:, 1:].flatten()]

In [115]:
knn_similar(sumFile[-1]['docVec'].reshape(1, -1))

['42', '35', '24', '12', '7']

In [57]:
#for i in sumFile:
    #i['knn_similar'] = knn_similar(i['docVec'])

In [109]:
for i in sumFile:
    i['knn_similar'] = knn_similar(i['docVec'].reshape(1, -1))

In [110]:
print(sumFile[0]['most_similar'])
print(sumFile[0]['knn_similar'])

['14', '9', '45', '20', '47']
['28', '18', '31', '32', '15']


In [62]:
print(sumFile[13]['most_similar'])
print(sumFile[13]['knn_similar'])

['28', '20', '42', '47', '35']
['47', '36', '45', '20', '26']


In [46]:
sumFile[0]

{'fileID': '0',
 'summary': ['sentence',
  'project',
  'brentwood',
  'carpool',
  'team',
  'size',
  'environment',
  'java',
  'kotlin',
  'android',
  'firebase',
  'role',
  'team',
  'member',
  'worked',
  'chat',
  'push',
  'notifications',
  'using',
  'firebase',
  'sentence',
  'project',
  'view',
  'box',
  'able',
  'play',
  'videos',
  'like',
  'youtube',
  'displaying',
  'documents',
  'pdfs',
  'audios',
  'team',
  'size',
  'environment',
  'core',
  'java',
  'android',
  'dashboard',
  'add',
  'videos',
  'audios',
  'documents',
  'role',
  'team',
  'member',
  'description',
  'available',
  'add',
  'videos',
  'documents',
  'audios',
  'etc',
  'sentence',
  'project',
  'agri',
  'aiuto',
  'multiple',
  'language',
  'database',
  'support',
  'crop',
  'merchant',
  'pesticides',
  'weather',
  'report',
  'expert',
  'chat',
  'box',
  'functionality',
  'team',
  'size',
  'environment',
  'core',
  'java',
  'android',
  'third',
  'party',
  'lib