# Dependencies and Required Packages

In [1]:
!pip install pdfminer.six
!pip install nltk

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
from pdfminer.high_level import extract_text
import nltk
import re
import subprocess
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import json

# Generating Text Data From PDF File

In [4]:
def extract_text_from_pdf(pdf_path):
    text = extract_text(pdf_path)
    return text

txt = extract_text_from_pdf("/content/Rajat's Resume.pdf")


In [5]:
print(txt)

Rajat Agarwaal
Data Scientist
Skilled Data Scientist with 2.5 years of experience executing data driven solutions to increase eﬃciency,
accuracy and utility of internal data processing. Experienced at creating regression models, classiﬁcation
models  using  predictive  modelling,  Computer  Vision  and  analyzing  data  mining  algorithms  to  deliver
insights and implement action oriented solutions to complex business problems.

rajatagarwaal30@gmail.com

Ghaziabad (NCR Region), India

linkedin.com/in/rajat-agarwaal

WORK EXPERIENCE

Machine Learning Engineer
AgEYE Technologies
01/2021 - Present, 

Achievements/Tasks

09958168687

30 March, 1994

SKILLS

Python

C

Machine Learning

Deep Learning

Bangalore, India

Computer Vision

Natural Language Processing

SQL

Pig

Hive

Basics of HDFS

Undertaking data collection, preprocessing and analysis

Building models to address business problems

Statistical Analysis

Gitlab

Tableau

Propose solutions and strategies to business challenge

# Generating Name using NER

In [6]:
def extract_names(txt):
    person_names = []

    for sent in nltk.sent_tokenize(txt):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
                person_names.append(
                    ' '.join(chunk_leave[0] for chunk_leave in chunk.leaves())
                )

    return person_names

In [7]:
names = extract_names(txt)

In [8]:
names

['Rajat',
 'Agarwaal Data Scientist Skilled Data Scientist',
 'Machine Learning Engineer',
 'Machine Learning Deep Learning Bangalore',
 'Gitlab Tableau',
 'Machine Learning',
 'Python',
 'Engineer Ericsson Global India Pvt',
 'Uttar Pradesh Built',
 'Applied ML',
 'Machine Learning',
 'Maharashtra PROJECTS Instance',
 'Instance Segmentation',
 'Basil',
 'Lettuce',
 'Hemp Image',
 'Custom CNN Architecture',
 'Stem Girth',
 'Plant Height',
 'Fresh Weight',
 'Dry Weight',
 'Basil',
 'Lettuce Classes',
 'Customer Churn Analysis',
 'Salary Advance Product',
 'Objective',
 'Random',
 'Marketing Campaign',
 'Silhouette Score',
 'Greater Noida',
 'Uttar Pradesh',
 'Python',
 'Sql',
 'Hacker Rank']

In [9]:
name_candidate=names[0] + ' ' + names[1].split(' ')[0]

# Extracting Phone Number

In [10]:
PHONE_REG_IND = re.compile(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]')

PHONE_REG_USA = re.compile(r'/^\(?(\d{3})\)?[-]?(\d{3})[-]?(\d{4})$/')

In [11]:
def extract_phone_number(resume_text):
    phone = re.findall(PHONE_REG_IND, resume_text)

    if phone:
        number = ''.join(phone[0])

        if resume_text.find(number) >= 0 and len(number) < 16:
            return number
    return None
 
phone_number_ind = extract_phone_number(txt)
print(phone_number_ind)

9958168687


In [12]:
def extract_phone_number(resume_text):
    phone = re.findall(PHONE_REG_USA, resume_text)

    if phone:
        if resume_text.find(phone) >= 0:
            return phone
    return None
 
phone_number_usa = extract_phone_number(txt)
print(phone_number_usa)

None


# Extracting EMAIL ID

In [13]:
EMAIL_REG = re.compile(r'[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+')

In [14]:
def extract_emails(resume_text):
    return re.findall(EMAIL_REG, resume_text)


In [15]:
emails = extract_emails(txt)

if emails:
    print(emails)
    

['rajatagarwaal30@gmail.com']


#### General Info Dictionary

In [16]:
general_dict={ 'Name' : name_candidate.upper(),
              'email' : emails ,
              'contact' : phone_number_ind
    
}

general_dict

{'Name': 'RAJAT AGARWAAL',
 'contact': '9958168687',
 'email': ['rajatagarwaal30@gmail.com']}

# Extracting Skills from the Text

In [17]:
SKILLS_DB = [
    'machine learning',
    'data science',
    'python',
    'word',
    'excel',
    'english',
    'sql',
    'deep learning',
    'nlp',
    'natural language processing',
    'computer vision',
    'pig',
    'statistical analysis',
    'gitlab',
    'tableau'
]

### Skills Extraction Loop

In [18]:
from collections import defaultdict
skills_dict = {}

    


def extract_skills(input_text):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    word_tokens = nltk.tokenize.word_tokenize(input_text)

    # remove the stop words
    filtered_tokens = [w for w in word_tokens if w not in stop_words]

    # remove the punctuation
    filtered_tokens = [w for w in word_tokens if w.isalpha()]

    # generate bigrams and trigrams (such as artificial intelligence)
    bigrams_trigrams = list(map(' '.join, nltk.everygrams(filtered_tokens, 1, 3)))

    # we create a set to keep the results in.
    found_skills = set()

    # we search for each token in our skills database
    for token in filtered_tokens:
        if token.lower() in SKILLS_DB:
            found_skills.add(token)
    
    # we search for each bigram and trigram in our skills database
    for ngram in bigrams_trigrams:
        if ngram.lower() in SKILLS_DB:
            found_skills.add(ngram)
            
#     print(found_skills)

#     return found_skills
    for skill in found_skills :
      if skill.upper() not in skills_dict.keys():
          skill = skill.lower()
          cnt = 0
          for i in bigrams_trigrams:
              i = i.lower()
              if skill in i:
                  cnt += 1
          print(skill.upper(), ' is repeated ' , cnt, ' times.')
          skills_dict[skill.upper()]= cnt
#     print(list(bigrams_trigrams))
            

In [19]:
extract_skills(txt)

# print(skills) 

GITLAB  is repeated  6  times.
MACHINE LEARNING  is repeated  15  times.
PYTHON  is repeated  24  times.
SQL  is repeated  18  times.
PIG  is repeated  6  times.
NATURAL LANGUAGE PROCESSING  is repeated  1  times.
STATISTICAL ANALYSIS  is repeated  3  times.
DATA SCIENCE  is repeated  3  times.
TABLEAU  is repeated  6  times.
DEEP LEARNING  is repeated  3  times.
COMPUTER VISION  is repeated  9  times.


In [20]:
skills_dict

{'COMPUTER VISION': 9,
 'DATA SCIENCE': 3,
 'DEEP LEARNING': 3,
 'GITLAB': 6,
 'MACHINE LEARNING': 15,
 'NATURAL LANGUAGE PROCESSING': 1,
 'PIG': 6,
 'PYTHON': 24,
 'SQL': 18,
 'STATISTICAL ANALYSIS': 3,
 'TABLEAU': 6}

In [21]:
general_dict["skills"]=skills_dict

### Final Information

In [22]:
general_dict

{'Name': 'RAJAT AGARWAAL',
 'contact': '9958168687',
 'email': ['rajatagarwaal30@gmail.com'],
 'skills': {'COMPUTER VISION': 9,
  'DATA SCIENCE': 3,
  'DEEP LEARNING': 3,
  'GITLAB': 6,
  'MACHINE LEARNING': 15,
  'NATURAL LANGUAGE PROCESSING': 1,
  'PIG': 6,
  'PYTHON': 24,
  'SQL': 18,
  'STATISTICAL ANALYSIS': 3,
  'TABLEAU': 6}}

#### Importing Base file to be written


In [23]:
with open('base.json', 'r') as openfile:
  
    # Reading from json file
    json_object_ip = json.load(openfile)
  
print(json_object_ip)
print(type(json_object_ip))

{'id1': {'Name': 'RAJAT AGARWAAL', 'email': ['rajatagarwaal30@gmail.com'], 'contact': '9958168687', 'skills': {'STATISTICAL ANALYSIS': 3, 'SQL': 18, 'DEEP LEARNING': 3, 'TABLEAU': 6, 'MACHINE LEARNING': 15, 'GITLAB': 6, 'PYTHON': 24, 'DATA SCIENCE': 3, 'PIG': 6, 'NATURAL LANGUAGE PROCESSING': 1, 'COMPUTER VISION': 9}}}
<class 'dict'>


In [24]:
json_object_ip

{'id1': {'Name': 'RAJAT AGARWAAL',
  'contact': '9958168687',
  'email': ['rajatagarwaal30@gmail.com'],
  'skills': {'COMPUTER VISION': 9,
   'DATA SCIENCE': 3,
   'DEEP LEARNING': 3,
   'GITLAB': 6,
   'MACHINE LEARNING': 15,
   'NATURAL LANGUAGE PROCESSING': 1,
   'PIG': 6,
   'PYTHON': 24,
   'SQL': 18,
   'STATISTICAL ANALYSIS': 3,
   'TABLEAU': 6}}}

In [25]:
final_dict={}
final_dict_temp={}
new_id=len(json_object_ip.keys())+1
#new_id=1
print(" Id number {0} is generated".format(new_id))
final_dict_temp['id'+str(new_id)]=general_dict

final_dict={**json_object_ip , **final_dict_temp}

 Id number 2 is generated


In [26]:

final_dict

{'id1': {'Name': 'RAJAT AGARWAAL',
  'contact': '9958168687',
  'email': ['rajatagarwaal30@gmail.com'],
  'skills': {'COMPUTER VISION': 9,
   'DATA SCIENCE': 3,
   'DEEP LEARNING': 3,
   'GITLAB': 6,
   'MACHINE LEARNING': 15,
   'NATURAL LANGUAGE PROCESSING': 1,
   'PIG': 6,
   'PYTHON': 24,
   'SQL': 18,
   'STATISTICAL ANALYSIS': 3,
   'TABLEAU': 6}},
 'id2': {'Name': 'RAJAT AGARWAAL',
  'contact': '9958168687',
  'email': ['rajatagarwaal30@gmail.com'],
  'skills': {'COMPUTER VISION': 9,
   'DATA SCIENCE': 3,
   'DEEP LEARNING': 3,
   'GITLAB': 6,
   'MACHINE LEARNING': 15,
   'NATURAL LANGUAGE PROCESSING': 1,
   'PIG': 6,
   'PYTHON': 24,
   'SQL': 18,
   'STATISTICAL ANALYSIS': 3,
   'TABLEAU': 6}}}

### JSON Output after appending to base file

In [27]:

  
# Serializing json 
json_object_op = json.dumps(final_dict, indent = 4)
  
# Writing to sample.json
with open("base.json", "w") as outfile:
    outfile.write(json_object_op)