In [1]:
from dotenv import load_dotenv
import os
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.ai.formrecognizer import DocumentModelAdministrationClient
from azure.core.credentials import AzureKeyCredential
from docx import Document
from pyresparser import ResumeParser
import json

In [2]:
# Carga las variables de entorno desde el archivo .env
load_dotenv()

# Configurar el cliente de Form Recognizer
endpoint = os.getenv("AZURE_FORM_RECOGNIZER_ENDPOINT")
key = os.getenv("AZURE_FORM_RECOGNIZER_KEY")
model_id = "model_1"

credential = AzureKeyCredential(key)
document_model_admin_client = DocumentModelAdministrationClient(endpoint, credential)

In [3]:
# import json

# def ocr_analysis(doc_name):

#     with open(doc_name + '.pdf', "rb") as fd:
#         document = fd.read()
    
#     document_analysis_client = DocumentAnalysisClient(
#         endpoint=endpoint, credential=AzureKeyCredential(key))
    
#     poller = document_analysis_client.begin_analyze_document ("prebuilt-read", document)
#     result = poller.result()
    
#     # Save dictionary to JSON file
#     with open(doc_name + '.json', 'w') as json_file:
#         json.dump(result.to_dict(), json_file)
    
#     # Load JSON file and extract content to text file
#     with open(doc_name + '.json', 'r') as json_file:
#         pdf = json.load(json_file)
    
#     with open(doc_name + '.txt', 'w') as text_file:
#         # Write the contents of the dictionary to the text file
#         text_file.write(str(pdf['content']))

    

In [4]:
def ocr_analysis(doc_name):
    try:
        with open(doc_name, "rb") as fd:
            document = fd.read()
        
        document_analysis_client = DocumentAnalysisClient(
            endpoint=endpoint, credential=AzureKeyCredential(key))
        
        poller = document_analysis_client.begin_analyze_document ("prebuilt-read", document)
        result = poller.result()
        
        # Extract text from OCR result
        extracted_text = str(result.content)
        
        return {'status': 'success', 'extracted_text': extracted_text}
    except Exception as e:
        return {'status': 'error', 'message': str(e)}


In [5]:
ocr = ocr_analysis('CV.pdf')
ocr


{'status': 'success',
 'extracted_text': 'Kalash Jindal\n3rd year B Tech, Computer Science and Engineering Feroze Gandhi Institute of Engineering and Technology\nEDUCATION\n8.71/10.0 CGPA (Up to 2nd year) Feroze Gandhi Institute Of Engineering and Technology, Raebareli — B tech Computer Science and Engineering 2017 - Till Date\n84.4/100 Percentage Agarwal Public Inter College, Sitapur — Intermediate 2016\n298, Thomsanganj, Sitapur Uttar Pradesh, India +91-7839453651 jindalkalash298@gmail.com www.linkedin.com/in/kalashj16/\nSKILLS\nPython, Statistics, Data Handling, Data Visualization, Linear Algebra, Neural Networks, Transfer Learning, Feature Extraction, Deep Learning, Sci-kit Learn, Keras, OpenCV, SQLite, Html, CSS, GUI using Pyqt module, Git GitHub, C, C++, etc.\nHOBBIES Table Tennis, Cooking, Reading Books, Web Surfing, etc.\n90.1/100 Percentage Agarwal Public Inter College, Sitapur — High School 2014\nTRAINING\nMachine Learning Master Course — Coding Blocks https://online.codingbl

In [6]:
ocr_string = json.dumps(ocr['extracted_text'])
ocr_string

'"Kalash Jindal\\n3rd year B Tech, Computer Science and Engineering Feroze Gandhi Institute of Engineering and Technology\\nEDUCATION\\n8.71/10.0 CGPA (Up to 2nd year) Feroze Gandhi Institute Of Engineering and Technology, Raebareli \\u2014 B tech Computer Science and Engineering 2017 - Till Date\\n84.4/100 Percentage Agarwal Public Inter College, Sitapur \\u2014 Intermediate 2016\\n298, Thomsanganj, Sitapur Uttar Pradesh, India +91-7839453651 jindalkalash298@gmail.com www.linkedin.com/in/kalashj16/\\nSKILLS\\nPython, Statistics, Data Handling, Data Visualization, Linear Algebra, Neural Networks, Transfer Learning, Feature Extraction, Deep Learning, Sci-kit Learn, Keras, OpenCV, SQLite, Html, CSS, GUI using Pyqt module, Git GitHub, C, C++, etc.\\nHOBBIES Table Tennis, Cooking, Reading Books, Web Surfing, etc.\\n90.1/100 Percentage Agarwal Public Inter College, Sitapur \\u2014 High School 2014\\nTRAINING\\nMachine Learning Master Course \\u2014 Coding Blocks https://online.codingblocks.

In [53]:
import nltk
from nltk.corpus import stopwords
import re

# download stopwords if necessary
nltk.download('stopwords')

# define stopwords list
stop_words = set(stopwords.words('english', 'spanish'))

# define function to clean text
def clean_text(text):
    # convert to lowercase
    text = text.lower()
    # remove URLs
    text = re.sub(r'http\S+', '', text)
    # remove special characters, punctuation, and emojis
    text = re.sub(r'[^\w\s]','',text)
    # tokenize into words
    words = nltk.word_tokenize(text)
    # remove stopwords
    words = [word for word in words if word not in stop_words]
    # join the words back into a string
    cleaned_text = ' '.join(words)
    return cleaned_text


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DooFromash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
cleaned_text = clean_text(ocr_string)

In [50]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import emoji

import spacy

nlp = spacy.load("en_core_web_sm")

def extract_skills(text):
    skills = []
    doc = nlp(text.lower())
    for chunk in doc.noun_chunks:
        if chunk.root.pos_ == "NOUN":
            for token in chunk:
                if token.pos_ == "ADJ" or token.pos_ == "NOUN":
                    skills.append(token.text)
    return list(set(skills))



In [57]:

skills = extract_skills(cleaned_text)
skills

['global',
 'person',
 'evaluation',
 'crossentropyntext',
 'course',
 'accuracy',
 'loss',
 'images',
 'patients',
 'feature',
 'decision',
 'places',
 'engineering',
 'classifier',
 'dataset',
 'lead',
 'humans',
 'correct',
 'element',
 'works',
 'representation',
 'reviewers',
 'diabetic',
 'highest',
 'word2vec',
 'certificate',
 'question',
 'principle',
 'game',
 'country',
 'model',
 'generation',
 'classification',
 'people',
 'work',
 'tech',
 'categorical',
 'daily',
 'different',
 'visualization',
 'concept',
 'features',
 'similar',
 'data',
 'vector',
 'network',
 'embeddingvector',
 'database',
 'trees',
 'management',
 'diabetics',
 'analysis',
 'support',
 'dream',
 'rating',
 'table',
 'extracting',
 'machine',
 'movie',
 'scikit',
 'training',
 'prediction',
 'pole',
 'piece',
 'line',
 'networks',
 'regression',
 'wordsnbollywood',
 'master',
 'angle',
 'accuracynseparating',
 'certain',
 'negative',
 'deep',
 'classesnodd',
 'horse',
 'student',
 'score',
 'air',
 

In [69]:
import re

from ftfy import fix_text

def ngrams(string, n=3):
    string = fix_text(string) # fix text
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    string = string.lower()
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title() # normalise case - capital at start of each word
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single
    string = ' '+ string +' ' # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]


In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [86]:
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
tfidf = vectorizer.fit_transform(skills)

In [87]:
import pandas as pd
stopw  = set(stopwords.words('english'))
df =pd.read_csv('job_final.csv') 

In [88]:
df['test']=df['Job_Description'].apply(lambda x: ' '.join([word for word in str(x).split() if len(word)>2 and word not in (stopw)]))
df['test']

0       About company: Smart Food Safe Solutions Inc. ...
1       Location Bangalore Experience Years Job Descri...
2       Open Systems International, Inc. (OSI) www.osi...
3       About Job Software Testing Engineer Job Descri...
4       Location: Bangalore Experience: 6Years Skills ...
                              ...                        
1919    Skills Qualifications: Years experience Strong...
1920    Job TH10519_13189 Posted on: 29th May, 2019Job...
1921    Job Description spend percent lives buildings....
1922    (Job Number: 1905027) Job Title â€“ Web Develo...
1923    marry design engineering language ways produce...
Name: test, Length: 1924, dtype: object

In [89]:
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)
test = (df['test'].values.astype('U'))

In [90]:

def getNearestN(query):
  queryTFIDF_ = vectorizer.transform(query)
  distances, indices = nbrs.kneighbors(queryTFIDF_)
  return distances, indices


In [91]:
distances, indices = getNearestN(test)
test = list(test) 
matches = []


In [92]:
distances, indices = getNearestN(test)
test = list(test) 
matches = []


In [93]:
df['match']=matches['Match confidence']
df1=df.sort_values('match')
df1[['Position', 'Company','Location','match']].head(10).reset_index()

TypeError: list indices must be integers or slices, not str