In [1]:
from dotenv import load_dotenv
import os
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.ai.formrecognizer import DocumentModelAdministrationClient
from azure.core.credentials import AzureKeyCredential
from docx import Document
from pyresparser import ResumeParser
import json

In [2]:
# Carga las variables de entorno desde el archivo .env
load_dotenv()

# Configurar el cliente de Form Recognizer
endpoint = os.getenv("AZURE_FORM_RECOGNIZER_ENDPOINT")
key = os.getenv("AZURE_FORM_RECOGNIZER_KEY")
model_id = "model_1"

credential = AzureKeyCredential(key)
document_model_admin_client = DocumentModelAdministrationClient(endpoint, credential)

In [3]:
def ocr_analysis(doc_name):
    try:
        with open(doc_name, "rb") as fd:
            document = fd.read()
        
        document_analysis_client = DocumentAnalysisClient(
            endpoint=endpoint, credential=AzureKeyCredential(key))
        
        poller = document_analysis_client.begin_analyze_document ("prebuilt-read", document)
        result = poller.result()
        
        # Extract text from OCR result
        extracted_text = str(result.content)
        
        return {'status': 'success', 'extracted_text': extracted_text}
    except Exception as e:
        return {'status': 'error', 'message': str(e)}


In [7]:
ocr = ocr_analysis('CV_Javier_Ingles_IA.pdf')
ocr


{'status': 'success',
 'extracted_text': 'machine learning ia diseño gráfico-web\nSoy Javier Inglés Sánchez, tengo una larga carrera en el sector editorial como diseñador gráfico y maquetador. He dado un giro a mi trayectoria centrándome ahora en otro diseño, pero esta vez el relacionado con el apasionante mundo de los datos y la inteligencia artificial.\neducación\nFP2 Artes Gráficas especialidad Diseño Gráfico. BUP y COU en Colegio España Santa\nEulalia. Certificados SEPE: en\nConfección y Publicación\nde páginas web. Técnico en\nGestión Ambiental. Interfaces y experiencia de usuario (UI Y UX)\nCMS-Ecommerce. PHP y MySQL (Bases de datos). Inglés, nivel medio escrito y hablado. Buena ortografía y redacción de textos.\nexperiencia profesional\nskills\nPython BBDD\nJyra/Trello Miro\nJupyter\nGithub\nGit\nSQL Streamlit\nHTML5\nCSS3\nJavaScript Visual Studio C.\nData Analyst-Data Scientist. Actualmente realizando un\nBootcamp de Inteligencia\nArtificial en Factoría F5.\nRealizando proyect

In [8]:
ocr_string = json.dumps(ocr['extracted_text'])
ocr_string

'"machine learning ia dise\\u00f1o gr\\u00e1fico-web\\nSoy Javier Ingl\\u00e9s S\\u00e1nchez, tengo una larga carrera en el sector editorial como dise\\u00f1ador gr\\u00e1fico y maquetador. He dado un giro a mi trayectoria centr\\u00e1ndome ahora en otro dise\\u00f1o, pero esta vez el relacionado con el apasionante mundo de los datos y la inteligencia artificial.\\neducaci\\u00f3n\\nFP2 Artes Gr\\u00e1ficas especialidad Dise\\u00f1o Gr\\u00e1fico. BUP y COU en Colegio Espa\\u00f1a Santa\\nEulalia. Certificados SEPE: en\\nConfecci\\u00f3n y Publicaci\\u00f3n\\nde p\\u00e1ginas web. T\\u00e9cnico en\\nGesti\\u00f3n Ambiental. Interfaces y experiencia de usuario (UI Y UX)\\nCMS-Ecommerce. PHP y MySQL (Bases de datos). Ingl\\u00e9s, nivel medio escrito y hablado. Buena ortograf\\u00eda y redacci\\u00f3n de textos.\\nexperiencia profesional\\nskills\\nPython BBDD\\nJyra/Trello Miro\\nJupyter\\nGithub\\nGit\\nSQL Streamlit\\nHTML5\\nCSS3\\nJavaScript Visual Studio C.\\nData Analyst-Data Scie

In [47]:
import nltk
from nltk.corpus import stopwords
import re

# download stopwords if necessary
nltk.download('stopwords')
nltk.download('punkt')

# define stopwords list
stop_words = set(stopwords.words('english', 'spanish'))

# define function to clean text
def clean_text(text):
    # convert to lowercase
    text = text.lower()
    # remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # replace \n with a space
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\\n', ' ', text)
    # remove special characters, punctuation, and emojis
    text = re.sub(r'[^\w\s]', '', text)
    # tokenize into words
    words = nltk.word_tokenize(text)
    # remove stopwords
    words = [word for word in words if word not in stop_words]
    # join the words back into a string
    cleaned_text = ' '.join(words)
    return cleaned_text


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DooFromash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DooFromash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [49]:
cleaned_text = clean_text(ocr_string)
cleaned_text

'machine learning ia diseu00f1o gru00e1ficoweb soy javier inglu00e9s su00e1nchez tengo una larga carrera en el sector editorial como diseu00f1ador gru00e1fico maquetador dado un giro mi trayectoria centru00e1ndome ahora en otro diseu00f1o pero esta vez el relacionado con el apasionante mundo de los datos la inteligencia artificial educaciu00f3n fp2 artes gru00e1ficas especialidad diseu00f1o gru00e1fico bup cou en colegio espau00f1a santa eulalia certificados sepe en confecciu00f3n publicaciu00f3n de pu00e1ginas web tu00e9cnico en gestiu00f3n ambiental interfaces experiencia de usuario ui ux cmsecommerce php mysql bases de datos inglu00e9s nivel medio escrito hablado buena ortografu00eda redacciu00f3n de textos experiencia profesional skills python bbdd jyratrello miro jupyter github git sql streamlit html5 css3 javascript visual studio c data analystdata scientist actualmente realizando un bootcamp de inteligencia artificial en factoru00eda f5 realizando proyectos en machine learning d

In [90]:
import spacy
from skill_keywords import skills_list

import spacy

def extract_skills(ocr_string):
    # Load the spaCy model
    nlp = spacy.load('en_core_web_sm')
    

    # Parse the text using spaCy
    doc = nlp(ocr_string)

    # Extract the skill keywords that are present in the text
    skills = []
    for token in doc:
        if token.text.lower() in skills_list:
            skills.append(token.text.lower())

    # Remove duplicate skills
    skills = list(set(skills))
    
    return skills

In [91]:
skills = extract_skills(cleaned_text)
skills

['wordpress',
 'sql',
 'github',
 'ubuntu',
 'indesign',
 'editorial',
 'jupyter',
 'matplotlib',
 'photoshop',
 'ux',
 'python',
 'c',
 'illustrator',
 'visual',
 'linux',
 'numpy',
 'javascript',
 'php',
 'mysql',
 'html5',
 'windows',
 'sublime',
 'sketch',
 'ui']

In [92]:

text = ' '.join(skills)
text

'wordpress sql github ubuntu indesign editorial jupyter matplotlib photoshop ux python c illustrator visual linux numpy javascript php mysql html5 windows sublime sketch ui'

In [93]:
import re

from ftfy import fix_text

def ngrams(string, n=3):
    string = fix_text(string) # fix text
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    string = string.lower()
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title() # normalise case - capital at start of each word
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single
    string = ' '+ string +' ' # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]


In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [95]:
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
tfidf = vectorizer.fit_transform(skills)

In [96]:
import pandas as pd
stopw  = set(stopwords.words('english'))
df =pd.read_csv('job_final.csv') 

In [97]:
df['test']=df['Job_Description'].apply(lambda x: ' '.join([word for word in str(x).split() if len(word)>2 and word not in (stopw)]))
df['test']

0       About company: Smart Food Safe Solutions Inc. ...
1       Location Bangalore Experience Years Job Descri...
2       Open Systems International, Inc. (OSI) www.osi...
3       About Job Software Testing Engineer Job Descri...
4       Location: Bangalore Experience: 6Years Skills ...
                              ...                        
1919    Skills Qualifications: Years experience Strong...
1920    Job TH10519_13189 Posted on: 29th May, 2019Job...
1921    Job Description spend percent lives buildings....
1922    (Job Number: 1905027) Job Title â€“ Web Develo...
1923    marry design engineering language ways produce...
Name: test, Length: 1924, dtype: object

In [98]:
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)
test = (df['test'].values.astype('U'))

In [99]:

def getNearestN(query):
  queryTFIDF_ = vectorizer.transform(query)
  distances, indices = nbrs.kneighbors(queryTFIDF_)
  return distances, indices


In [100]:
distances, indices = getNearestN(test)
test = list(test) 
matches = []


In [101]:
for i,j in enumerate(indices):
    dist=round(distances[i][0],2)
  
    temp = [dist]
    matches.append(temp)
    
matches = pd.DataFrame(matches, columns=['Match confidence'])

In [102]:
from sklearn.preprocessing import MinMaxScaler

# Invert the distances and then scale them to a 0-100% range
scaler = MinMaxScaler(feature_range=(0, 100))
matches['Match confidence'] = scaler.fit_transform(1 - matches[['Match confidence']])

# Format the Match confidence column to include the % sign and show no decimal places
matches['Match confidence'] = matches['Match confidence'].map('{:.0f}%'.format)


In [103]:
df['match']=matches['Match confidence']
df1=df.sort_values('match', ascending= False)
df1[['Position', 'Company','match']].head(10).reset_index()

Unnamed: 0,index,Position,Company,match
0,185,Python Developer (1512),MSC Software,97%
1,203,Python Developer,MSC Software,97%
2,1417,WordPress Developer (1 to 3 years),Ladybird Web Solution,94%
3,1859,JavaScript Developer,Charmboard,94%
4,608,System Engineer,Collasys Global Services LLP,92%
5,556,Hiring for Web Developer -Javascript,IMC LLP,90%
6,8,Software Testing Engineer,Bloom Consulting Services,90%
7,1557,Consumer Banking Technology - UK Consumer Depo...,Goldman Sachs,9%
8,659,Data Scientist,Adobe,9%
9,1558,JavaScript Developer,Tabtor India,9%
