In [2]:
import pandas as pd
import re
import spacy
import fitz  
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import joblib
import numpy as np
from classMap import class_mapping
# nltk.download('stopwords')
# nltk.download('punkt')


# Load the `en_core_web_lg` spaCy pipeline once at module level; it is reused
# by lemmatize_tokens() on every call.
nlp = spacy.load("en_core_web_lg")

In [51]:
def clean_text(text):
    """Normalize raw text into lowercase, space-separated word characters.

    The substitutions run in a fixed order on the lowercased input:
    strip ``http...`` URLs, strip ``@mention`` tokens, strip digit runs,
    turn every run of non-word characters into one space, and finally
    collapse whitespace runs. Leading/trailing whitespace is trimmed.

    Args:
        text: The string to normalize.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs; order matters because later patterns
    # operate on the output of earlier ones.
    substitutions = (
        (r'http\S+', ''),
        (r'@\w+', ''),
        (r'\d+', ''),
        (r'\W+', ' '),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [52]:
def lemmatize_tokens(tokens):
    """Return the lemma of every token spaCy finds in the joined input.

    The incoming tokens are joined with single spaces and re-parsed by the
    module-level ``nlp`` pipeline, so the output follows spaCy's own
    tokenization of that joined string.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list with one ``lemma_`` string per spaCy token.
    """
    joined = " ".join(tokens)
    lemmas = []
    for parsed_token in nlp(joined):
        lemmas.append(parsed_token.lemma_)
    return lemmas

In [53]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the document, passed unchanged to ``fitz.open``.

    Returns:
        The ``get_text()`` output of all pages, concatenated in page order
        with no separator added between pages.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raises (for example when
        the file is missing or cannot be parsed).
    """
    # Open the document as a context manager so its file handle is released
    # even if reading a page fails; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)

In [54]:
def preprocess_text(text):
    """Clean, tokenize, stop-word filter and lemmatize ``text``.

    Pipeline: ``clean_text`` -> NLTK ``word_tokenize`` -> drop tokens found
    in NLTK's English stop-word list -> ``lemmatize_tokens``.

    Args:
        text: Raw input string.

    Returns:
        The resulting lemmas joined by single spaces.
    """
    candidate_tokens = word_tokenize(clean_text(text))
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in candidate_tokens:
        if token not in english_stopwords:
            kept_tokens.append(token)
    return " ".join(lemmatize_tokens(kept_tokens))

In [55]:
# Load the model and vectorizer that are later passed to predict_formation().
# Paths are relative to the current working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
model = joblib.load('Nlp/model-class.pkl')
vectorizer = joblib.load('Nlp/vectorizer-class.pkl')


In [56]:


def predict_formation(pdf_path, model, vectorizer):
    """Run the class model on the text of a single PDF.

    The PDF text is extracted, preprocessed, and vectorized as a
    one-document batch before being handed to the model.

    Args:
        pdf_path: Path of the PDF to classify.
        model: Object exposing ``predict`` and ``predict_proba``.
        vectorizer: Object exposing ``transform``.

    Returns:
        A ``(prediction, probabilities)`` tuple holding the results of
        ``model.predict`` and ``model.predict_proba`` for that batch.
    """
    document = preprocess_text(extract_text_from_pdf(pdf_path))
    features = vectorizer.transform([document])
    predicted = model.predict(features)
    class_probabilities = model.predict_proba(features)
    return predicted, class_probabilities


def top_probabilities(probabilities, top_n=5, mapping=None):
    """Format the most probable classes of the first row as percentages.

    Args:
        probabilities: 2-D array-like of class probabilities; only row 0 is
            used.
        top_n: How many of the highest-probability entries to report.
            Defaults to 5.
        mapping: Dict of ``class name -> index`` used to label each entry.
            Defaults to the module-level ``class_mapping``.
            NOTE(review): assumes the mapping's values are column indices of
            ``probabilities`` (i.e. aligned with the model's class order) —
            confirm against how the model was trained.

    Returns:
        A list of ``"<class name>: <percent>%"`` strings (two decimals),
        ordered from most to least probable. An index with no entry in the
        mapping is labelled with the index itself instead of raising.
    """
    if mapping is None:
        mapping = class_mapping

    # Invert the mapping once instead of rescanning it for every index.
    # setdefault keeps the first name when several names share an index,
    # matching the original "first match" lookup.
    index_to_name = {}
    for name, index in mapping.items():
        index_to_name.setdefault(index, name)

    row = probabilities[0]
    top_indices = np.argsort(row)[::-1][:top_n]

    result = []
    for idx in top_indices:
        idx = int(idx)
        class_name = index_to_name.get(idx, str(idx))
        prob_percent = row[idx] * 100
        result.append(f"{class_name}: {prob_percent:.2f}%")
    return result


In [57]:
# Load the model and vectorizer that are later passed to predict_seniority().
# NOTE(review): these artifacts are read from the working directory while the
# class model above is read from 'Nlp/' — confirm the intended location.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
senior_model = joblib.load('model_seniority.pkl')
vectorizer_senior = joblib.load('vectorizer_seniority.pkl')
# NOTE(review): this import sits mid-notebook and uses the 'Nlp.' package
# prefix, unlike `from classMap import class_mapping` in the first cell —
# consider moving it to the import cell and confirming which path is right.
from Nlp.classMapSeniority import class_mapping_seniority

def predict_seniority(pdf_path, model, vectorizer):
    """Predict the seniority label(s) for a single PDF.

    The PDF text is extracted, preprocessed, and vectorized as a
    one-document batch, then classified by ``model``.

    Args:
        pdf_path: Path of the PDF to classify.
        model: Object exposing ``predict``.
        vectorizer: Object exposing ``transform``.

    Returns:
        A list of every name in ``class_mapping_seniority`` whose value
        equals the predicted label. The list is empty when the label has no
        entry in the mapping.
    """
    text = extract_text_from_pdf(pdf_path)
    processed_text = preprocess_text(text)
    X = vectorizer.transform([processed_text])
    # Exactly one document was vectorized, so take its single predicted label
    # and compare scalars, rather than comparing each mapping value against
    # the whole prediction array and relying on one-element array truthiness.
    predicted_label = model.predict(X)[0]
    return [
        name
        for name, label in class_mapping_seniority.items()
        if label == predicted_label
    ]




In [61]:
# Run both predictors on one PDF and print their results.
# NOTE(review): 'path_file.pdf' looks like a placeholder path — confirm.
pdf_path = 'path_file.pdf'
# The PDF is extracted and preprocessed once per predictor call below.
senior = predict_seniority(pdf_path,senior_model,vectorizer_senior)
# `result` (the predicted class) is not used later in this cell; only the
# probabilities are reported.
result,probabilities = predict_formation(pdf_path,model,vectorizer)

print(top_probabilities(probabilities))

print(senior)


['BUSINESS-DEVELOPMENT: 62.71%', 'AVIATION: 12.56%', 'BANKING: 6.47%', 'BPO: 1.50%', 'AGRICULTURE: 1.42%']
['Senior']
