In [1]:
import pandas as pd
import nltk
import re
import spacy
import os
import joblib
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import logging
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb

# NLTK resources used further down (stopwords.words and word_tokenize).
# Uncomment and run once if they are not installed in this environment.
# nltk.download('stopwords')
# nltk.download('punkt')

# Timestamped INFO-level logging, so the duration of each stage can be read off the cell output.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')



# Load the en_core_web_lg spaCy pipeline once; it is later passed to lemmatize_tokens as the model.
nlp = spacy.load("en_core_web_lg")

In [8]:

def clean_text(text):
    """Normalize raw text ahead of tokenization.

    The text is lower-cased, every run of digits is removed, and every run
    of non-word characters (punctuation, whitespace, ...) is replaced by a
    single space. Leading/trailing spaces are not stripped, and underscores
    are kept because ``\\W`` does not match them.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or
            the float ``NaN`` that pandas uses for an empty CSV cell) is
            treated as missing text.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing cells are not strings; without this guard `.lower()`
        # raises AttributeError and aborts the whole `.apply`.
        return ""
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # \W also matches whitespace, so this single pass already collapses every
    # whitespace run into one space; a separate \s+ pass would change nothing.
    text = re.sub(r'\W+', ' ', text)
    return text

def lemmatize_tokens(tokens, model):
    """Return the lemma of every token the model produces for the joined text.

    Args:
        tokens: Iterable of token strings; they are joined with single
            spaces before being handed to ``model``.
        model: Callable that takes one string and returns an iterable of
            objects exposing a ``lemma_`` attribute.

    Returns:
        A list with one ``lemma_`` value per item yielded by ``model``, in
        order. Its length follows the model's own tokenization and may
        differ from ``len(tokens)``.
    """
    sentence = " ".join(tokens)
    lemmas = []
    for parsed in model(sentence):
        lemmas.append(parsed.lemma_)
    return lemmas

# Raw input file (semicolon-delimited CSV); the path is relative to the working directory.
DATASET_PATH = 'Nlp/jupyter_notebook/BIG_DATASET.csv'

# Log the path that is actually read, so the log line cannot drift from the code.
logging.info("Reading %s", DATASET_PATH)
df = pd.read_csv(DATASET_PATH, delimiter=';')

2024-07-08 09:13:54,893 - INFO - Reading dataset.csv


In [9]:

# On-disk cache of the preprocessed frame; delete the file to force a full re-run.
PREPROCESSED_PATH = 'preprocessed_text.csv'

if not os.path.exists(PREPROCESSED_PATH):
    stop_words = set(stopwords.words('english'))
    logging.info("Cleaning and preprocessing text")
    df['text'] = df['text'].apply(clean_text)

    logging.info("tokenizing")
    df['tokens'] = df['text'].apply(word_tokenize)

    logging.info("Removing StopWords")
    df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
    
    logging.info("lemantizing")
    # One spaCy call per row; in the recorded run this was by far the slowest stage.
    df['tokens'] = df['tokens'].apply(lambda x: lemmatize_tokens(x, nlp))
    logging.info("token-lambda")
    # Store tokens as one space-separated string per row so they survive the CSV round trip.
    df['tokens'] = df['tokens'].apply(lambda x: ' '.join(x))
    df.to_csv(PREPROCESSED_PATH, index=False)
    logging.info("Preprocessed text saved to preprocessed_text.csv")
else:
    logging.info("Preprocessed text already exists, skipping preprocessing step")
    df = pd.read_csv(PREPROCESSED_PATH)
    # Rows whose tokens ended up as an empty string are written as empty CSV
    # fields and parsed back as NaN. Restore them to '' so the cached frame
    # matches the freshly computed one and downstream vectorization only
    # ever sees strings.
    df['tokens'] = df['tokens'].fillna('')

2024-07-08 09:13:57,574 - INFO - Cleaning and preprocessing text
2024-07-08 09:13:59,752 - INFO - tokenizando
2024-07-08 09:14:08,162 - INFO - Removendo as StopWords
2024-07-08 09:14:08,436 - INFO - lemantizando
2024-07-08 09:20:34,079 - INFO - token-lambda
2024-07-08 09:20:35,166 - INFO - Preprocessed text saved to preprocessed_text.csv


In [10]:
# Show the preprocessed frame; pandas truncates the rendering to the first and last rows.
df

Unnamed: 0,ID,text,class,class_number,seniority_number,Unnamed: 5,tokens
0,2025,accountant professional summary skills work hi...,ACCOUNTANT,0,1,,accountant professional summary skill work his...
1,2026,accountant i summary a business management gra...,ACCOUNTANT,0,1,,accountant summary business management graduat...
2,2027,investment accountant career focus accomplishe...,ACCOUNTANT,0,1,,investment accountant career focus accomplish ...
3,2028,staff accountant summary professional accounta...,ACCOUNTANT,0,2,,staff accountant summary professional accounta...
4,2029,accountant summary senior level it finance man...,ACCOUNTANT,0,2,,accountant summary senior level finance manage...
...,...,...,...,...,...,...,...
3578,132,b financial manager resume sample ndorothy wil...,MANAGMENT,25,2,,b financial manager resume sample ndorothy wil...
3579,133,b,MANAGMENT,25,0,,b
3580,136,b linda a rinaldi n n n cell nlrinaldi comcast...,MANAGMENT,25,2,,b linda rinaldi n n n cell nlrinaldi comcast n...
3581,144,b assistant general manager resume n nrobert b...,MANAGMENT,25,1,,b assistant general manager resume n nrobert b...


In [14]:
def train(df, test_size=0.1, random_state=42):
    """Train an XGBoost classifier on bag-of-words counts of ``df['tokens']``.

    The frame is split into stratified train/test parts, a ``CountVectorizer``
    is fitted on the training texts only, and an ``XGBClassifier`` is fitted
    on the resulting count matrix to predict ``df['seniority_number']``.

    Args:
        df: DataFrame with a ``tokens`` column (space-separated token
            strings) and a ``seniority_number`` label column.
        test_size: Fraction of rows held out for evaluation. Defaults to
            the original value of 0.1.
        random_state: Seed used for the split and passed to the classifier.
            A fixed default makes the reported accuracy reproducible between
            runs; pass ``None`` to get a different random split each call.

    Returns:
        Tuple ``(model, vectorizer, accuracy)``: the fitted classifier, the
        fitted vectorizer, and the accuracy on the held-out split.
    """
    # Empty token strings come back from a CSV cache as NaN, and
    # CountVectorizer rejects NaN documents, so normalise them to ''.
    texts = df['tokens'].fillna('')
    labels = df['seniority_number']

    X_train, X_test, y_train, y_test = train_test_split(
        texts,
        labels,
        test_size=test_size,
        stratify=labels,
        random_state=random_state,
    )

    # Fit the vocabulary on the training split only to avoid leaking test data.
    vectorizer = CountVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    model = xgb.XGBClassifier(random_state=random_state)
    model.fit(X_train_vec, y_train)

    y_pred = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    return model, vectorizer, accuracy

# Fit the classifier on the preprocessed frame and log its hold-out accuracy.
# NOTE(review): the log message mentions exporting, but nothing is written to disk in this cell.
logging.info("Training and exporting model and vector")
model, vectorizer, accuracy_model = train(df)
logging.info("Model accuracy: {:.2f}".format(accuracy_model))

2024-07-08 09:22:42,403 - INFO - Training and exporting model and vector
2024-07-08 09:23:10,681 - INFO - Model accuracy: 0.74
