In [5]:
import pandas as pd
import re
import nltk
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import OrdinalEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load dataset
df = pd.read_excel("./consolidated_resumes.xlsx")


# Text preprocessing
nltk.download("stopwords")
nltk.download("wordnet")
lemma = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
# Compiled once so the pattern is not looked up again for every row.
NON_LETTERS = re.compile(r"[^a-zA-Z]")


def clean_text(text):
    """Return ``text`` normalised for bag-of-words vectorisation.

    Every non-letter character is replaced by a space, the result is
    lower-cased and split on whitespace, English stop words are dropped and
    each remaining token is lemmatised. Tokens are re-joined with single
    spaces.

    Args:
        text: Raw text of one document (a ``str``).

    Returns:
        The cleaned, space-separated token string (possibly empty).
    """
    tokens = NON_LETTERS.sub(" ", text).lower().split()
    return " ".join(lemma.lemmatize(word) for word in tokens if word not in stop_words)


# Missing cells are turned into empty strings first: astype(str) on its own
# would convert NaN into the literal text "nan", which passes the cleaning
# steps and would end up as a spurious vocabulary entry.
df['Cleaned Text'] = df['Extracted Text'].fillna("").astype(str).apply(clean_text)

# Initialize and fit CountVectorizer
# NOTE(review): the vocabulary is learned from every row, including the rows
# that later land in the test split -- consider fitting on the training rows only.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Cleaned Text'])

# Save the vectorizer
with open("vectorizer1.pkl", "wb") as vec_file:
    pickle.dump(vectorizer, vec_file)

# Encode labels
# The double brackets pass a one-column DataFrame, which is the 2-D input
# OrdinalEncoder expects.
encoder = OrdinalEncoder()
y = encoder.fit_transform(df[['Category']])

# Save the encoder
with open("label_encoder.pkl", "wb") as enc_file:
    pickle.dump(encoder, enc_file)

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model
def prediction(model):
    """Fit ``model`` on the training split and print train/test accuracy.

    Uses the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``
    variables rather than taking the data as arguments.

    Args:
        model: An estimator exposing ``fit(X, y)`` and ``score(X, y)``.

    Returns:
        The same ``model`` object, after it has been fitted.
    """
    model.fit(X_train, y_train)
    # Accuracy is taken from score(); no separate predict() pass is needed here.
    print("Training Accuracy:", model.score(X_train, y_train))
    print("Testing Accuracy:", model.score(X_test, y_test))
    return model

# Build the XGBoost classifier, fit and evaluate it via prediction(),
# then persist the fitted model next to the vectorizer and encoder.
xgb_params = {
    "reg_lambda": 1,
    "learning_rate": 0.1,
    "max_depth": 3,
    "n_estimators": 70,
}
model = prediction(XGBClassifier(**xgb_params))

with open("clf.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)

print("Vectorizer, Encoder, and Model saved successfully.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Training Accuracy: 1.0
Testing Accuracy: 1.0
Vectorizer, Encoder, and Model saved successfully.
