In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Download necessary NLTK data
# 'punkt_tab' is requested in addition to 'punkt': recent NLTK releases
# (3.9+) make word_tokenize look up 'punkt_tab', and without it every call
# raises LookupError. Older releases simply ignore the extra resource.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Load dataset
df = pd.read_csv("resumes_extracted.csv")

# Initialize Lemmatizer
# A single module-level instance is shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()

# Text preprocessing
# Lazily-filled cache so the English stop-word list is read from the NLTK
# corpus once instead of on every preprocess_text call.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE["english"]


def preprocess_text(text):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps: lowercase, replace every non-word character with a space, strip
    digit runs, tokenize, drop English stop words, lemmatize each token.

    Args:
        text: The raw document. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as "no text".

    Returns:
        The cleaned text, or ``""`` when the input is missing or when any
        processing step raises (the error is printed, not propagated).
    """
    # A missing cell must not be stringified: str(None) / str(nan) would
    # otherwise inject the bogus tokens "none" / "nan" into the corpus.
    # `text != text` is the dependency-free NaN test (NaN never equals itself).
    if text is None or (isinstance(text, float) and text != text):
        return ""

    try:
        text = str(text).lower()
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\d+', '', text)
        tokens = word_tokenize(text)
        stop_words = _english_stop_words()
        kept = [
            lemmatizer.lemmatize(word)
            for word in tokens
            if word not in stop_words
        ]
        return " ".join(kept)
    except Exception as e:
        # Deliberately best-effort: one bad document is reported and mapped to
        # an empty string instead of aborting the whole DataFrame.apply.
        print(f"Error processing text: {e}")
        return ""

# Apply preprocessing and persist the cleaned corpus
df["cleaned_resume"] = df["resume_text"].apply(preprocess_text)
df.to_csv("cleaned_resumes.csv", index=False)

# Train-test split
# The split is done on the cleaned text *before* vectorizing, so the TF-IDF
# vocabulary and IDF weights are learned from the training rows only and the
# held-out scores are not inflated by test documents leaking into the features.
texts = df["cleaned_resume"]
y = df["profession"]
text_train, text_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF Vectorization (fit on the training split only)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# The sparse matrices are fed to the models directly. This avoids a dense
# copy of the features and matches the type that tfidf_vectorizer.transform
# returns when the pickled vectorizer is reused.
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Export the features of the whole corpus, as before, using the fitted vectorizer
tfidf_matrix = tfidf_vectorizer.transform(texts)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.to_csv("tfidf_resumes.csv", index=False)
# Name kept in case code outside this section still refers to it.
X = tfidf_df

# Train Naive Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
# zero_division=0 reports 0.0 for classes that received no predictions (the
# same value as the default) without emitting an UndefinedMetricWarning.
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb, zero_division=0))

# Train and tune SVM Model
# One sub-grid per kernel: the linear kernel ignores gamma, so crossing it
# with gamma only repeats identical fits.
param_grid = [
    {"kernel": ["linear"], "C": [0.1, 1, 10]},
    {"kernel": ["rbf"], "C": [0.1, 1, 10], "gamma": ["scale", "auto"]},
]
# NOTE(review): the vectorizer is fitted once on the full training split, so
# the inner CV folds still share IDF statistics. Wrapping the vectorizer and
# SVC in a Pipeline would remove that too.
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best SVM Parameters:", grid_search.best_params_)
best_svm = grid_search.best_estimator_

# Evaluation
y_pred_svm = best_svm.predict(X_test)
print("Tuned SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Tuned SVM Classification Report:\n", classification_report(y_test, y_pred_svm, zero_division=0))

# Save model and vectorizer
# The two artifacts belong together: the SVM was trained on features produced
# by exactly this fitted vectorizer.
joblib.dump(best_svm, "svm_model.pkl")
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")
print("Model and vectorizer saved successfully.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VE00YM679\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\VE00YM679\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VE00YM679\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Naive Bayes Accuracy: 0.5513078470824949
Naive Bayes Classification Report:
                         precision    recall  f1-score   support

            ACCOUNTANT       0.51      0.83      0.63        24
              ADVOCATE       0.57      0.50      0.53        24
           AGRICULTURE       1.00      0.08      0.14        13
               APPAREL       1.00      0.21      0.35        19
                  ARTS       0.67      0.10      0.17        21
            AUTOMOBILE       0.00      0.00      0.00         7
              AVIATION       0.82      0.58      0.68        24
               BANKING       0.73      0.48      0.58        23
                   BPO       0.00      0.00      0.00         4
  BUSINESS-DEVELOPMENT       0.32      0.83      0.46        24
                  CHEF       0.89      0.67      0.76        24
          CONSTRUCTION       0.62      0.59      0.60        22
            CONSULTANT       0.50      0.04      0.08        23
              DESIGNER    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
