# In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
df = pd.read_csv('UpdatedResumeDataSet.csv')

# Check original category distribution
print("Original Category Distribution:")
print(df['Category'].value_counts())

# Oversampling: bring every category up to the size of the largest one by
# sampling with replacement, so rows of smaller categories are duplicated.
max_size = df['Category'].value_counts().max()
# GroupBy.sample returns the sampled rows with all columns intact, including
# 'Category'. The earlier groupby(...).apply(lambda ...) form relied on
# apply() operating on the grouping column, which pandas has deprecated.
# A fixed seed keeps the balanced frame reproducible, in line with the seeded
# train/test split performed later in this script.
balanced_df = df.groupby('Category').sample(n=max_size, replace=True, random_state=42)
# Shuffle so rows are no longer ordered by category, then renumber the index.
df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
print("\nBalanced Category Distribution (After Oversampling):")
print(df['Category'].value_counts())

# Text cleaning function
def cleanResume(txt):
    """Strip URLs, social-media markers, punctuation and non-ASCII from text.

    Steps, in order:
      1. remove URLs (anything starting with ``http`` up to the next space),
      2. remove the standalone tokens ``RT`` and ``cc``,
      3. remove hashtags (``#word``) and mentions (``@word``),
      4. replace ASCII punctuation with spaces,
      5. replace non-ASCII characters with spaces,
      6. collapse every run of whitespace into a single space.

    Args:
        txt: Raw text as a ``str``.

    Returns:
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    # No trailing-whitespace requirement, so a URL or hashtag at the very end
    # of the text is removed too; leftover spaces are collapsed at the end.
    cleanText = re.sub(r'http\S+', ' ', txt)
    # Word boundaries keep "cc"/"RT" inside ordinary words intact
    # (e.g. "success", "account", "CERTIFICATE").
    cleanText = re.sub(r'\b(?:RT|cc)\b', ' ', cleanText)
    cleanText = re.sub(r'#\S+', ' ', cleanText)
    cleanText = re.sub(r'@\S+', ' ', cleanText)
    cleanText = re.sub('[%s]' % re.escape("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"), ' ', cleanText)
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText)
    cleanText = re.sub(r'\s+', ' ', cleanText)
    return cleanText

# Run every resume through the cleaning routine.
df['Resume'] = df['Resume'].apply(cleanResume)

# Turn category names into integer labels; `le` is kept so predictions can be
# mapped back to names.
le = LabelEncoder()
df['Category'] = le.fit_transform(df['Category'])

# Build TF-IDF features (English stop words dropped), learning the
# vocabulary and transforming the corpus in one step.
# NOTE(review): the vectorizer is fitted on all rows, including those that are
# later held out for testing.
tfidf = TfidfVectorizer(stop_words='english')
requredText = tfidf.fit_transform(df['Resume'])

# Hold out 20% of the rows for evaluation (seeded split).
# NOTE(review): the rows were oversampled with replacement before this point,
# so copies of the same resume can land in both the train and the test part;
# scores measured on X_test will likely be optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    requredText,
    df['Category'],
    test_size=0.2,
    random_state=42,
)
# Densify the sparse TF-IDF matrices.
X_train, X_test = X_train.toarray(), X_test.toarray()

# Fit a one-vs-rest SVM on the training part.
svc_model = OneVsRestClassifier(SVC()).fit(X_train, y_train)

# Persisting the fitted objects is currently disabled:
# pickle.dump(tfidf, open('tfidf.pkl', 'wb'))
# pickle.dump(svc_model, open('clf.pkl', 'wb'))
# pickle.dump(le, open('encoder.pkl', 'wb'))

# Prediction function
def pred(input_resume):
    """Predict the category name for a single raw resume text.

    The text is cleaned with ``cleanResume``, vectorised with the
    module-level ``tfidf``, classified by ``svc_model`` and the resulting
    integer label is mapped back to its name through ``le``.

    Args:
        input_resume: Raw resume text.

    Returns:
        The decoded category label for the resume.
    """
    features = tfidf.transform([cleanResume(input_resume)]).toarray()
    encoded_label = svc_model.predict(features)
    decoded_labels = le.inverse_transform(encoded_label)
    # A single document was passed in, so take the only prediction.
    return decoded_labels[0]

# --- Notebook output residue (traceback excerpt, not executable code) ---
#   cleanText = re.sub('http\S+\s', ' ', txt)
#   cleanText = re.sub('#\S+\s', ' ', cleanText)
#   cleanText = re.sub('@\S+', ' ', cleanText)
#   cleanText = re.sub('\s+', ' ', cleanText)
#   cleanText = re.sub('http\S+\s', ' ', txt)
#   cleanText = re.sub('#\S+\s', ' ', cleanText)
#   cleanText = re.sub('@\S+', ' ', cleanText)
#   cleanText = re.sub('\s+', ' ', cleanText)
#
#
# KeyboardInterrupt: