# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load Dataset

In [None]:
# Load the first resume dataset; its 'Category' and 'Resume_str' columns are the ones used later on
df1 = pd.read_csv('Resume.csv')

In [None]:
# Load the second resume dataset
# NOTE(review): presumably already exposes 'Category' and 'Resume' columns, since it is
# concatenated with the renamed df1 subset later — TODO confirm
df2 = pd.read_csv('UpdatedResumeDataSet.csv')

In [None]:
df1.head()

In [None]:
df2.head()

In [None]:
# Keep just the label and text columns of df1, exposing 'Resume_str' under the name 'Resume'
df = df1.loc[:, ['Category', 'Resume_str']].rename(columns={'Resume_str': 'Resume'})

In [None]:
df.head()

In [None]:
# NOTE(review): the result of this merge is never used — DF is reassigned by the
# pd.concat call in the following cell. A join on 'Category' alone pairs each row of df
# with every df2 row of the same category, which is presumably not what was wanted —
# TODO confirm and drop this cell.
DF = pd.merge(df, df2, on='Category', how ='left')

In [None]:
# Stack the two resume datasets row-wise and renumber the rows from 0
DF = pd.concat([df, df2], axis=0, ignore_index=True)

In [None]:
# Write the combined dataset to disk; index=False leaves the row index out of the file
DF.to_csv('newCSV.csv',index=False)

In [None]:
# Display the first few rows of the updated DataFrame
DF.head()

In [None]:
DF.shape

# Exploring Categories

In [None]:
DF['Category']

In [None]:
DF['Category'].value_counts()

In [None]:
# Bar chart of the number of resumes per category; tick labels are rotated so
# category names do not overlap
plt.figure(figsize=(15, 5))
sns.countplot(data=DF, x='Category')
plt.xticks(rotation=90)
plt.show()

In [None]:
DF['Category'].unique()

In [None]:
# Pie chart of each category's share of the dataset.
# Labels are taken from the index of `counts` so every wedge is paired with its own
# category: value_counts() orders by frequency while unique() orders by first
# appearance, so combining the two attaches the wrong name to most slices.
counts = DF['Category'].value_counts()
labels = counts.index
plt.figure(figsize=(15,10))
# One colour per category, spread evenly across the plasma colormap
plt.pie(counts, labels=labels, autopct='%1.1f%%', shadow=True, colors=plt.cm.plasma(np.linspace(0, 1, len(counts))))
plt.show()

# Exploring Resume

In [None]:
DF

In [None]:
DF['Category'][0]

In [None]:
DF['Resume'][0]

# Balance Classes (Categories)

In [None]:
# Check the original category distribution
print("Original Category Distribution:")
print(DF['Category'].value_counts())

# Size of the largest category; every category is resampled up to this many rows
max_size = DF['Category'].value_counts().max()

# Oversample each category with replacement up to max_size rows.
# GroupBy.sample keeps every column (including 'Category') and does not rely on
# groupby.apply receiving the grouping column, which newer pandas releases deprecate.
# A fixed seed makes the balanced dataset reproducible between runs.
# NOTE(review): this runs before the train/test split, so duplicated rows can end up
# on both sides of the split and inflate the reported test accuracy.
balanced_DF = (
    DF.groupby('Category')
    .sample(n=max_size, replace=True, random_state=42)
    .reset_index(drop=True)
)

# Shuffle the dataset to avoid any order bias
DF = balanced_DF.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the balanced category distribution
print("\nBalanced Category Distribution (After Oversampling):")
print(DF['Category'].value_counts())

# Cleaning Data

      URLs
      hashtags
      mentions
      special characters
      punctuation

In [None]:


import re
def cleanResume(txt):
    """Return *txt* with URLs, hashtags, mentions and punctuation stripped out.

    Steps, in order:
      1. remove URLs (anything starting with "http" up to the next whitespace),
      2. remove the standalone tokens "RT" and "cc" (whole words only, so words
         such as "access" or "account" are left intact),
      3. remove hashtags and @mentions,
      4. replace ASCII punctuation and every non-ASCII character with a space,
      5. collapse each run of whitespace into a single space.

    Args:
        txt: Raw resume text.

    Returns:
        The cleaned text. Leading/trailing single spaces left behind by removed
        tokens are not stripped.
    """
    # Raw strings keep the regex escapes intact and avoid invalid-escape warnings.
    # URLs and hashtags are matched without requiring trailing whitespace so that
    # they are also removed when they are the last token in the text.
    cleanText = re.sub(r'http\S+', ' ', txt)
    cleanText = re.sub(r'\b(?:RT|cc)\b', ' ', cleanText)
    cleanText = re.sub(r'#\S+', ' ', cleanText)
    cleanText = re.sub(r'@\S+', ' ', cleanText)
    cleanText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText)
    cleanText = re.sub(r'\s+', ' ', cleanText)
    return cleanText



In [None]:
cleanResume("my #### $ #  #noorsaeed webiste like is this http://heloword and access it @gmain.com")

In [None]:
# Normalise the column headers: trim surrounding whitespace, then lowercase them
DF.columns = DF.columns.str.strip().str.lower()
print(DF.columns)

In [None]:
# Clean every resume in place; the column is 'resume' because headers were lowercased above
DF['resume'] = DF['resume'].apply(cleanResume)

In [None]:
DF['resume'] 

In [None]:
DF['resume'] [0]

# Encode category names as integer labels

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
# Learn the category -> integer mapping and overwrite the column with the integer codes
DF['category'] = le.fit(DF['category']).transform(DF['category'])

In [None]:
DF

In [None]:
# Original category names known to the encoder; position i is the name encoded as integer i.
# (`df` only holds the rows of the first dataset, so its categories can miss labels
# that were contributed by df2.)
le.classes_

In [None]:
DF.category.unique()

# Vectorization 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the cleaned resumes, ignoring English stop words.
# The vocabulary is learned from the full corpus and the same text is then encoded.
tfidf = TfidfVectorizer(stop_words='english')
requredTaxt = tfidf.fit(DF['resume']).transform(DF['resume'])

In [None]:
requredTaxt

# Splitting

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the data was oversampled with replacement before this split, so copies
# of the same resume can land in both the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    requredTaxt,
    DF['category'],
    test_size=0.2,
    random_state=42,
)

In [None]:
X_train.shape

In [None]:
X_test.shape

# Now let's train the model and print the classification report:

In [None]:
from sklearn.linear_model import LogisticRegression
# OneVsRestClassifier and accuracy_score are used below but were never imported,
# which made this cell fail with a NameError.
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest logistic regression on the TF-IDF features; prints accuracy on the held-out split
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train)
ypred = clf.predict(X_test)
print(accuracy_score(y_test, ypred))


In [None]:
# 3. Train RandomForestClassifier
# Everything this cell uses is imported here so it runs on its own; none of these
# names were imported anywhere in the notebook before.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.multiclass import OneVsRestClassifier

rf_model = OneVsRestClassifier(RandomForestClassifier())
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("\nRandomForestClassifier Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_rf)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_rf)}")

In [None]:
ypred

# Save files

In [None]:
import pickle

# Persist the fitted vectorizer, classifier and label encoder for later inference.
# `clf` (the logistic-regression model that pred() calls) is saved as 'clf.pkl';
# the previously referenced `svc_model` is not defined anywhere in this notebook.
# Each file is opened in a `with` block so the handle is flushed and closed.
for _path, _obj in (('tfidf.pkl', tfidf), ('clf.pkl', clf), ('encoder.pkl', le)):
    with open(_path, 'wb') as _fh:
        pickle.dump(_obj, _fh)

# Prediction System

In [None]:
# Function to predict the category of a resume
def pred(input_resume):
    """Predict the category name for a single raw resume text.

    The text is cleaned with ``cleanResume``, encoded with the fitted ``tfidf``
    vectorizer, classified with ``clf`` and the resulting integer label is mapped
    back to its category name with ``le``.

    Args:
        input_resume: Raw resume text.

    Returns:
        The predicted category name.
    """
    # Apply the same cleaning that was applied to the training data
    cleaned_text = cleanResume(input_resume)

    # Encode with the vectorizer fitted during training. The sparse row is passed to
    # the model as-is: the classifier was trained on sparse TF-IDF input, so
    # converting to a dense array first is unnecessary.
    vectorized_text = tfidf.transform([cleaned_text])

    # Predict the integer label, then map it back to the category name
    predicted_category = clf.predict(vectorized_text)
    predicted_category_name = le.inverse_transform(predicted_category)

    return predicted_category_name[0]  # Return the category name