In [14]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import re


In [15]:
#Loading the Docx file
from docx import Document

def load_docx(file_path):
    try:
        return Document(file_path)
    except Exception as e:
        print(f"Error loading document: {e}")
        return None

doc = load_docx('C:\\Users\\panda\\Documents\\Resume.docx')
if not doc:
    exit()

In [16]:
import os
import pandas as pd

def load_resumes(data_path):
    texts = []
    labels = []
    
    # Check if the path exists
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"The path {data_path} does not exist")
    
    # Check if it's a directory
    if not os.path.isdir(data_path):
        raise NotADirectoryError(f"{data_path} is not a directory")
    
    for role in os.listdir(data_path):  # Loop over folders (job roles)
        role_path = os.path.join(data_path, role)
        
        # Skip if not a directory
        if not os.path.isdir(role_path):
            continue
            
        for file in os.listdir(role_path):  # Loop over files inside each folder
            file_path = os.path.join(role_path, file)
            
            # Skip if not a file
            if not os.path.isfile(file_path):
                continue
                
            # Skip non-text files (optional - adjust extensions as needed)
            if not file.endswith(('.txt', '.pdf', '.docx', '.doc')):
                continue
                
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
                    texts.append(text)    # Store the resume text
                    labels.append(role)   # Store the folder name as label
            except UnicodeDecodeError:
                # Try with a different encoding if utf-8 fails
                try:
                    with open(file_path, 'r', encoding='latin-1') as f:
                        text = f.read()
                        texts.append(text)
                        labels.append(role)
                except Exception as e:
                    print(f"Could not read file {file_path}: {e}")
            except Exception as e:
                print(f"Error processing {file_path}: {e}")
    
    if not texts:
        print("Warning: No resume files were loaded")
        
    return pd.DataFrame({'resume': texts, 'label': labels})

# Call the function
try:
    df = load_resumes('E:\\Academic\\NLP\\Resumes')
    print(f"Successfully loaded {len(df)} resumes across {df['label'].nunique()} job categories")
except Exception as e:
    print(f"Failed to load resumes: {e}")

Successfully loaded 25 resumes across 7 job categories


In [17]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = re.sub(r'\W+', ' ', text)           # Remove special characters
    text = text.lower()                        # Lowercase
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

df['cleaned'] = df['resume'].apply(preprocess)
df.head(10)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\panda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,resume,label,cleaned
0,"PK�����!�#ûp��""���[Content_Types]....",Software engineer,pk ûp content_types xml ënã0 e hücä jü² 5í ç q...
1,"PK�����!�#ûp��""���[Content_Types]....",Software engineer,pk ûp content_types xml ënã0 e hücä jü² 5í ç q...
2,"PK�����!�#ûp��""���[Content_Types]....",Software engineer,pk ûp content_types xml ënã0 e hücä jü² 5í ç q...
3,"PK�����!�#ûp��""���[Content_Types]....",Software engineer,pk ûp content_types xml ënã0 e hücä jü² 5í ç q...
4,"PK�����!�#ûp��""���[Content_Types]....",Data Analyst,pk ûp content_types xml ënã0 e hücä jü² 5í ç q...
5,"PK�����!�#ûp��""���[Content_Types]....",Data Analyst,pk ûp content_types xml ënã0 e hücä jü² 5í ç q...
6,"PK�����!�#ûp��""���[Content_Types]....",Data Analyst,pk ûp content_types xml ënã0 e hücä jü² 5í ç q...
7,PK�����!�d-e�����[Content_Types]....,Data Analyst,pk e content_types xml ënã0 e hücä jü² 5í ç q ...
8,"PK�����!�#ûp��""���[Content_Types]....",Marketing Manager,pk ûp content_types xml ënã0 e hücä jü² 5í ç q...
9,"PK�����!�#ûp��""���[Content_Types]....",Marketing Manager,pk ûp content_types xml ënã0 e hücä jü² 5í ç q...


In [18]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer(max_features=1000)
BOW_matrix = v.fit_transform(df['cleaned'])
print(v.get_feature_names_out())  
print(BOW_matrix.toarray())  

['03ïã' '0a²' '0bv' '0ª8s' '0º' '0þaé6' '0þtc' '14éæ' '1i' '1iö' '1xöpâ'
 '1ää' '1ì' '1ím' '1õöjh' '2r' '2î' '2ù' '2ü' '3bg¾' '3sngáw' '3yçé3cö'
 '3²àèx2üô1r' '3á' '3ãäh' '3ù' '4eçù' '4iúãúiïæq' '4ndõb' '4onb' '4¾ýñ'
 '4ßþù' '4ãn' '4ë' '4ü' '50' '5kc5y' '5q' '5ì' '5í' '5ðº' '5òhòk' '6hy'
 '6kkx' '6ng' '6ye4e' '6ê' '6ï' '6ó' '6þµ' '7jun' '7v' '7z' '7õ'
 '7õkpúllá' '8g' '8áòqsl' '8æôq¹' '8é' '8õ' '9fý0' '9àé' '9òm' '9ý' '_mµ'
 '_rels' '_y' '_²' '_áo' '_ó' '_ôx' '_öj' '_ù' 'a1' 'aa' 'acf' 'aj' 'am1'
 'aolëµj' 'ap' 'app' 'as¹' 'aª' 'a½' 'aß' 'aåïzaí' 'aç½' 'aè' 'aë' 'aí'
 'að½' 'aðäº' 'aó' 'aôðözsw' 'aü' 'b0z' 'bc' 'bjtçö' 'buwêþö' 'bys' 'b¼'
 'bà' 'bã' 'bå' 'bç' 'bçr' 'bé¾' 'bê' 'bíð' 'bò' 'bòþ' 'bùq' 'bú' 'c2û'
 'c3' 'cb' 'ci' 'content_types' 'core' 'cs' 'custom' 'cy' 'cz' 'cìü¾bã³'
 'cí' 'cðû' 'có' 'da' 'dc' 'djcî' 'dk' 'docprops' 'document' 'dsrcý' 'du'
 'du¾têoj¾k' 'dw' 'dx' 'dyos' 'd²âe' 'd³' 'dèmdý' 'dê' 'dí' 'dî' 'dð'
 'dðó' 'dñ' 'dö' 'döå' 'dù' 'dý' 'e_' 'ea' 'ef' 'ei' 'ej' 'ek' '

In [19]:
#TF-ID
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)  # top 1000 important words
X = vectorizer.fit_transform(df['cleaned'])
y = df['label']
print(vectorizer.get_feature_names_out()) # Shows unique words
print(X.toarray()) # Displays TF-IDF values


['03ïã' '0a²' '0bv' '0ª8s' '0º' '0þaé6' '0þtc' '14éæ' '1i' '1iö' '1xöpâ'
 '1ää' '1ì' '1ím' '1õöjh' '2r' '2î' '2ù' '2ü' '3bg¾' '3sngáw' '3yçé3cö'
 '3²àèx2üô1r' '3á' '3ãäh' '3ù' '4eçù' '4iúãúiïæq' '4ndõb' '4onb' '4¾ýñ'
 '4ßþù' '4ãn' '4ë' '4ü' '50' '5kc5y' '5q' '5ì' '5í' '5ðº' '5òhòk' '6hy'
 '6kkx' '6ng' '6ye4e' '6ê' '6ï' '6ó' '6þµ' '7jun' '7v' '7z' '7õ'
 '7õkpúllá' '8g' '8áòqsl' '8æôq¹' '8é' '8õ' '9fý0' '9àé' '9òm' '9ý' '_mµ'
 '_rels' '_y' '_²' '_áo' '_ó' '_ôx' '_öj' '_ù' 'a1' 'aa' 'acf' 'aj' 'am1'
 'aolëµj' 'ap' 'app' 'as¹' 'aª' 'a½' 'aß' 'aåïzaí' 'aç½' 'aè' 'aë' 'aí'
 'að½' 'aðäº' 'aó' 'aôðözsw' 'aü' 'b0z' 'bc' 'bjtçö' 'buwêþö' 'bys' 'b¼'
 'bà' 'bã' 'bå' 'bç' 'bçr' 'bé¾' 'bê' 'bíð' 'bò' 'bòþ' 'bùq' 'bú' 'c2û'
 'c3' 'cb' 'ci' 'content_types' 'core' 'cs' 'custom' 'cy' 'cz' 'cìü¾bã³'
 'cí' 'cðû' 'có' 'da' 'dc' 'djcî' 'dk' 'docprops' 'document' 'dsrcý' 'du'
 'du¾têoj¾k' 'dw' 'dx' 'dyos' 'd²âe' 'd³' 'dèmdý' 'dê' 'dí' 'dî' 'dð'
 'dðó' 'dñ' 'dö' 'döå' 'dù' 'dý' 'e_' 'ea' 'ef' 'ei' 'ej' 'ek' '

In [20]:

#Training the model with resumes in logistic regression
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

#Evaluating the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                   precision    recall  f1-score   support

     Data Analyst       0.00      0.00      0.00       0.0
 Graphic Designer       0.00      0.00      0.00       1.0
Marketing Manager       0.00      0.00      0.00       2.0
       Operations       0.00      0.00      0.00       0.0
  Product Manager       0.00      0.00      0.00       1.0
Software engineer       0.00      0.00      0.00       1.0

         accuracy                           0.00       5.0
        macro avg       0.00      0.00      0.00       5.0
     weighted avg       0.00      0.00      0.00       5.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
#Predict New Resume
def predict_resume(text):
    cleaned = preprocess(text)
    vector = vectorizer.transform([cleaned])
    return model.predict(vector)[0]

In [22]:

print("Give the skills you have:")  
new_resume = input()
print("Entered Skills:", new_resume)
print("Predicted Role:", predict_resume(new_resume))

Give the skills you have:
Entered Skills: Tableau
Predicted Role: Data Analyst
