In [1]:
#!pip install pywin32 
#!pip install pdf2docx

In [2]:
# Convert all the doc files to docx files
import os
import win32com.client

def convert_doc_to_docx(doc_path):
    """Converts a .doc file to .docx in the same directory."""
    try:
        word = win32com.client.Dispatch("Word.Application")
        word.Visible = False  # Run in the background
        
        docx_path = doc_path + "x"  # Append 'x' to get .docx
        if os.path.exists(docx_path):
            print(f"Skipped (already exists): {docx_path}")
            return

        print(f"Converting: {doc_path} -> {docx_path}")
        doc = word.Documents.Open(doc_path)
        doc.SaveAs(docx_path, FileFormat=16)  # FileFormat=16 = .docx
        doc.Close()
        word.Quit()
        
        print(f"Converted successfully: {docx_path}")
    except Exception as e:
        print(f"Error converting {doc_path}: {e}")
    os.remove(doc_path)


def process_folders(root_folder):
    """Scans for .doc files in all subfolders and converts them."""
    for foldername, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.lower().endswith(".doc") and not filename.lower().endswith(".docx"):
                doc_path = os.path.join(foldername, filename)
                convert_doc_to_docx(doc_path)

# Example usage: Change this to your root folder containing all 4 subfolders
root_folder = r"C:\Resumes_Docx"  # Update this path
process_folders(root_folder)

In [3]:
# Convert all the pdfs into docx
import os
from pdf2docx import Converter

def convert_pdf_to_docx(pdf_path):
    """Converts a .pdf file to .docx in the same directory."""
    try:
        docx_path = os.path.splitext(pdf_path)[0] + ".docx"  # Change extension to .docx
        
        if os.path.exists(docx_path):
            print(f"Skipped (already exists): {docx_path}")
            return
        
        print(f"Converting: {pdf_path} -> {docx_path}")
        cv = Converter(pdf_path)
        cv.convert(docx_path, start=0, end=None)
        cv.close()
        
        print(f"Converted successfully: {docx_path}")
    except Exception as e:
        print(f"Error converting {pdf_path}: {e}")
    os.remove(pdf_path)

def process_folders(root_folder):
    """Scans for .pdf files in all subfolders and converts them."""
    for foldername, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.lower().endswith(".pdf"):
                pdf_path = os.path.join(foldername, filename)
                convert_pdf_to_docx(pdf_path)

# Example usage: Change this to your root folder containing all 4 subfolders
root_folder = r"C:\Resumes_Docx" # Update this path
process_folders(root_folder)

In [4]:
# Getting the data frame having path 
import os
import pandas as pd

def get_all_file_paths(folder_path):
    """Returns a DataFrame containing all file paths in the given folder (including subdirectories)."""
    file_paths = []

    # Walk through directory and subdirectories
    for root, _, files in os.walk(folder_path):
        for file in files:
            file_paths.append(os.path.join(root, file))  # Construct full file path

    # Create a DataFrame
    df = pd.DataFrame(file_paths, columns=["file_path"])
    return df

# Example usage
folder_path = "C:\Resumes_Docx"# Replace with your folder path
df = get_all_file_paths(folder_path)

# Display DataFrame
df

  folder_path = "C:\Resumes_Docx"# Replace with your folder path
[INFO] NumExpr defaulting to 8 threads.


Unnamed: 0,file_path
0,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...
1,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...
2,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...
3,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...
4,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...
...,...
75,C:\Resumes_Docx\workday\Sri Krishna S_Hexaware...
76,C:\Resumes_Docx\workday\Srikanth-Hexaware.docx
77,C:\Resumes_Docx\workday\SSKumar_Hexaware.docx
78,C:\Resumes_Docx\workday\Venkateswarlu B_Hexawa...


In [5]:
import re
from docx import Document

def extract_text_from_docx(docx_path):
    """Extracts raw text from a .docx file, including tables, and removes extra spaces/newlines/tabs."""
    try:
        doc = Document(docx_path)
        
        # Extract text from paragraphs
        text = [para.text.strip() for para in doc.paragraphs if para.text.strip()]

        # Extract text from tables
        for table in doc.tables:
            for row in table.rows:
                row_text = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                if row_text:
                    text.append(" | ".join(row_text))  # Join table cells with "|"

        # Join text with space and clean up excessive whitespace
        cleaned_text = " ".join(text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple spaces/newlines/tabs with a single space
        
        return cleaned_text.strip()
    
    except Exception as e:
        return f"Error reading DOCX: {e}"


In [6]:
df['text'] = df['file_path'].apply(extract_text_from_docx)
df.head()

Unnamed: 0,file_path,text
0,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,Anubhav Kumar Singh To work in a globally comp...
1,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,Profile Summary: 7+ years of experience in imp...
2,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,PeopleSoft Database Administrator Gangareddy P...
3,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,Murali Experience Summary I have 6 years of ex...
4,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,"Priyanka Ramadoss 61/46, MountPleasant, Coonoo..."


In [7]:
df['target'] = df['file_path'].apply(lambda x: x.split('\\')[2])

In [8]:
df.head()

Unnamed: 0,file_path,text,target
0,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,Anubhav Kumar Singh To work in a globally comp...,Peoplesoft Resume
1,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,Profile Summary: 7+ years of experience in imp...,Peoplesoft Resume
2,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,PeopleSoft Database Administrator Gangareddy P...,Peoplesoft Resume
3,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,Murali Experience Summary I have 6 years of ex...,Peoplesoft Resume
4,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,"Priyanka Ramadoss 61/46, MountPleasant, Coonoo...",Peoplesoft Resume


In [16]:
df.tail()

Unnamed: 0,file_path,text,target
75,C:\Resumes_Docx\workday\Sri Krishna S_Hexaware...,Workday Integration Consultant Name : Sri Kris...,3
76,C:\Resumes_Docx\workday\Srikanth-Hexaware.docx,Seeking suitable positions in Workday HCM as T...,3
77,C:\Resumes_Docx\workday\SSKumar_Hexaware.docx,WORKDAY | HCM | FCM Name : Kumar S.S Role : Wo...,3
78,C:\Resumes_Docx\workday\Venkateswarlu B_Hexawa...,Venkateswarlu.B Workday Consultant Having 5.3 ...,3
79,C:\Resumes_Docx\workday\Vinay Kumar_Hexaware.docx,Vinay kumar .v Workday Functional Consultant E...,3


In [9]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['target']=le.fit_transform(df['target'])
df.head()

Unnamed: 0,file_path,text,target
0,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,Anubhav Kumar Singh To work in a globally comp...,0
1,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,Profile Summary: 7+ years of experience in imp...,0
2,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,PeopleSoft Database Administrator Gangareddy P...,0
3,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,Murali Experience Summary I have 6 years of ex...,0
4,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,"Priyanka Ramadoss 61/46, MountPleasant, Coonoo...",0


In [10]:
df.to_csv('data_set_csv.csv', index=False)

In [11]:
df.head(10)

Unnamed: 0,file_path,text,target
0,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,Anubhav Kumar Singh To work in a globally comp...,0
1,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,Profile Summary: 7+ years of experience in imp...,0
2,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,PeopleSoft Database Administrator Gangareddy P...,0
3,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,Murali Experience Summary I have 6 years of ex...,0
4,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,"Priyanka Ramadoss 61/46, MountPleasant, Coonoo...",0
5,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,PROFILE SUMMARY I have overall 6.8 years’ expe...,0
6,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,PEOPLESOFT Administrator SRINIVAS.K Experience...,0
7,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,PeopleSoft Admin VARKALA VIKAS Career Objectiv...,0
8,C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...,Vinod Akkala PeopleSoft DBA Admin. Professiona...,0
9,C:\Resumes_Docx\Peoplesoft Resume\PeopleSoft D...,PeopleSoft Admin/PeopleSoft DBA Ganesh Alladi ...,0


In [12]:
df.tail(10)

Unnamed: 0,file_path,text,target
70,C:\Resumes_Docx\workday\Naresh Babu Cherukuri_...,Name : Naresh Babu Cherukuri Objective: To tak...,3
71,C:\Resumes_Docx\workday\P V Sai Krishna_ Hexaw...,VENKATA SAIKRISHNA Workday Consultant PROFSSIO...,3
72,C:\Resumes_Docx\workday\Punugoti Swetha_Hexawa...,3.3 years of IT experience as Workday HCM Tech...,3
73,C:\Resumes_Docx\workday\RahulM_Hexaware.docx,Rahul (Techno Functional Consultant) Professio...,3
74,C:\Resumes_Docx\workday\RameshP_Hexaware.docx,Ramesh A (Workday HCM Consultant) PROFESSIONAL...,3
75,C:\Resumes_Docx\workday\Sri Krishna S_Hexaware...,Workday Integration Consultant Name : Sri Kris...,3
76,C:\Resumes_Docx\workday\Srikanth-Hexaware.docx,Seeking suitable positions in Workday HCM as T...,3
77,C:\Resumes_Docx\workday\SSKumar_Hexaware.docx,WORKDAY | HCM | FCM Name : Kumar S.S Role : Wo...,3
78,C:\Resumes_Docx\workday\Venkateswarlu B_Hexawa...,Venkateswarlu.B Workday Consultant Having 5.3 ...,3
79,C:\Resumes_Docx\workday\Vinay Kumar_Hexaware.docx,Vinay kumar .v Workday Functional Consultant E...,3


In [13]:
df.describe

<bound method NDFrame.describe of                                             file_path  \
0   C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...   
1   C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...   
2   C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...   
3   C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...   
4   C:\Resumes_Docx\Peoplesoft Resume\Peoplesoft A...   
..                                                ...   
75  C:\Resumes_Docx\workday\Sri Krishna S_Hexaware...   
76     C:\Resumes_Docx\workday\Srikanth-Hexaware.docx   
77      C:\Resumes_Docx\workday\SSKumar_Hexaware.docx   
78  C:\Resumes_Docx\workday\Venkateswarlu B_Hexawa...   
79  C:\Resumes_Docx\workday\Vinay Kumar_Hexaware.docx   

                                                 text  target  
0   Anubhav Kumar Singh To work in a globally comp...       0  
1   Profile Summary: 7+ years of experience in imp...       0  
2   PeopleSoft Database Administrator Gangareddy P...       0  
3   Murali Experience Sum

In [14]:
df.columns

Index(['file_path', 'text', 'target'], dtype='object')

In [15]:
df.info().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_path  80 non-null     object
 1   text       80 non-null     object
 2   target     80 non-null     int32 
dtypes: int32(1), object(2)
memory usage: 1.7+ KB


AttributeError: 'NoneType' object has no attribute 'sum'