### Job description (JD) .docx file: loading, cleaning, normalisation, preview and output

In [None]:
from docx import Document
import os
from re import sub

def getFileFormat(path) :
    """
    Return the extension of the given file path in lower case.
    The leading dot is kept (e.g. ".docx"); a path without an extension
    yields an empty string.
    args : path -> string
    return-type : string
    """
    _, extension = os.path.splitext(path)
    return extension.lower()

def loadDocx(path) :
    """
    Read a .docx file and return the text of its paragraphs as one string.
    Every paragraph is stripped of surrounding whitespace, paragraphs that
    end up empty are skipped, and the remaining ones are joined with "\\n".
    args : path -> string
    return-type : string
    raises : ValueError when the file extension is not ".docx"
    """
    # Reject anything that is not a .docx before trying to open it.
    if getFileFormat(path) != ".docx" :
        raise ValueError("Invalid file format")
    stripped = (paragraph.text.strip() for paragraph in Document(path).paragraphs)
    return "\n".join(text for text in stripped if text)

def preview(content, length=200) :
    """
    Return the leading part of the content, at most `length` characters
    long (200 when no length is given).
    args : content -> string
           length -> integer
    return-type : string
    """
    snippet = content[0:length]
    return snippet

def cleanText(content) :
    """
    Normalise the content for further processing:
    lower-case it, remove every character that is not a letter, digit,
    "/", ":" or whitespace, then collapse each whitespace run to a single
    space and trim both ends.
    args : content -> string
    return-type : string
    """
    lowered = content.lower()
    keptChars = sub(r"[^a-zA-Z0-9/:\s]", "", lowered)
    # split() with no argument cuts on whitespace runs and drops leading and
    # trailing whitespace, so re-joining with one space collapses and trims.
    return " ".join(keptChars.split())

def saveFile(content="", fileName="output", directory="files") :
    """
    Save the content as UTF-8 text in "<directory>/<fileName>.txt".
    The target directory is created when it does not exist yet; an existing
    file with the same name is overwritten.
    args : content -> string
           fileName -> string (file name without the ".txt" extension)
           directory -> string (output folder, "files" by default)
    return-type : void
    """
    # Create the output folder up front so the write does not fail on a
    # fresh checkout; an empty directory means "current directory".
    if directory :
        os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, fileName + ".txt")
    # newline="" turns off newline translation, so content is written as-is.
    with open(path, "w", encoding="utf-8", newline="") as file:
        file.write(content)
        

# Driver: load the sample job description, show a preview of the raw and the
# cleaned text, then write the cleaned text to files/output1.txt.
try :

    # Paragraph text of the sample document, one paragraph per line.
    rawContent = loadDocx("files/job_description_sample.docx")
    print(f"----------Preview for Raw content------------\n{preview(rawContent)}\n")

    # Lower-cased, punctuation-stripped, whitespace-collapsed version.
    cleanContent = cleanText(rawContent)
    print(f"--------Preview for Cleaned content------------\n{preview(cleanContent)}")

    saveFile(cleanContent, fileName="output1")
    print("Output saved successfully")
    
# Any exception raised by the steps above is caught here and printed
# instead of propagating.
except Exception as e :
    print("Error :", e)

----------Preview for Raw content------------
Job Title: Software Developer
Job Description:
We are looking for a Software Developer with strong programming fundamentals.
The candidate should be able to design, develop, and maintain applications.

--------Preview for Cleaned content------------
job title: software developer job description: we are looking for a software developer with strong programming fundamentals the candidate should be able to design develop and maintain applications ski
Output saved successfully
