In [1]:
!apt-get install tesseract-ocr
!pip install pytesseract
!pip install pdf2image
!apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 11s (442 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 121918 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-e

In [3]:
cv_path = '/content/Park Gopal.pdf'

In [4]:
"""
This script is designed to extract text from a PDF file by converting each page of the PDF into an image and then using OCR (Optical Character Recognition) to read the text from these images. It utilizes the `pdf2image` library to convert PDF pages to images and the `pytesseract` library, a Python wrapper for Google's Tesseract-OCR Engine, to perform the OCR process.
"""

from pdf2image import convert_from_path
import numpy as np
import pytesseract


def extract_data(pdf_path):
    """
    Extracts text from a PDF file by converting each page to an image and then using OCR to read the text.

    Parameters:
    - pdf_path (str): The file path of the PDF from which to extract text.

    Returns:
    - str: A string containing all the extracted text from the PDF.
    """
    # Initialize the variable to stor the extracted text
    extracted_text = ""

    # Converts the PDF to images
    pages = convert_from_path(pdf_path, dpi=300)

    # Loop through the images and extract text
    for page in pages:
        # Automatic page segmentation with OSD
        custom_config = r'--psm 1'
        # Use Tesseract to extract text from the image
        text = pytesseract.image_to_string(np.array(page), config=custom_config)

        # Append the text to the extracted_text variable
        extracted_text += text

    # Return the extracted text
    return extracted_text

In [5]:
text = extract_data(cv_path)

In [6]:
text

'PARK GOPAL\n\n(+62)81230053987 | gogopal@gqmail.com | Padang, Indonesia\n\nA third-year undergraduate student at the Faculty of Engineering, Mayani University, in Informatics\nEngineering. Has an enthusiasm for learning, with analytical and problem-solving skills. Has a strong interest\n\nin developing skills.\nWork Experience\nUI/UX Designer | Kedai Tok Aba Aug 2023 — Present\n\ne Create design interface for web applications.\n\ne Create design interface for mobile applications.\n\nFrontend Developer | SMK Maestro Jan 2021 —Apr 2023\ne Designing a landing page SMK Maestro website using HTML, CSS, and Javascript.\n\nData Intern | PT Bago Go Oct 2022 — Dec 2022\n\ne Performing data cleansing, analytics, visualization, modeling, and deployment of machine learning for\n\nrecommendation system\nEducation Level\nInformatics Engineering, Mayani University 2021 — Present\ne Current GPA: 3.96/4.00\nLanguages\nIndonesia, English, Javanese\nSkills\nHard Skill: Figma, Adobe XD, Miro, HTML, JavaS

In [7]:
"""
This script is designed to preprocessing extract text from a CV file.
"""

import re

def preprocessing(text):
    # Convert text to lowercase
    text = text.lower()

    # Strip leading/trailing whitespace
    text = text.strip()

    # Remove e
    text = re.sub(r'\be\b', '', text)

    # Remove apostrophes and double quotes
    text = text.replace("'", "")
    text = text.replace('"', '')

    # Replace newline characters, semicolons, colons, commas, and vertical bars with spaces
    text = re.sub(r'[\n;,\|]', ' ', text)

    # Replace $1 with S1
    text = re.sub(r'\$1', 'S1', text)

    # Remove "page {number}" references
    text = re.sub(r'\bpage\s\d+\b', '', text, flags=re.IGNORECASE)

    # Fix common email formatting issues
    text = re.sub(r'@gmail\s?com', '@gmail.com', text)
    text = re.sub(r'\s+@', '@', text)
    text = re.sub(r'@gmail\.co\b', '@gmail.com', text)

    # Normalize phone numbers in the +62 format with optional spaces or dashes
    text = re.sub(r'\+62\s?(\d{2,3})[ -]?(\d{3,4})[ -]?(\d{4,5})', r'+62\1\2\3', text)

    # Normalize phone numbers in the 08xx-xxxx-xxxx format and convert to +62 format
    text = re.sub(r'08(\d{2})-(\d{4})-(\d{4})', r'+628\1\2\3', text)

    # Normalize phone numbers in the (+62) xx-xxxx-xxxx format
    text = re.sub(r'\(\+62\)\s?(\d{2,3})[ -]?(\d{3,4})[ -]?(\d{4,5})', r'+62\1\2\3', text)

    # Normalize phone numbers in the (62) xx-xxxx-xxxx format
    return re.sub(r'\(62\)\s?(\d{2,3})[ -]?(\d{3,4})[ -]?(\d{4,5})', r'+62\1\2\3', text)

    # Normalize phone numbers in the +62 format with optional parentheses and spaces or dashes
    text = re.sub(r'\+62\s?(\d{2,3})[ -]?(\d{3,4})[ -]?(\d{4,5})', r'+62\1\2\3', text)

    # Convert all +62 numbers to 08 format
    text = re.sub(r'\+62(\d+)', r'08\1', text)

    # Normalize URLs with spaces to proper format
    text = re.sub(r'https\s+(\w+)\s+com', r'https://\1.com', text)

    # Normalize date ranges and formats
    text = re.sub(r'(\d{4})\s*[~=_]\s*(present)', r'\1 - \2', text)
    text = re.sub(r'(\d{4})\s*[~=_]\s*(\w+)\s*(\d{4})?', r'\1 - \2 \3', text)
    text = re.sub(r'(\d{4})\s*[_=]\s*(\w+)', r'\1 - \2', text)

    # Split text into words
    words = text.split()

    # Join words back into a single string
    text = ' '.join(words)

    return text

In [8]:
clean_text = preprocessing(text)

In [9]:
clean_text

'park gopal  +6281230053987   gogopal@gqmail.com   padang  indonesia  a third-year undergraduate student at the faculty of engineering  mayani university  in informatics engineering. has an enthusiasm for learning  with analytical and problem-solving skills. has a strong interest  in developing skills. work experience ui/ux designer   kedai tok aba aug 2023 — present   create design interface for web applications.   create design interface for mobile applications.  frontend developer   smk maestro jan 2021 —apr 2023  designing a landing page smk maestro website using html  css  and javascript.  data intern   pt bago go oct 2022 — dec 2022   performing data cleansing  analytics  visualization  modeling  and deployment of machine learning for  recommendation system education level informatics engineering  mayani university 2021 — present  current gpa: 3.96/4.00 languages indonesia  english  javanese skills hard skill: figma  adobe xd  miro  html  javascript  css  nest.js  python soft ski

In [10]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.2.3-py3-none-any.whl (974 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.0/974.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_core-0.2.5-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.7/314.7 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.1-py3-none-any.whl (23 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.75-py3-none-any.whl (124 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.9/124.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.0->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting packaging<24.0,>=23.2 (from langchain

In [11]:
import getpass
import os
#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
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()
os.environ["HF_TOKEN"] = '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'

··········


In [12]:
!pip install groq

Collecting groq
  Downloading groq-0.8.0-py3-none-any.whl (105 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/105.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.4/105.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from groq)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/75.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->groq)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->groq)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90

In [38]:
import os
import json
from groq import Groq

def summarize_cv(cv_text):
    # Example JSON template
    json_template = """
    {
        "basic_info": {
            "name": "John Doe",
            "email": "john@gmail.com",
            "phone_number": "082291293453",
            "location": "Jember, Indonesia"
        },
        "work_experience": [
            {
                "job_title": "Data Science",
                "company": "PT. Teknologi Merdeka",
                "location": "Surabaya",
                "start_date": "Dec 2022",
                "end_date": "Jan 2024",
                "job_desc": ["Job_desc_1", "Job_desc_2", "Job_desc_3"]
            }
        ],
        "education": [
            {
                "title": "Accounting",
                "institute": "Universitas Jayanegara",
                "location": "Medan",
                "start_date": "Aug 2021",
                "end_date": "Aug 2023",
                "description": "GPA 3.46/4.00"
            }
        ],
        "languages": ["English", "Indonesia", "Sundanese"],
        "skills": ["Python", "Tensorflow", "Communication", "Presentation"],
        "certification": [
            {
                "title": "Learn Data Analytics",
                "issuer": "Gogo",
                "start_date": "2020",
                "expiration_date": "2025"
            }
        ]
    }
    """

    # Initialize the Groq client with your API key
    client = Groq(
        api_key="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",
    )

    # Construct the prompt using the CV text and JSON template
    prompt = f"Summarize this CV:\n{cv_text}\nby following a template like:\n{json_template}. If you don't know just type 'Unknown', don't make up an answer"

    # Create the chat completion request
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-8b-8192",
    )

    # Get the response
    response = chat_completion.choices[0].message.content

    try:
        # Attempt to find the JSON object within the output
        json_start = response.find("{")
        json_end = response.rfind("}") + 1
        json_str = response[json_start:json_end]

        # Try to parse the cleaned JSON string
        json_data = json.loads(json_str)
        return json_data
    except json.JSONDecodeError as e:
        print("Failed to parse JSON:", e)
        print("Raw output:", response)
        return None

In [39]:
json_output = summarize_cv(clean_text)

In [47]:
json_output

{'basic_info': {'name': 'Park Gopal',
  'email': 'gogopal@gqmail.com',
  'phone_number': '+6281230053987',
  'location': 'Padang, Indonesia'},
 'work_experience': [{'job_title': 'UI/UX Designer',
   'company': 'Kedai Tok Aba',
   'location': 'Unknown',
   'start_date': 'Aug 2023',
   'end_date': 'Present',
   'job_desc': ['Create design interface for web applications',
    'Create design interface for mobile applications']},
  {'job_title': 'Frontend Developer',
   'company': 'SMK Maestro',
   'location': 'Unknown',
   'start_date': 'Jan 2021',
   'end_date': 'Apr 2023',
   'job_desc': ['Designing a landing page SMK Maestro website using HTML, CSS, and JavaScript']},
  {'job_title': 'Data Intern',
   'company': 'Pt. Bago Go',
   'location': 'Unknown',
   'start_date': 'Oct 2022',
   'end_date': 'Dec 2022',
   'job_desc': ['Performing data cleansing, analytics, visualization, modeling, and deployment of machine learning for recommendation system']}],
 'education': [{'title': 'Bachelor o

In [41]:
print(json.dumps(json_output, indent=4))

{
    "basic_info": {
        "name": "Park Gopal",
        "email": "gogopal@gqmail.com",
        "phone_number": "+6281230053987",
        "location": "Padang, Indonesia"
    },
    "work_experience": [
        {
            "job_title": "UI/UX Designer",
            "company": "Kedai Tok Aba",
            "location": "Unknown",
            "start_date": "Aug 2023",
            "end_date": "Present",
            "job_desc": [
                "Create design interface for web applications",
                "Create design interface for mobile applications"
            ]
        },
        {
            "job_title": "Frontend Developer",
            "company": "SMK Maestro",
            "location": "Unknown",
            "start_date": "Jan 2021",
            "end_date": "Apr 2023",
            "job_desc": [
                "Designing a landing page SMK Maestro website using HTML, CSS, and JavaScript"
            ]
        },
        {
            "job_title": "Data Intern",
            "

In [31]:
pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40702 sha256=9574283949247138c7115106879221f14f4a49843876be779837a6f5c5141aad
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [32]:
import io

In [145]:
def summarize_to_pdf(json_output, pdf_path):
    class PDF(FPDF):
        def header(self):
            self.set_font('Times', 'B', 14)
            self.cell(0, 7, 'CV Summary', 0, 1, 'C')
            self.ln(5)

        def footer(self):
            self.set_y(-15)
            self.set_font('Times', '', 10)
            self.cell(0, 7, f'{self.page_no()}', 0, 0, 'C')

        def add_section_title(self, title):
            self.set_font('Times', 'BU', 12)
            self.cell(0, 5, title, 0, 1)
            self.ln(3)

        def add_section_subtitle(self, subtitle):
            self.set_font('Times', 'B', 12)
            self.cell(0, 5, subtitle, 0, 1)
            self.ln(2)

        def add_text(self, text):
            self.set_font('Times', '', 12)
            self.multi_cell(0, 5, text)
            self.ln(2)

    pdf = PDF()
    pdf.add_page()

    # Personal Information
    pdf.add_section_title("Personal Information")
    basic_info = json_output['basic_info']
    pdf.add_text(f"Name: {basic_info['name']}")
    pdf.add_text(f"Email: {basic_info['email']}")
    pdf.add_text(f"Phone Number: {basic_info['phone_number']}")
    pdf.add_text(f"Location: {basic_info['location']}")
    pdf.ln(3)

    # Work Experience
    pdf.add_section_title("Work Experience")
    for work in json_output['work_experience']:
      pdf.add_section_subtitle(f"{work['job_title']} | {work['company']}")
      pdf.add_text(f"Location: {work['location']}")
      pdf.add_text(f"Duration: {work['start_date']} - {work['end_date']}")

      # Modified Job Summary formatting
      pdf.add_text("Job Summary:")  # Decreased spacing between lines
      for job_desc in work['job_desc']:
          pdf.cell(5, 5, "-", 0, 0)  # Adjusted spacing for the bullet point
          pdf.multi_cell(0, 6, job_desc)  # Multi cell to allow wrapping text
      pdf.ln(3)  # Increased spacing between job experiences
    pdf.ln(3)

    # Education
    pdf.add_section_title("Education")
    for education in json_output['education']:
        pdf.add_section_subtitle(f"{education['title']} | {education['institute']}")
        pdf.add_text(f"Location: {education['location']}")
        pdf.add_text(f"Duration: {education['start_date']} - {education['end_date']}")
        pdf.add_text(f"Description: {education['description']}")
    pdf.ln(3)

    # Languages
    pdf.add_section_title("Languages")
    pdf.add_text(', '.join(json_output['languages']))
    pdf.ln(3)

    # Skills
    pdf.add_section_title("Skills")
    pdf.add_text(', '.join(json_output['skills']))
    pdf.ln(3)

    # Certifications
    pdf.add_section_title("Certifications")
    for cert in json_output['certification']:
        pdf.add_section_subtitle(f"{cert['title']} | {cert['issuer']}")
        pdf.add_text(f"Issued date: {cert['start_date']} - {cert['expiration_date']}")

    # Save PDF to file
    pdf.output(pdf_path)

In [146]:
summarize_to_pdf(json_output, 'cv_summary.pdf')