In [1]:
!apt-get install tesseract-ocr
!pip install pytesseract
!pip install pdf2image
!apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 10s (507 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 121918 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-e

In [2]:
cv_path = '/content/Park Gopal.pdf'

In [3]:
"""
This script is designed to extract text from a PDF file by converting each page of the PDF into an image and then using OCR (Optical Character Recognition) to read the text from these images. It utilizes the `pdf2image` library to convert PDF pages to images and the `pytesseract` library, a Python wrapper for Google's Tesseract-OCR Engine, to perform the OCR process.
"""

from pdf2image import convert_from_path
import numpy as np
import pytesseract


def extract_data(pdf_path):
    """
    Extracts text from a PDF file by converting each page to an image and then using OCR to read the text.

    Parameters:
    - pdf_path (str): The file path of the PDF from which to extract text.

    Returns:
    - str: A string containing all the extracted text from the PDF.
    """
    # Initialize the variable to stor the extracted text
    extracted_text = ""

    # Converts the PDF to images
    pages = convert_from_path(pdf_path, dpi=300)

    # Loop through the images and extract text
    for page in pages:
        # Automatic page segmentation with OSD
        custom_config = r'--psm 1'
        # Use Tesseract to extract text from the image
        text = pytesseract.image_to_string(np.array(page), config=custom_config)

        # Append the text to the extracted_text variable
        extracted_text += text

    # Return the extracted text
    return extracted_text

In [4]:
text = extract_data(cv_path)

In [5]:
text

'PARK GOPAL\n\n(+62)81230053987 | gogopal@gqmail.com | Padang, Indonesia\n\nA third-year undergraduate student at the Faculty of Engineering, Mayani University, in Informatics\nEngineering. Has an enthusiasm for learning, with analytical and problem-solving skills. Has a strong interest\n\nin developing skills.\nWork Experience\nUI/UX Designer | Kedai Tok Aba Aug 2023 — Present\n\ne Create design interface for web applications.\n\ne Create design interface for mobile applications.\n\nFrontend Developer | SMK Maestro Jan 2021 —Apr 2023\ne Designing a landing page SMK Maestro website using HTML, CSS, and Javascript.\n\nData Intern | PT Bago Go Oct 2022 — Dec 2022\n\ne Performing data cleansing, analytics, visualization, modeling, and deployment of machine learning for\n\nrecommendation system\nEducation Level\nInformatics Engineering, Mayani University 2021 — Present\ne Current GPA: 3.96/4.00\nLanguages\nIndonesia, English, Javanese\nSkills\nHard Skill: Figma, Adobe XD, Miro, HTML, JavaS

In [6]:
"""
This script is designed to preprocessing extract text from a CV file.
"""

import re

def preprocessing(text):
    # Convert text to lowercase
    text = text.lower()

    # Strip leading/trailing whitespace
    text = text.strip()

    # Remove e
    text = re.sub(r'\be\b', '', text)

    # Remove apostrophes and double quotes
    text = text.replace("'", "")
    text = text.replace('"', '')

    # Replace newline characters, semicolons, colons, commas, and vertical bars with spaces
    text = re.sub(r'[\n;,\|]', ' ', text)

    # Replace $1 with S1
    text = re.sub(r'\$1', 'S1', text)

    # Remove "page {number}" references
    text = re.sub(r'\bpage\s\d+\b', '', text, flags=re.IGNORECASE)

    # Fix common email formatting issues
    text = re.sub(r'@gmail\s?com', '@gmail.com', text)
    text = re.sub(r'\s+@', '@', text)
    text = re.sub(r'@gmail\.co\b', '@gmail.com', text)

    # Normalize phone numbers in the +62 format with optional spaces or dashes
    text = re.sub(r'\+62\s?(\d{2,3})[ -]?(\d{3,4})[ -]?(\d{4,5})', r'+62\1\2\3', text)

    # Normalize phone numbers in the 08xx-xxxx-xxxx format and convert to +62 format
    text = re.sub(r'08(\d{2})-(\d{4})-(\d{4})', r'+628\1\2\3', text)

    # Normalize phone numbers in the (+62) xx-xxxx-xxxx format
    text = re.sub(r'\(\+62\)\s?(\d{2,3})[ -]?(\d{3,4})[ -]?(\d{4,5})', r'+62\1\2\3', text)

    # Normalize phone numbers in the (62) xx-xxxx-xxxx format
    return re.sub(r'\(62\)\s?(\d{2,3})[ -]?(\d{3,4})[ -]?(\d{4,5})', r'+62\1\2\3', text)

    # Normalize phone numbers in the +62 format with optional parentheses and spaces or dashes
    text = re.sub(r'\+62\s?(\d{2,3})[ -]?(\d{3,4})[ -]?(\d{4,5})', r'+62\1\2\3', text)

    # Convert all +62 numbers to 08 format
    text = re.sub(r'\+62(\d+)', r'08\1', text)

    # Normalize URLs with spaces to proper format
    text = re.sub(r'https\s+(\w+)\s+com', r'https://\1.com', text)

    # Normalize date ranges and formats
    text = re.sub(r'(\d{4})\s*[~=_]\s*(present)', r'\1 - \2', text)
    text = re.sub(r'(\d{4})\s*[~=_]\s*(\w+)\s*(\d{4})?', r'\1 - \2 \3', text)
    text = re.sub(r'(\d{4})\s*[_=]\s*(\w+)', r'\1 - \2', text)

    # Split text into words
    words = text.split()

    # Join words back into a single string
    text = ' '.join(words)

    return text

In [7]:
clean_text = preprocessing(text)

In [8]:
clean_text

'park gopal  +6281230053987   gogopal@gqmail.com   padang  indonesia  a third-year undergraduate student at the faculty of engineering  mayani university  in informatics engineering. has an enthusiasm for learning  with analytical and problem-solving skills. has a strong interest  in developing skills. work experience ui/ux designer   kedai tok aba aug 2023 — present   create design interface for web applications.   create design interface for mobile applications.  frontend developer   smk maestro jan 2021 —apr 2023  designing a landing page smk maestro website using html  css  and javascript.  data intern   pt bago go oct 2022 — dec 2022   performing data cleansing  analytics  visualization  modeling  and deployment of machine learning for  recommendation system education level informatics engineering  mayani university 2021 — present  current gpa: 3.96/4.00 languages indonesia  english  javanese skills hard skill: figma  adobe xd  miro  html  javascript  css  nest.js  python soft ski

In [9]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.2.3-py3-none-any.whl (974 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/974.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/974.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m583.7/974.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/974.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.0/974.0 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_core-0.2.5-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.7/314.7 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-sp

In [10]:
import getpass
import os
#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
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()
os.environ["HF_TOKEN"] = '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'

··········


In [11]:
!pip install groq

Collecting groq
  Downloading groq-0.8.0-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.4/105.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from groq)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->groq)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->groq)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, groq
Successfully installed groq-0.8.0 h11-0.14.0 httpcore-1.0.5 http

In [19]:
import os
import json
from groq import Groq

def summarize_cv(cv_text):
    # Example JSON template
    json_template = """
    {
        "basic_info": {
            "name": "John Doe",
            "email": "john@gmail.com",
            "phone_number": "082291293453",
            "location": "Jember, Indonesia"
        },
        "work_experience": [
            {
                "job_title": "Data Science",
                "company": "PT. Teknologi Merdeka",
                "location": "Surabaya",
                "start_date": "Dec 2022",
                "end_date": "Jan 2024",
                "job_desc": "Create a model that could predict the weather. Visualize the data. Making presentation for every weeks"
            }
        ],
        "education": [
            {
                "title": "Accounting",
                "institute": "Universitas Jayanegara",
                "location": "Medan",
                "start_date": "Aug 2021",
                "end_date": "Aug 2023",
                "description": "GPA 3.46/4.00"
            }
        ],
        "languages": ["English", "Indonesia", "Sundanese"],
        "skills": ["Python", "Tensorflow", "Communication", "Presentation"],
        "certification": [
            {
                "title": "Learn Data Analytics",
                "issuer": "Gogo",
                "start_date": "2020",
                "expiration_date": "2025"
            }
        ]
    }
    """

    # Initialize the Groq client with your API key
    client = Groq(
        api_key="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",
    )

    # Construct the prompt using the CV text and JSON template
    prompt = f"Summarize this CV:\n{cv_text}\nby following a template like:\n{json_template}. If you don't know just type '-', don't make up an answer"

    # Create the chat completion request
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-8b-8192",
    )

    # Get the response
    response = chat_completion.choices[0].message.content

    try:
        # Attempt to find the JSON object within the output
        json_start = response.find("{")
        json_end = response.rfind("}") + 1
        json_str = response[json_start:json_end]

        # Try to parse the cleaned JSON string
        json_data = json.loads(json_str)
        return json_data
    except json.JSONDecodeError as e:
        print("Failed to parse JSON:", e)
        print("Raw output:", response)
        return None

In [29]:
json_output = summarize_cv(clean_text)

In [30]:
json_output

{'basic_info': {'name': 'Park Gopal',
  'email': 'gogopal@gqmail.com',
  'phone_number': '+6281230053987',
  'location': 'Padang, Indonesia'},
 'work_experience': [{'job_title': 'UI/UX Designer',
   'company': 'Kedai Tok ABA',
   'location': '',
   'start_date': 'Aug 2023',
   'end_date': 'present',
   'job_desc': 'Create design interface for web applications and mobile applications'},
  {'job_title': 'Frontend Developer',
   'company': 'SMK Maestro',
   'location': '',
   'start_date': 'Jan 2021',
   'end_date': 'Apr 2023',
   'job_desc': 'Designed a landing page for SMK Maestro website using HTML, CSS, and JavaScript'},
  {'job_title': 'Data Intern',
   'company': 'PT. Bago Go',
   'location': '',
   'start_date': 'Oct 2022',
   'end_date': 'Dec 2022',
   'job_desc': 'Performed data cleansing, analytics, visualization, modeling, and deployment of machine learning for recommendation system'}],
 'education': [{'title': 'Informatics Engineering',
   'institute': 'Mayani University',
   

In [21]:
print(json.dumps(result, indent=4))

{
    "basic_info": {
        "name": "Park Gopal",
        "email": "gogopal@gqmail.com",
        "phone_number": "+6281230053987",
        "location": "Padang, Indonesia"
    },
    "work_experience": [
        {
            "job_title": "UI/UX Designer",
            "company": "Kedai Tok ABA",
            "location": "",
            "start_date": "Aug 2023",
            "end_date": "Present",
            "job_desc": "Create design interface for web and mobile applications"
        },
        {
            "job_title": "Frontend Developer",
            "company": "SMK Maestro",
            "location": "",
            "start_date": "Jan 2021",
            "end_date": "Apr 2023",
            "job_desc": "Designed a landing page for the SMK Maestro website using HTML, CSS, and JavaScript"
        },
        {
            "job_title": "Data Intern",
            "company": "PT Bago Go",
            "location": "",
            "start_date": "Oct 2022",
            "end_date": "Dec 2022",
 

In [22]:
pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40702 sha256=b1b9987e40503e0008fd02774735dfb355f4ef17650558aa9dfc1241e91d2cab
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [23]:
import io

In [38]:
"""
This script is designed to convert the CV JSON into a readable string.
"""

def summarize(json_output):
    """
    Summarizes the CV information provided in the JSON format and formats it into a readable string.

    Parameters:
    - json_output (dict): A dictionary containing CV information. The dictionary should include the following keys:
        - basic_info (dict): Basic information about the individual.
            - name (str): The individual's name.
            - email (str): The individual's email address.
            - phone_number (str): The individual's phone number.
            - location (str): The individual's location.
        - work_experience (list): A list of dictionaries containing work experience information.
            - job_title (str): The job title.
            - company (str): The company name.
            - location (str): The location of the company.
            - start_date (str): The start date of the job.
            - end_date (str): The end date of the job.
            - job_summary (str): A summary of the job responsibilities.
        - education (list): A list of dictionaries containing education information.
            - title (str): The title of the degree or certification.
            - institute (str): The name of the educational institution.
            - location (str): The location of the institution.
            - start_date (str): The start date of the education.
            - end_date (str): The end date of the education.
            - description (str): A description or additional information about the education.
        - languages (list): A list of languages known by the individual.
        - skills (list): A list of skills possessed by the individual.
        - certifications (list): A list of dictionaries containing certification information.
            - title (str): The title of the certification.
            - issuer (str): The organization that issued the certification.
            - start_date (str): The issue date of the certification.
            - expiration_date (str): The expiration date of the certification, if applicable.

    Returns:
    - str: A formatted string containing the summarized CV information.
    """

    buffer = io.StringIO()
    # Accessing basic information
    name = json_output['basic_info']['name']
    email = json_output['basic_info']['email']
    phone_number = json_output['basic_info']['phone_number']
    location = json_output['basic_info']['location']

    buffer.write(f"Personal Information\n")
    buffer.write(f"Name: {name}\n")
    buffer.write(f"Email: {email}\n")
    buffer.write(f"Phone Number: {phone_number}\n")
    buffer.write(f"Location: {location}\n")

    # Accessing work experience
    buffer.write("\nWork Experience:\n")
    for work in json_output['work_experience']:
        job_title = work['job_title']
        company = work['company']
        work_location = work['location']
        start_date = work['start_date']
        end_date = work['end_date']
        job_summary = work['job_desc']

        buffer.write(f"{job_title} | {company}\n")
        buffer.write(f"Location: {work_location}\n")
        buffer.write(f"Duration: {start_date} - {end_date}\n")
        buffer.write(f"Job Summary: {job_summary}\n")

    # Accessing education
    buffer.write("\nEducation:\n")
    for education in json_output['education']:
        title = education['title']
        institute = education['institute']
        edu_location = education['location']
        edu_start_date = education['start_date']
        edu_end_date = education['end_date']
        description = education['description']

        buffer.write(f"{title} | {institute}\n")
        buffer.write(f"Location: {edu_location}\n")
        buffer.write(f"Duration: {edu_start_date} - {edu_end_date}\n")
        buffer.write(f"Description: {description}\n")

    # Accessing languages
    languages = json_output['languages']
    buffer.write("\nLanguages:\n")
    buffer.write(f"{', '.join(languages)}\n")

    # Accessing skills
    skills = json_output['skills']
    buffer.write("\nSkills:\n")
    buffer.write(f"{', '.join(skills)}\n")

    # Accessing certifications
    buffer.write("\nCertifications:\n")
    for cert in json_output['certification']:
        cert_title = cert['title']
        issuer = cert['issuer']
        cert_start_date = cert['start_date']
        expiration_date = cert['expiration_date']

        buffer.write(f"{cert_title} | {issuer}\n")
        buffer.write(f"Issued date: {cert_start_date} - {expiration_date}\n")

    # Get the formatted string
    formatted_cv = buffer.getvalue()
    buffer.close()
    return formatted_cv

In [39]:
"""
This script is designed to convert the string into a PDF file.
"""

from fpdf import FPDF

def string_to_pdf(text, filename):
    """
    Converts a string of text into a PDF file.

    Parameters:
    - text (str): The text content to be included in the PDF.
    - filename (str): The name of the output PDF file.

    Raises:
    - ValueError: If the text parameter is None or empty.
    """

    if not text:
        raise ValueError("The text parameter cannot be None or empty")

    # Create instance of FPDF class
    pdf = FPDF()

    # Add a page
    pdf.add_page()

    # Set font
    pdf.set_font("Arial", size=11)

    # Add a multi-line cell
    pdf.multi_cell(0, 7, text)

    # Save the PDF with the specified filename
    pdf.output(filename)

In [40]:
cv_summary = summarize(json_output)

In [41]:
string_to_pdf(cv_summary, "summary_gopal.pdf")