In [None]:
!apt-get install tesseract-ocr
!pip install pytesseract
!pip install pdf2image
!apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 3s (1,598 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 121918 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

In [None]:
cv_path = 'https://storage.googleapis.com/dicoding-capstone-bucket/cv.pdf'
file_name = cv_path.split("/")[-1]

In [None]:
from google.cloud import storage
from pdf2image import convert_from_bytes
import numpy as np
import pytesseract
import io

def download_pdf_from_storage(bucket_name, file_name):
    """
    Downloads a PDF file from Google Cloud Storage.

    Parameters:
    - bucket_name (str): The name of the GCS bucket.
    - file_name (str): The name of the PDF file within the GCS bucket.

    Returns:
    - bytes: The content of the PDF file.
    """
    # Initialize the GCS client
    storage_client = storage.Client()

    # Get the bucket
    bucket = storage_client.bucket(bucket_name)

    # Get the blob (PDF file)
    blob = bucket.blob(file_name)

    # Download the PDF content as bytes
    pdf_bytes = blob.download_as_bytes()

    return pdf_bytes

def extract_data_from_pdf(pdf_bytes):
    """
    Extracts text from a PDF file by converting each page to an image and then using OCR to read the text.

    Parameters:
    - pdf_bytes (bytes): The byte content of the PDF from which to extract text.

    Returns:
    - str: A string containing all the extracted text from the PDF.
    """
    # Initialize the variable to store the extracted text
    extracted_text = ""

    # Convert the PDF bytes to images
    pages = convert_from_bytes(pdf_bytes, dpi=300)

    # Loop through the images and extract text
    for page in pages:
        # Automatic page segmentation with OSD
        custom_config = r'--psm 1'
        # Use Tesseract to extract text from the image
        text = pytesseract.image_to_string(np.array(page), config=custom_config)

        # Append the text to the extracted_text variable
        extracted_text += text

    # Return the extracted text
    return extracted_text

In [None]:
# Unduh file PDF dari penyimpanan cloud
pdf_bytes = download_pdf_from_storage('dicoding-capstone-bucket', file_name)

# Ekstrak teks dari file PDF yang diunduh
extracted_text = extract_data_from_pdf(pdf_bytes)

In [None]:
extracted_text

'+1 (970) 333-3833\nmatthew.eliot@mail.com\n\nhttps:\n\nlinkedin.com/mattheweliot\n\nMATTHEW ELIOT\n\n \n\nSummary\n\n \n\nSenior Web Developer specializing in front end development. Experienced with all stages of the\ndevelopment cycle for dynamic web projects. Well-versed in numerous programming\nlanguages including HTML5, PHP OOP, JavaScript, CSS, MySQL. Strong background in project\nmanagement and customer relations.\n\nSkill Highlights\n\n \n\ne Project management e Creative design\n\ne Strong decision maker e Innovative\n\ne Complex problem solver e Service-focused\nExperience\n\n \n\nWeb Developer - 09/2015 to 05/2019\nLuna Web Design, New York\n\nCooperate with designers to create clean interfaces and simple, intuitive interactions\nand experiences.\n\nDevelop project concepts and maintain optimal workflow.\n\nWork with senior developer to manage large, complex design projects for corporate\nclients.\n\nComplete detailed programming and development tasks for front end public an

In [None]:
"""
This script is designed to preprocessing extract text from a CV file.
"""

import re

def preprocessing(text):
    # Convert text to lowercase
    text = text.lower()

    # Strip leading/trailing whitespace
    text = text.strip()

    # Remove e
    text = re.sub(r'\be\b', '', text)

    # Remove apostrophes and double quotes
    text = text.replace("'", "")
    text = text.replace('"', '')

    # Replace newline characters, semicolons, colons, commas, and vertical bars with spaces
    text = re.sub(r'[\n;,\|]', ' ', text)

    # Replace $1 with S1
    text = re.sub(r'\$1', 'S1', text)

    # Remove "page {number}" references
    text = re.sub(r'\bpage\s\d+\b', '', text, flags=re.IGNORECASE)

    # Fix common email formatting issues
    text = re.sub(r'@gmail\s?com', '@gmail.com', text)
    text = re.sub(r'\s+@', '@', text)
    text = re.sub(r'@gmail\.co\b', '@gmail.com', text)

    # Normalize phone numbers in the +62 format with optional spaces or dashes
    text = re.sub(r'\+62\s?(\d{2,3})[ -]?(\d{3,4})[ -]?(\d{4,5})', r'+62\1\2\3', text)

    # Normalize phone numbers in the 08xx-xxxx-xxxx format and convert to +62 format
    text = re.sub(r'08(\d{2})-(\d{4})-(\d{4})', r'+628\1\2\3', text)

    # Normalize phone numbers in the (+62) xx-xxxx-xxxx format
    text = re.sub(r'\(\+62\)\s?(\d{2,3})[ -]?(\d{3,4})[ -]?(\d{4,5})', r'+62\1\2\3', text)

    # Normalize phone numbers in the (62) xx-xxxx-xxxx format
    return re.sub(r'\(62\)\s?(\d{2,3})[ -]?(\d{3,4})[ -]?(\d{4,5})', r'+62\1\2\3', text)

    # Normalize phone numbers in the +62 format with optional parentheses and spaces or dashes
    text = re.sub(r'\+62\s?(\d{2,3})[ -]?(\d{3,4})[ -]?(\d{4,5})', r'+62\1\2\3', text)

    # Convert all +62 numbers to 08 format
    text = re.sub(r'\+62(\d+)', r'08\1', text)

    # Normalize URLs with spaces to proper format
    text = re.sub(r'https\s+(\w+)\s+com', r'https://\1.com', text)

    # Normalize date ranges and formats
    text = re.sub(r'(\d{4})\s*[~=_]\s*(present)', r'\1 - \2', text)
    text = re.sub(r'(\d{4})\s*[~=_]\s*(\w+)\s*(\d{4})?', r'\1 - \2 \3', text)
    text = re.sub(r'(\d{4})\s*[_=]\s*(\w+)', r'\1 - \2', text)

    # Split text into words
    words = text.split()

    # Join words back into a single string
    text = ' '.join(words)

    return text

In [None]:
clean_text = preprocessing(extracted_text)

In [None]:
clean_text

'+1 (970) 333-3833 matthew.eliot@mail.com  https:  linkedin.com/mattheweliot  matthew eliot     summary     senior web developer specializing in front end development. experienced with all stages of the development cycle for dynamic web projects. well-versed in numerous programming languages including html5  php oop  javascript  css  mysql. strong background in project management and customer relations.  skill highlights      project management  creative design   strong decision maker  innovative   complex problem solver  service-focused experience     web developer - 09/2015 to 05/2019 luna web design  new york  cooperate with designers to create clean interfaces and simple  intuitive interactions and experiences.  develop project concepts and maintain optimal workflow.  work with senior developer to manage large  complex design projects for corporate clients.  complete detailed programming and development tasks for front end public and internal websites as well as challenging back-en

In [None]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.2.3-py3-none-any.whl (974 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/974.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/974.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/974.0 kB[0m [31m17.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.0/974.0 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_core-0.2.5-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.7/314.7 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.1-py3-none-any.whl (23 kB)
Collecting langsmith<0.2.0,>=

In [None]:
import getpass
import os
#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
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()
os.environ["HF_TOKEN"] = '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'

··········


In [None]:
!pip install groq

Collecting groq
  Downloading groq-0.8.0-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.4/105.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from groq)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->groq)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->groq)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: h11, httpcore, httpx, groq
Successfully installed groq-0.8.0 h11-0.14.0 httpcore-1.0.

In [51]:
import os
import json
from groq import Groq

def summarize_cv(cv_text):
    # Example JSON template
    json_template = """
    {
      "basic_info": {
          "name": "candidate full name",
          "email": "candidate email address",
          "phone_number": "candidate phone number",
          "location": "candidate location"
      },
      "work_experience": [
          {
              "job_title": "candidate's job title in the company worked at",
              "company": "name of company worked at",
              "location": "location of company worked at",
              "start_date": "start date in the company worked at",
              "end_date": "end date in the company worked at",
              "job_desc": ["candidate's job description 1", "candidate's job description 2", "candidate's job description 3"]
          }
      ],
      "education": [
          {
              "title": "candidate education title",
              "institute": "candidate education institute",
              "location": "location of candidate education institute",
              "start_date": "start date of candidate education",
              "end_date": "end date of candidate education",
              "description": "candidate education description"
          }
      ],
      "languages": ["language the candidate speaks 1", "language the candidate speaks 2", "language the candidate speaks 3"],
      "skills": ["hard skills, soft skills, and tools mastered by the candidate 1", "hard skills, soft skills, and tools mastered by the candidate 2", "hard skills, soft skills, and tools mastered by the candidate 3", "hard skills, soft skills, and tools mastered by the candidate 4"],
      "certification": [
          {
              "title": "candidate certification title",
              "issuer": "candidate certification issuer",
              "start_date": "start date of candidate certification",
              "expiration_date": "end date of candidate certification"
          }
      ]
    }
    """

    # Initialize the Groq client with your API key
    client = Groq(
        api_key="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",
    )

    # Construct the prompt using the CV text and JSON template
    prompt = f"Summarize your CV:\n{cv_text}\following a template like:\n{json_template}. Keep the JSON keys, so if there is a JSON key that doesn't have a value fill in 'null'. If there is info you don't know, just type 'Unknown', don't create answers outside the document."

    # Create the chat completion request
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-8b-8192",
    )

    # Get the response
    response = chat_completion.choices[0].message.content

    try:
        # Attempt to find the JSON object within the output
        json_start = response.find("{")
        json_end = response.rfind("}") + 1
        json_str = response[json_start:json_end]

        # Try to parse the cleaned JSON string
        json_data = json.loads(json_str)
        return json_data
    except json.JSONDecodeError as e:
        print("Failed to parse JSON:", e)
        print("Raw output:", response)
        return None

In [52]:
json_output = summarize_cv(clean_text)

In [53]:
json_output

{'basic_info': {'name': 'Matthew Eliot',
  'email': 'matthew.eliot@mail.com',
  'phone_number': '+1 (970) 333-3833',
  'location': 'Unknown'},
 'work_experience': [{'job_title': 'Web Developer',
   'company': 'Luna Web Design',
   'location': 'New York',
   'start_date': '09/2015',
   'end_date': '05/2019',
   'job_desc': ['Cooperated with designers to create clean interfaces and simple intuitive interactions',
    'Developed project concepts and maintained optimal workflow',
    'Completed detailed programming and development tasks for front-end and back-end development']}],
 'education': [{'title': 'Bachelor of Science: Computer Information Systems',
   'institute': 'Columbia University',
   'location': 'New York',
   'start_date': '2014',
   'end_date': '2014',
   'description': None}],
 'languages': ['Unknown'],
 'skills': ['HTML5',
  'PHP',
  'OO',
  'JavaScript',
  'CSS',
  'MySQL',
  'Project Management',
  'Creative Design',
  'Strong Decision Making',
  'Innovative',
  'Comple

In [None]:
print(json.dumps(json_output, indent=4))

{
    "basic_info": {
        "name": "Matthew Eliot",
        "email": "matthew.eliot@mail.com",
        "phone_number": "+1 (970) 333-3833",
        "location": "Unknown"
    },
    "work_experience": [
        {
            "job_title": "Web Developer",
            "company": "Luna Web Design",
            "location": "New York",
            "start_date": "09/2015",
            "end_date": "05/2019",
            "job_desc": [
                "Created clean interfaces and simple, intuitive interactions and experiences"
            ]
        }
    ],
    "education": [
        {
            "title": "Bachelor of Science: Computer Information Systems",
            "institute": "Columbia University",
            "location": "NY",
            "start_date": "2014",
            "end_date": "2014",
            "description": "Null"
        }
    ],
    "languages": [
        "HTML5",
        "PHP",
        "OOP",
        "JavaScript",
        "CSS",
        "MySQL"
    ],
    "skills": [
  

In [None]:
!pip install fpdf



In [None]:
import io

In [None]:
from fpdf import FPDF
from google.cloud import storage
import os

def summarize_to_pdf(json_output, file_name):
    class PDF(FPDF):
        def header(self):
            self.set_font('Times', 'B', 14)
            self.cell(0, 7, 'CV Summary', 0, 1, 'C')
            self.ln(5)

        def footer(self):
            self.set_y(-15)
            self.set_font('Times', '', 10)
            self.cell(0, 7, f'{self.page_no()}', 0, 0, 'C')

        def add_section_title(self, title):
            self.set_font('Times', 'BU', 12)
            self.cell(0, 5, title, 0, 1)
            self.ln(3)

        def add_section_subtitle(self, subtitle):
            self.set_font('Times', 'B', 12)
            self.cell(0, 5, subtitle, 0, 1)
            self.ln(2)

        def add_text(self, text):
            self.set_font('Times', '', 12)
            self.multi_cell(0, 5, text)
            self.ln(2)

    pdf = PDF()
    pdf.add_page()

    # Personal Information
    pdf.add_section_title("Personal Information")
    basic_info = json_output['basic_info']
    pdf.add_text(f"Name: {basic_info['name']}")
    pdf.add_text(f"Email: {basic_info['email']}")
    pdf.add_text(f"Phone Number: {basic_info['phone_number']}")
    pdf.add_text(f"Location: {basic_info['location']}")
    pdf.ln(3)

    # Work Experience
    pdf.add_section_title("Work Experience")
    for work in json_output['work_experience']:
      pdf.add_section_subtitle(f"{work['job_title']} | {work['company']}")
      pdf.add_text(f"Location: {work['location']}")
      pdf.add_text(f"Duration: {work['start_date']} - {work['end_date']}")

      # Modified Job Summary formatting
      pdf.add_text("Job Summary:")  # Decreased spacing between lines
      for job_desc in work['job_desc']:
          pdf.cell(5, 5, "-", 0, 0)  # Adjusted spacing for the bullet point
          pdf.multi_cell(0, 6, job_desc)  # Multi cell to allow wrapping text
      pdf.ln(3)  # Increased spacing between job experiences
    pdf.ln(3)

    # Education
    pdf.add_section_title("Education")
    for education in json_output['education']:
        pdf.add_section_subtitle(f"{education['title']} | {education['institute']}")
        pdf.add_text(f"Location: {education['location']}")
        pdf.add_text(f"Duration: {education['start_date']} - {education['end_date']}")
        pdf.add_text(f"Description: {education['description']}")
    pdf.ln(3)

    # Languages
    pdf.add_section_title("Languages")
    pdf.add_text(', '.join(json_output['languages']))
    pdf.ln(3)

    # Skills
    pdf.add_section_title("Skills")
    pdf.add_text(', '.join(json_output['skills']))
    pdf.ln(3)

    # Certifications
    pdf.add_section_title("Certifications")
    for cert in json_output['certification']:
        pdf.add_section_subtitle(f"{cert['title']} | {cert['issuer']}")
        pdf.add_text(f"Issued date: {cert['start_date']} - {cert['expiration_date']}")

    summary_pdf_name = f"{file_name[:-4]}_summary.pdf"

    # Simpan PDF ke lokasi sementara
    temp_pdf_path = '/tmp/' + summary_pdf_name
    pdf.output(temp_pdf_path)

    # Upload file PDF ke Google Cloud Storage
    upload_pdf_to_gcs(temp_pdf_path, summary_pdf_name)

    # Hapus file temporary PDF
    os.remove(temp_pdf_path)

    # Dapatkan URL untuk file PDF di GCS
    pdf_url = get_gcs_file_url(summary_pdf_name)

    # Kembalikan URL PDF summary
    return pdf_url

def upload_pdf_to_gcs(file_path, file_name):
    # Inisialisasi Google Cloud Storage client
    storage_client = storage.Client()

    # Dapatkan bucket dari Google Cloud Storage
    bucket = storage_client.get_bucket("dicoding-capstone-bucket")

    # Upload file ke GCS
    blob = bucket.blob(file_name)
    blob.upload_from_filename(file_path)

def get_gcs_file_url(file_name):
    # Dapatkan URL untuk file di Google Cloud Storage
    return f"https://storage.googleapis.com/dicoding-capstone-bucket/{file_name}"


In [None]:
pdf_url = summarize_to_pdf(json_output, file_name)
print("URL PDF summary:", pdf_url)

URL PDF summary: https://storage.googleapis.com/dicoding-capstone-bucket/cv_summary.pdf
