In [58]:
import os
import json
import codecs

def extract_and_clean_json(directory):
    """
    Strip Markdown ```json ... ``` fences from every ``.json`` file under ``directory``.

    Each file is read as UTF-8 (undecodable bytes become U+FFFD). When the text holds a
    complete fenced block, only the whitespace-trimmed text between the first ```json
    marker and the last ``` marker is kept. The result is written back over the same
    file as UTF-8. Files without a complete fenced block keep their decoded content.

    Args:
        directory: Root folder; it is walked recursively with ``os.walk``.

    Returns:
        None. Files are modified in place. An error on one file is printed and the
        walk continues with the next file.
    """
    opening_fence = "```json"
    closing_fence = "```"

    for root, _, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith(".json"):
                continue
            file_path = os.path.join(root, file_name)
            try:
                with codecs.open(file_path, "r", encoding="utf-8", errors="replace") as handle:
                    content = handle.read()

                json_start = content.find(opening_fence)
                if json_start != -1:
                    body_start = json_start + len(opening_fence)
                    # Look for the closing fence only after the opening one. Searching the
                    # whole text would match the opening fence itself when no closing fence
                    # exists, producing an empty slice and wiping the file.
                    json_end = content.rfind(closing_fence, body_start)
                    if json_end != -1:
                        content = content[body_start:json_end].strip()

                with codecs.open(file_path, "w", encoding="utf-8") as handle:
                    handle.write(content)

            except Exception as e:
                # Best effort: report the failing file and keep going.
                print(f"Error processing file {file_path}: {e}")
                continue

# Run the cleanup over the exported JSON tree; matching files are rewritten in place.
directory_path = "../output_json"
extract_and_clean_json(directory_path)

# Printed unconditionally, including when per-file errors were reported above.
print("JSON content has been cleaned and saved back to files.")


JSON content has been cleaned and saved back to files.


In [52]:
import codecs
import json
import os
import re

def _strip_json_comments(text):
    """Remove // line comments and /* */ block comments that sit outside string literals."""
    # String literals are matched first and returned untouched, so text such as
    # "https://example.com" inside a value is never mistaken for a comment.
    token = re.compile(r'"(?:\\.|[^"\\])*"|//[^\n]*|/\*.*?\*/', re.DOTALL)
    return token.sub(lambda m: m.group(0) if m.group(0).startswith('"') else "", text)


def extract_and_clean_json(directory):
    """
    Strip Markdown ```json ... ``` fences and JS-style comments from ``.json`` files.

    Each ``.json`` file under ``directory`` is read as UTF-8 (undecodable bytes become
    U+FFFD). When the text holds a complete fenced block, the text between the first
    ```json marker and the last ``` marker is kept, comments outside string literals are
    removed, and the result is whitespace-trimmed. The content is then written back over
    the same file as UTF-8. Files without a complete fenced block keep their decoded
    content.

    Args:
        directory: Root folder; it is walked recursively with ``os.walk``.

    Returns:
        None. Files are modified in place. An error on one file is printed and the
        walk continues with the next file.
    """
    opening_fence = "```json"
    closing_fence = "```"

    for root, _, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith(".json"):
                continue
            file_path = os.path.join(root, file_name)
            try:
                with codecs.open(file_path, "r", encoding="utf-8", errors="replace") as handle:
                    content = handle.read()

                json_start = content.find(opening_fence)
                if json_start != -1:
                    body_start = json_start + len(opening_fence)
                    # Look for the closing fence only after the opening one. Searching the
                    # whole text would match the opening fence itself when no closing fence
                    # exists, producing an empty slice and wiping the file.
                    json_end = content.rfind(closing_fence, body_start)
                    if json_end != -1:
                        fenced_text = content[body_start:json_end]
                        content = _strip_json_comments(fenced_text).strip()

                # Save the cleaned JSON content back to the file
                with codecs.open(file_path, "w", encoding="utf-8") as handle:
                    handle.write(content)

            except Exception as e:
                # Best effort: report the failing file and keep going.
                print(f"Error processing file {file_path}: {e}")
                continue

# Run the cleanup (fence and comment removal) over the exported JSON tree, in place.
directory_path = "../output_json"
extract_and_clean_json(directory_path)

# Printed unconditionally, including when per-file errors were reported above.
print("JSON content has been cleaned, including comments, and saved back to files.")


JSON content has been cleaned, including comments, and saved back to files.


In [25]:
import pandas as pd
import json
# Forward slashes resolve on both Windows and POSIX.
file_path = '../output_json/ACCOUNTANT/10554236.json'

# Load the CV. The cleaning cell writes these files as UTF-8, so decode them the same
# way instead of relying on the platform default encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    cv_data = json.load(file)

# Build one labelled text block per 'work' entry that has a summary or highlights.
work_paragraph = ""
for work in cv_data.get('work', []):
    work_summary = work.get('summary', '')
    # 'highlights' may be missing or null; both are treated as "no highlights".
    work_highlights = ', '.join(work.get('highlights') or [])
    if work_summary or work_highlights:
        work_paragraph += f"Position: {work.get('position', '')}\n"
        work_paragraph += f"Company: {work.get('name', '')}\n"
        work_paragraph += f"Summary: {work_summary}\n"
        if work_highlights:
            work_paragraph += f"Highlights: {work_highlights}\n"
        work_paragraph += "\n"

# Display the paragraph
print(work_paragraph)


Position: Accountant
Company: Company Name
Summary: In this position as an Accountant assigned to the Defense Enterprise Accounting and Management System (DEAMS) ERO I was responsible for identifying and resolving issues affecting the DEAMS General Ledger.
Highlights: Served on a tiger team which identified and resolved General Ledger postings in DEAMS totaling $360B in accounting adjustments., In collaboration with DFAS Europe, developed an automated tool that identified duplicate obligations.

Position: Resource Advisor
Company: Company Name
Summary: In this position as Resource Advisor for the 1st Air Communications Operation Squadron (1ACOS) I was responsible for providing financial advice and decision support to the Commander.

Position: Staff Accountant
Company: Company Name
Summary: In my position as the Staff Accountant for HQ USAFE I was responsible for providing accounting and financial oversight and advice to customers throughout the Command in support of the USAFE Comptroll

In [26]:
import pandas as pd
import json
import os

# Build the path with os.path.join so it resolves on Windows and POSIX alike; a
# backslash-only literal would make os.path.basename return the whole string on POSIX.
file_path = os.path.join('..', 'output_json', 'ACCOUNTANT', '10554236.json')
file_name = os.path.basename(file_path)

# The CV id is the file name without its extension, e.g. '10554236.json' -> '10554236'.
cv_id = os.path.splitext(file_name)[0]

try:
    # The cleaning cell writes these files as UTF-8, so decode them the same way
    # instead of relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        cv_data = json.load(file)

    # Build one labelled text block per 'work' entry that has a summary or highlights.
    work_paragraph = ""
    for work in cv_data.get('work', []):
        work_summary = work.get('summary', '')
        # 'highlights' may be missing or null; both are treated as "no highlights".
        work_highlights = ', '.join(work.get('highlights') or [])
        if work_summary or work_highlights:
            work_paragraph += f"Position: {work.get('position', '')}\n"
            work_paragraph += f"Company: {work.get('name', '')}\n"
            work_paragraph += f"Summary: {work_summary}\n"
            if work_highlights:
                work_paragraph += f"Highlights: {work_highlights}\n"
            work_paragraph += "\n"

    # One-row DataFrame with 'id' and 'work' columns
    df = pd.DataFrame({'id': [cv_id], 'work': [work_paragraph]})

    # Display the DataFrame
    print(df)

except FileNotFoundError:
    print(f"File {file_path} not found.")


         id                                               work
0  10554236  Position: Accountant\nCompany: Company Name\nS...


In [28]:
import json

# Forward slashes resolve on both Windows and POSIX.
file_path = '../output_json/ACCOUNTANT/10554236.json'

# Load the CV. The cleaning cell writes these files as UTF-8, so decode them the same
# way instead of relying on the platform default encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    cv_data = json.load(file)

# Build one labelled text block per 'education' entry.
education_paragraph = ""
for education in cv_data.get('education', []):
    institution = education.get('institution', '')
    area = education.get('area', '')
    study_type = education.get('studyType', '')
    start_date = education.get('startDate', '')
    end_date = education.get('endDate', '')
    # 'courses' may be missing or null; both are treated as "no courses".
    courses = ', '.join(education.get('courses') or [])

    education_paragraph += f"Institution: {institution}\n"
    education_paragraph += f"Area of Study: {area}\n"
    education_paragraph += f"Degree: {study_type}\n"
    education_paragraph += f"Start Date: {start_date}\n"
    education_paragraph += f"End Date: {end_date}\n"
    if courses:
        education_paragraph += f"Courses: {courses}\n"
    education_paragraph += "\n"

# Display the paragraph
print(education_paragraph)


Institution: Northern Maine Community College
Area of Study: Accounting
Degree: Associate
Start Date: 
End Date: 1994




In [29]:
import pandas as pd
import json
import os

# Build the path with os.path.join so it resolves on Windows and POSIX alike; a
# backslash-only literal would make os.path.basename return the whole string on POSIX.
file_path = os.path.join('..', 'output_json', 'ACCOUNTANT', '10554236.json')
file_name = os.path.basename(file_path)

# The CV id is the file name without its extension, e.g. '10554236.json' -> '10554236'.
cv_id = os.path.splitext(file_name)[0]

try:
    # The cleaning cell writes these files as UTF-8, so decode them the same way
    # instead of relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        cv_data = json.load(file)

    # Build one labelled text block per 'work' entry that has a summary or highlights.
    work_paragraph = ""
    for work in cv_data.get('work', []):
        work_summary = work.get('summary', '')
        # 'highlights' may be missing or null; both are treated as "no highlights".
        work_highlights = ', '.join(work.get('highlights') or [])
        if work_summary or work_highlights:
            work_paragraph += f"Position: {work.get('position', '')}\n"
            work_paragraph += f"Company: {work.get('name', '')}\n"
            work_paragraph += f"Summary: {work_summary}\n"
            if work_highlights:
                work_paragraph += f"Highlights: {work_highlights}\n"
            work_paragraph += "\n"

    # Build one labelled text block per 'education' entry.
    education_paragraph = ""
    for education in cv_data.get('education', []):
        institution = education.get('institution', '')
        area = education.get('area', '')
        study_type = education.get('studyType', '')
        start_date = education.get('startDate', '')
        end_date = education.get('endDate', '')
        # 'courses' may be missing or null; both are treated as "no courses".
        courses = ', '.join(education.get('courses') or [])

        education_paragraph += f"Institution: {institution}\n"
        education_paragraph += f"Area of Study: {area}\n"
        education_paragraph += f"Degree: {study_type}\n"
        education_paragraph += f"Start Date: {start_date}\n"
        education_paragraph += f"End Date: {end_date}\n"
        if courses:
            education_paragraph += f"Courses: {courses}\n"
        education_paragraph += "\n"

    # One-row DataFrame with 'id', 'work', and 'education' columns
    df = pd.DataFrame({
        'id': [cv_id],
        'work': [work_paragraph],
        'education': [education_paragraph]
    })

    # Display the DataFrame
    print(df)

except FileNotFoundError:
    print(f"File {file_path} not found.")


         id                                               work  \
0  10554236  Position: Accountant\nCompany: Company Name\nS...   

                                           education  
0  Institution: Northern Maine Community College\...  


In [None]:
import pandas as pd
import json
import os

# Build the path with os.path.join so it resolves on Windows and POSIX alike; a
# backslash-only literal would make os.path.basename return the whole string on POSIX.
file_path = os.path.join('..', 'output_json', 'ACCOUNTANT', '10554236.json')
file_name = os.path.basename(file_path)

# The CV id is the file name without its extension, e.g. '10554236.json' -> '10554236'.
cv_id = os.path.splitext(file_name)[0]

try:
    # The cleaning cell writes these files as UTF-8, so decode them the same way
    # instead of relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        cv_data = json.load(file)

    # Build one labelled text block per 'work' entry that has a summary or highlights.
    work_paragraph = ""
    for work in cv_data.get('work', []):
        work_summary = work.get('summary', '')
        # 'highlights' may be missing or null; both are treated as "no highlights".
        work_highlights = ', '.join(work.get('highlights') or [])
        if work_summary or work_highlights:
            work_paragraph += f"Position: {work.get('position', '')}\n"
            work_paragraph += f"Company: {work.get('name', '')}\n"
            work_paragraph += f"Summary: {work_summary}\n"
            if work_highlights:
                work_paragraph += f"Highlights: {work_highlights}\n"
            work_paragraph += "\n"

    # Build one labelled text block per 'education' entry.
    education_paragraph = ""
    for education in cv_data.get('education', []):
        institution = education.get('institution', '')
        area = education.get('area', '')
        study_type = education.get('studyType', '')
        start_date = education.get('startDate', '')
        end_date = education.get('endDate', '')
        # 'courses' may be missing or null; both are treated as "no courses".
        courses = ', '.join(education.get('courses') or [])

        education_paragraph += f"Institution: {institution}\n"
        education_paragraph += f"Area of Study: {area}\n"
        education_paragraph += f"Degree: {study_type}\n"
        education_paragraph += f"Start Date: {start_date}\n"
        education_paragraph += f"End Date: {end_date}\n"
        if courses:
            education_paragraph += f"Courses: {courses}\n"
        education_paragraph += "\n"

    # Build one labelled text block per 'skills' entry. Only the 'skills' section is
    # read here; the variable and column keep the 'skills_projects' name.
    skills_projects_paragraph = ""
    for skills in cv_data.get('skills', []):
        skill_name = skills.get('name', '')
        skill_level = skills.get('level', '')
        # 'keywords' may be missing or null; both are treated as "no keywords".
        skill_keywords = ', '.join(skills.get('keywords') or [])

        skills_projects_paragraph += f"Skill: {skill_name}\n"
        if skill_level:
            skills_projects_paragraph += f"Level: {skill_level}\n"
        if skill_keywords:
            skills_projects_paragraph += f"Keywords: {skill_keywords}\n"
        skills_projects_paragraph += "\n"

    # One-row DataFrame with 'id', 'work', 'education', and 'skills_projects' columns.
    # It is built but not displayed by this cell.
    df = pd.DataFrame({
        'id': [cv_id],
        'work': [work_paragraph],
        'education': [education_paragraph],
        'skills_projects': [skills_projects_paragraph]
    })

except FileNotFoundError:
    print(f"File {file_path} not found.")


         id                                               work  \
0  10554236  Position: Accountant\nCompany: Company Name\nS...   

                                           education  \
0  Institution: Northern Maine Community College\...   

                                     skills_projects  
0  Skill: Accounting\n\nSkill: General Accounting...  


In [64]:
import pandas as pd
import json
import os

def _join_items(entry, key):
    """Comma-join the list stored under ``key`` in ``entry``; missing or null gives ''."""
    return ', '.join(str(item) for item in (entry.get(key) or []))


def _work_text(cv_data):
    """Render each 'work' entry that has a summary or highlights as labelled lines."""
    parts = []
    for work in cv_data.get('work') or []:
        work_summary = work.get('summary', '')
        work_highlights = _join_items(work, 'highlights')
        if work_summary or work_highlights:
            parts.append(f"Position: {work.get('position', '')}\n")
            parts.append(f"Company: {work.get('name', '')}\n")
            parts.append(f"Summary: {work_summary}\n")
            if work_highlights:
                parts.append(f"Highlights: {work_highlights}\n")
            parts.append("\n")
    return "".join(parts)


def _education_text(cv_data):
    """Render every 'education' entry as labelled lines."""
    parts = []
    for education in cv_data.get('education') or []:
        courses = _join_items(education, 'courses')
        parts.append(f"Institution: {education.get('institution', '')}\n")
        parts.append(f"Area of Study: {education.get('area', '')}\n")
        parts.append(f"Degree: {education.get('studyType', '')}\n")
        parts.append(f"Start Date: {education.get('startDate', '')}\n")
        parts.append(f"End Date: {education.get('endDate', '')}\n")
        if courses:
            parts.append(f"Courses: {courses}\n")
        parts.append("\n")
    return "".join(parts)


def _skills_text(cv_data):
    """Render every 'skills' entry as labelled lines; level and keywords only when set."""
    parts = []
    for skill in cv_data.get('skills') or []:
        skill_level = skill.get('level', '')
        skill_keywords = _join_items(skill, 'keywords')
        parts.append(f"Skill: {skill.get('name', '')}\n")
        if skill_level:
            parts.append(f"Level: {skill_level}\n")
        if skill_keywords:
            parts.append(f"Keywords: {skill_keywords}\n")
        parts.append("\n")
    return "".join(parts)


def process_json(file_path, folder_name):
    """
    Turn one CV JSON file into a one-row DataFrame of flattened text sections.

    Args:
        file_path: Path to the JSON file. The file name without its extension becomes
            the ``id`` value (e.g. '10554236.json' -> '10554236').
        folder_name: Value stored in the ``category`` column.

    Returns:
        A DataFrame with columns ``id``, ``category``, ``work``, ``education`` and
        ``skills_projects`` (the last is built from the 'skills' section only), or
        ``None`` when the file is missing, cannot be decoded or parsed, or its
        top-level JSON value is not an object. The reason is printed with the path.
    """
    cv_id = os.path.splitext(os.path.basename(file_path))[0]

    try:
        # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            cv_data = json.load(file)
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return None
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # Name the file so the failure can be traced back to it.
        print(f"{type(e).__name__} in {file_path}: {e}")
        return None

    if not isinstance(cv_data, dict):
        # A top-level array or scalar has no sections to read; skip it rather than
        # letting an AttributeError abort a whole batch run.
        print(f"Skipping {file_path}: top-level JSON value is not an object.")
        return None

    return pd.DataFrame({
        'id': [cv_id],
        'category': [folder_name],
        'work': [_work_text(cv_data)],
        'education': [_education_text(cv_data)],
        'skills_projects': [_skills_text(cv_data)]
    })

def process_folder(folder_path):
    """
    Collect every parseable CV JSON file under ``folder_path`` into one DataFrame.

    The tree is walked recursively. Each ``.json`` file is passed to ``process_json``
    together with the name of the directory that contains it, which becomes the row's
    ``category``. Files for which ``process_json`` returns ``None`` are skipped.

    Args:
        folder_path: Root folder containing the JSON files (directly or in subfolders).

    Returns:
        A DataFrame with one row per parsed file and a fresh 0..n-1 index. When no file
        could be parsed, an empty DataFrame with the expected columns is returned.
    """
    columns = ['id', 'category', 'work', 'education', 'skills_projects']
    frames = []
    for root, dirs, files in os.walk(folder_path):
        # Sorting in place fixes the traversal order, so row order is reproducible
        # regardless of how the file system lists entries.
        dirs.sort()
        folder_name = os.path.basename(root)
        for file in sorted(files):
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                df = process_json(file_path, folder_name)
                if df is not None:
                    frames.append(df)

    if not frames:
        # pd.concat raises ValueError on an empty list; return a well-formed empty frame.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

# Root folder whose subfolders (ACCOUNTANT, IT, etc.) hold the per-CV JSON files;
# each subfolder name ends up in the 'category' column.
folder_path = '../output_json'
final_df = process_folder(folder_path)

# Plain-text dump of the combined DataFrame (pandas truncates long frames itself).
print(final_df)


JSONDecodeError: Expecting value: line 20 column 2 (char 1220)
JSONDecodeError: Extra data: line 98 column 1 (char 3436)
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
JSONDecodeError: Expecting ',' delimiter: line 33 column 5 (char 1731)
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
JSONDecodeError: Invalid control character at: line 20 column 453 (char 1094)
JSONDecodeError: Expecting value: line 43 column 3 (char 2631)
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
JSONDecodeError: Expecting value: line 117 column 48 (char 3334)
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
JSONDecodeError: Expecting ',' delimiter: line 26 column 9 (char 883)
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
JSONDecodeError: Expecting value: line 73 column 5 (char 2561)
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
JSONDecodeError: Extra data: line 96 column 1 (char 2567

In [65]:
# Bare last expression: Jupyter renders the combined DataFrame as this cell's output.
final_df

Unnamed: 0,id,category,work,education,skills_projects
0,10554236,ACCOUNTANT,Position: Accountant\nCompany: Company Name\nS...,Institution: Northern Maine Community College\...,Skill: Accounting\n\nSkill: General Accounting...
1,10674770,ACCOUNTANT,Position: Staff Accountant\nCompany: Company N...,Institution: University of North Carolina\nAre...,Skill: Accounting\nLevel: Master\nKeywords: Qu...
2,11163645,ACCOUNTANT,Position: Accountant\nCompany: Company Name\nS...,"Institution: Martinez Adult Education, Busines...",Skill: Accounting\nLevel: Advanced\nKeywords: ...
3,11759079,ACCOUNTANT,Position: Senior Accountant\nCompany: Company ...,"Institution: EMORY UNIVERSITY, Goizueta Busine...",Skill: Accounting\nLevel: Senior\nKeywords: ac...
4,12065211,ACCOUNTANT,Position: Senior Accountant\nCompany: Company ...,Institution: TEMPLE UNIVERSITY\nArea of Study:...,Skill: Accounting\nLevel: Senior\nKeywords: Ad...
...,...,...,...,...,...
1369,83816738,INFORMATION-TECHNOLOGY,Position: Information Technology Intern (Test ...,Institution: Lamar University\nArea of Study: ...,Skill: Java/J2EE\n\nSkill: JavaScript\n\nSkill...
1370,89413122,INFORMATION-TECHNOLOGY,Position: Operations Research Analyst\nCompany...,Institution: New York University\nArea of Stud...,Skill: Project Management\n\nSkill: Informatio...
1371,91121135,INFORMATION-TECHNOLOGY,"Position: Administrative Assistant Director, H...",Institution: Virginia High School\nArea of Stu...,Skill: Administrative Skills\nKeywords: Traini...
1372,91635250,INFORMATION-TECHNOLOGY,Position: Information Technology Specialist\nC...,Institution: University Of Advancing Technolog...,Skill: Information Technology\nKeywords: Route...


In [66]:
import os
import json

def find_files_with_errors(directory):
    """
    List the ``.json`` files under ``directory`` that cannot be loaded as JSON.

    Args:
        directory: Root folder; it is walked recursively with ``os.walk``.

    Returns:
        Paths of files that are not valid UTF-8 or not valid JSON, in sorted traversal
        order. Any other error on a file (for example a permission problem) is printed
        and that file is left out of the list.
    """
    error_files = []

    # Walk through the directory and its subdirectories
    for root, dirs, files in os.walk(directory):
        # Sorting in place fixes the traversal order, so the report is reproducible.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.json'):
                continue
            file_path = os.path.join(root, file)
            try:
                # Decode explicitly as UTF-8 instead of the platform default encoding.
                with open(file_path, 'r', encoding='utf-8') as f:
                    json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                # An undecodable file is just as unusable as a syntactically broken one.
                error_files.append(file_path)
            except Exception as e:
                print(f"An error occurred while processing {file_path}: {e}")

    return error_files

# Scan the exported JSON tree and collect the paths that fail to load as JSON.
directory_path = '../output_json'
error_files = find_files_with_errors(directory_path)

# Report one failing path per line under a fixed header.
print("Files with JSON errors:")
for error_file in error_files:
    print(error_file)


Files with JSON errors:
../output_json\ACCOUNTANT\17556527.json
../output_json\ACCOUNTANT\18569929.json
../output_json\ACCOUNTANT\20253563.json
../output_json\ACCOUNTANT\24103168.json
../output_json\ACCOUNTANT\24703009.json
../output_json\ACCOUNTANT\27558837.json
../output_json\ACCOUNTANT\28939941.json
../output_json\ACCOUNTANT\29456173.json
../output_json\ACCOUNTANT\36024962.json
../output_json\ACCOUNTANT\39115899.json
../output_json\ACCOUNTANT\43685045.json
../output_json\ACCOUNTANT\78403342.json
../output_json\ADVOCATE\10818478.json
../output_json\ADVOCATE\11773767.json
../output_json\ADVOCATE\13115648.json
../output_json\ADVOCATE\13809698.json
../output_json\ADVOCATE\18725071.json
../output_json\ADVOCATE\18997135.json
../output_json\ADVOCATE\19518606.json
../output_json\ADVOCATE\22042181.json
../output_json\ADVOCATE\26071861.json
../output_json\ADVOCATE\27182111.json
../output_json\ADVOCATE\29177904.json
../output_json\ADVOCATE\42164460.json
../output_json\ADVOCATE\47133747.json
..

In [67]:
# Persist the combined table as CSV; index=False leaves the row index out of the file.
# NOTE(review): presumably "../data/cv data" must already exist — confirm before a fresh run.
final_df.to_csv("../data/cv data/cv_data.csv",index=False)