In [3]:
import requests  
import os    
import pandas as pd  


def ocr_space_file(file_path, api_key='your_api_key', timeout=120):
    """Upload a local file to the OCR.space parse endpoint and return the reply.

    Args:
        file_path: Path of the file to upload; it is opened in binary mode.
        api_key: Value sent as the ``apikey`` form field.
        timeout: Seconds handed to ``requests.post`` as its ``timeout`` so a
            stalled connection raises instead of blocking the caller forever.

    Returns:
        The JSON-decoded response body. When the body is not valid JSON
        the raw response text is returned as a ``str`` instead.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        requests.RequestException: On connection failure or timeout.
    """
    with open(file_path, 'rb') as f:
        r = requests.post(
            'https://api.ocr.space/parse/image',
            files={file_path: f},
            data={'apikey': api_key},
            timeout=timeout,
        )

    try:
        return r.json()
    except ValueError:
        # The body was not JSON (for example an HTML error page). Return the
        # raw text so the caller can report it instead of crashing here.
        return r.text

def _collect_parsed_text(response):
    """Return the text of every ``ParsedResults`` entry joined by newlines, or None."""
    if not isinstance(response, dict):
        return None

    parsed_results = response.get('ParsedResults')
    if not parsed_results or not isinstance(parsed_results, list):
        return None

    # Keep every entry, not only the first one, so that documents which come
    # back as several results (one per page, per the OCR.space docs) are not
    # truncated. A single-entry reply yields exactly that entry's text.
    pages = [
        entry.get('ParsedText') or ''
        for entry in parsed_results
        if isinstance(entry, dict)
    ]
    if not pages:
        return None
    return '\n'.join(pages)


def _describe_ocr_error(response):
    """Build the text shown after ``Error processing <file>:`` for a failed reply."""
    if isinstance(response, str):
        return response
    if isinstance(response, dict):
        return response.get('ErrorMessage', 'No error message available')
    # Anything else (list, None, number) has no ``.get``; show it verbatim.
    return f"Unexpected response: {response!r}"


def extract_text_from_folder(source_folder, destination_folder, api_key):
    """OCR every file in ``source_folder``; save the text per file and as one CSV.

    Each regular file directly inside ``source_folder`` (visited in sorted
    name order) is passed to ``ocr_space_file``. On success the text is
    written to ``<destination_folder>/<stem>.txt`` and remembered; after the
    loop all remembered rows are written to ``extracted_texts.csv`` inside
    ``destination_folder`` with the columns ``File Name`` and ``Resume``.
    A file whose request fails or whose reply holds no parsed text is
    reported with ``print`` and skipped, so one bad file does not stop the
    batch.

    Args:
        source_folder: Directory whose files are sent to the OCR service.
        destination_folder: Directory for the ``.txt`` files and the CSV;
            created when missing.
        api_key: API key forwarded to ``ocr_space_file``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(destination_folder, exist_ok=True)

    extracted_data = []  # One {'File Name', 'Resume'} row per successful file

    # Sorted so the CSV row order does not depend on the directory listing order.
    for file_name in sorted(os.listdir(source_folder)):
        file_path = os.path.join(source_folder, file_name)

        if not os.path.isfile(file_path):  # Skip sub-directories and the like
            continue

        print(f"Processing: {file_path}")

        try:
            response = ocr_space_file(file_path, api_key)
        except (requests.RequestException, ValueError) as exc:
            # Network failure or undecodable reply: report and move on.
            print(f"Error processing {file_name}: {exc}")
            continue

        extracted_text = _collect_parsed_text(response)
        if extracted_text is None:
            print(f"Error processing {file_name}: {_describe_ocr_error(response)}")
            continue

        # Same base name as the input, with the extension swapped for .txt
        output_file_name = os.path.splitext(file_name)[0] + '.txt'
        output_file_path = os.path.join(destination_folder, output_file_name)

        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(extracted_text)

        print(f"Extracted text saved to: {output_file_path}")

        extracted_data.append({'File Name': file_name, 'Resume': extracted_text})

    # The combined CSV is only written when at least one file succeeded.
    if extracted_data:
        df = pd.DataFrame(extracted_data)
        csv_file_path = os.path.join(destination_folder, "extracted_texts.csv")
        df.to_csv(csv_file_path, index=False, encoding='utf-8')
        print(f"Combined CSV saved to: {csv_file_path}")



In [4]:
source_folder = "ACCOUNTANT"  # Folder containing the files to OCR
destination_folder = "New_extract"  # Folder that receives the extracted text

# Read the OCR.space key from the environment so that it is never stored in
# the notebook. NOTE(review): a key that was previously committed here should
# be treated as exposed and rotated.
api_key = os.environ.get("OCR_SPACE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OCR_SPACE_API_KEY environment variable to your OCR.space "
        "API key before running this cell."
    )

# OCR every file in source_folder; progress and errors are printed per file.
extract_text_from_folder(source_folder, destination_folder, api_key)

Processing: ACCOUNTANT\10329506.pdf
Extracted text saved to: New_extract\10329506.txt
Processing: ACCOUNTANT\10466583.pdf
Extracted text saved to: New_extract\10466583.txt
Processing: ACCOUNTANT\10554236.pdf
Extracted text saved to: New_extract\10554236.txt
Processing: ACCOUNTANT\10674770.pdf
Extracted text saved to: New_extract\10674770.txt
Processing: ACCOUNTANT\10748989.pdf
Extracted text saved to: New_extract\10748989.txt
Processing: ACCOUNTANT\10751444.pdf
Extracted text saved to: New_extract\10751444.txt
Processing: ACCOUNTANT\10909673.pdf
Extracted text saved to: New_extract\10909673.txt
Processing: ACCOUNTANT\10953078.pdf
Extracted text saved to: New_extract\10953078.txt
Processing: ACCOUNTANT\11065180.pdf
Extracted text saved to: New_extract\11065180.txt
Processing: ACCOUNTANT\11155153.pdf
Extracted text saved to: New_extract\11155153.txt
Processing: ACCOUNTANT\11163645.pdf
Extracted text saved to: New_extract\11163645.txt
Processing: ACCOUNTANT\11197262.pdf
Extracted text sav

In [5]:
source_folder = "ACCOUNTANT"  # Folder holding the original files
destination_folder = "New_extract"  # Folder holding the extracted text files

# Folder whose .txt files are combined into a single CSV
folder_path = 'New_extract'

# One {'Resume': <file content>} row per text file
resumes = []

# Sorted so that the CSV row order is the same on every run, whatever order
# the operating system lists the directory in.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.txt'):  # Ignore anything that is not a .txt file
        file_path = os.path.join(folder_path, filename)

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            resumes.append({'Resume': content})

# Naming the column explicitly keeps the "Resume" header in the CSV even when
# no .txt file was found, so later readers of the file do not break.
df = pd.DataFrame(resumes, columns=['Resume'])

# Destination of the combined CSV (relative to the working directory)
output_csv_path = 'output_resumes.csv'

df.to_csv(output_csv_path, index=False)

print(f"Successfully combined resumes into {output_csv_path}.")

Successfully combined resumes into output_resumes.csv.


In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from transformers import BertModel, BertTokenizer, TFBertModel

# Load the BERT tokenizer and the PyTorch model; the PyTorch class is needed
# because the tokenizer below is asked for PyTorch tensors (return_tensors='pt').
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Load the CSV file with extracted resumes
csv_file_path = 'output_resumes.csv'  # Update this path
df = pd.read_csv(csv_file_path)


def get_bert_embeddings(text):
    """Return the last-layer hidden state of the first token of ``text``.

    The text is tokenized with truncation to at most 512 tokens and run
    through ``model`` without gradient tracking. The vector at position 0 of
    the first (only) sequence is returned as a numpy array; with this
    tokenizer that position is the [CLS] token.
    """
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[0][0].numpy()


# An empty resume is read back from the CSV as NaN; the tokenizer needs a
# string, so missing values are embedded as the empty string.
resume_texts = df['Resume'].fillna('').astype(str)

# Generate embeddings for each resume
df['Embeddings'] = resume_texts.apply(get_bert_embeddings)

# Stack the per-row vectors into one 2-D array (rows = resumes) for clustering
embedding_matrix = df['Embeddings'].to_list()  # list of numpy arrays
embedded_matrix = np.array(embedding_matrix)

# Apply K-Means clustering
num_clusters = 5  # Adjust based on your needs
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
df['Cluster'] = kmeans.fit_predict(embedded_matrix)

# Optionally reduce dimensions for visualization
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embedded_matrix)

# Save the results
# NOTE(review): the 'Embeddings' column holds numpy arrays, which to_csv
# writes as their string form - confirm that this is what downstream needs.
output_csv_path = 'clustered_resumes.csv'
df.to_csv(output_csv_path, index=False, encoding='utf-8')

print(f"Clustered resumes saved to: {output_csv_path}")

ModuleNotFoundError: No module named 'torch'