<a href="https://colab.research.google.com/github/B-Raghav/new/blob/main/loan_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re

# Step 1 of the pipeline: copy every '.txt' file from the input folder into
# the output folder, keeping only the lines that mention at least one
# loan-related keyword.

# Set the folder containing your original text files
original_folder_path = '/content/bank_data/'  # Update this path as needed
# Set the folder for filtered text files
filtered_folder_path = '/content/filtered_bank_data/'

# Fail early, before creating any output, with a message that says what to do.
# The exception type is the same one os.listdir() would raise.
if not os.path.isdir(original_folder_path):
    raise FileNotFoundError(
        f"Input folder not found: {original_folder_path!r}. "
        "Upload the bank text files there or update original_folder_path."
    )

# Create the filtered folder if it doesn't exist
os.makedirs(filtered_folder_path, exist_ok=True)

# Define keywords for filtering relevant data
keywords = [
    'loan', 'interest', 'rate', 'amount', 'repayment', 'tenure',
    'education', 'financial', 'bank', 'scheme', 'details', 'terms',
    'conditions', 'monthly', 'payment', 'fees', 'eligibility'
]

# Whole-word, case-insensitive match on any keyword. Keywords are escaped so a
# future entry containing regex metacharacters (e.g. 'p.a.') stays literal.
keywords_pattern = r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')\b'
# Compile once instead of handing the pattern string to re.search per line.
keywords_regex = re.compile(keywords_pattern, re.IGNORECASE)

# Process each original text file in the folder
written_count = 0
for filename in os.listdir(original_folder_path):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(original_folder_path, filename)

    # Read the content of the original text file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Remove irrelevant data: keep only lines that mention a keyword
    filtered_content = '\n'.join(
        line for line in content.splitlines() if keywords_regex.search(line)
    )

    # Save the filtered content to a new text file
    filtered_file_path = os.path.join(filtered_folder_path, f'filtered_{filename}')
    with open(filtered_file_path, 'w', encoding='utf-8') as filtered_file:
        filtered_file.write(filtered_content)
    written_count += 1

# Make an empty run visible instead of reporting success unconditionally.
if written_count == 0:
    print(f"Warning: no .txt files found in: {original_folder_path}")
print(f"Filtered text files saved in: {filtered_folder_path}")


FileNotFoundError: [Errno 2] No such file or directory: '/content/bank_data/'

In [None]:
import os
import re
import pandas as pd

# Set the folder containing your filtered text files
filtered_folder_path = '/content/filtered_bank_data/'

# Create a set to store unique extracted data
extracted_data = set()

# Define regex patterns for extracting loan amount and interest rate.
#
# loan_amount_pattern yields two groups per match: (number, unit).
#   * number: digits with optional comma groups and an optional decimal part.
#     Trailing sentence punctuation ("2024." or "10,") is NOT captured, so the
#     number can be parsed and year-checked reliably.
#   * unit: lakh/lakhs/l/crore/crores, matched only as a whole word. The word
#     boundary stops the bare 'l' from matching the first letter of an
#     unrelated word such as "loan". The group is '' when no unit follows.
loan_amount_pattern = r'(\d+(?:,\d+)*(?:\.\d+)?)(?:\s*(lakhs?|crores?|l)\b)?'
# interest_rate_pattern yields (rate, fractional_part); only the first group
# holds the full rate text.
interest_rate_pattern = r'(\d+(\.\d+)?)\s*%'

# Helper function to convert loan amounts into a uniform format
def convert_to_lakhs(loan_amount, unit):
    """Normalise a textual loan amount to a number of lakhs.

    Args:
        loan_amount: Numeric text, optionally containing thousands commas.
        unit: Unit text following the number ('l', 'lakh', 'lakhs', 'crore',
            'crores', in any letter case), or an empty/None value when no unit
            was found.

    Returns:
        The amount as a float, or None when the text is not a valid number.
        Lakh units are returned unchanged, crore units are multiplied by 100,
        and a unit-less value of at least 100000 is treated as rupees and
        divided by 100000. Anything else is returned as parsed.
    """
    try:
        value = float(loan_amount.replace(',', ''))
    except ValueError:
        # Text such as "1.2.3" cannot be parsed as a number.
        return None

    unit_key = unit.lower() if unit else ''

    if unit_key in ('l', 'lakh', 'lakhs'):
        return value
    if unit_key in ('crore', 'crores'):
        return value * 100  # 1 crore = 100 lakhs
    if not unit and value >= 100000:
        return value / 100000  # unit-less large value: assume rupees
    return value

# Function to check if the value looks like a year
def is_year(value):
    """Return True when *value* looks like a calendar year (1000-3000).

    Args:
        value: Text (or a number) captured as a candidate loan amount.

    Returns:
        True if the value is an integer in the inclusive range 1000..3000,
        otherwise False. Surrounding whitespace and trailing '.' or ','
        (sentence punctuation swept up with the number, as in "2024.") are
        ignored for text input. Values with internal separators such as
        "2,024" or "12.5" are not treated as years.
    """
    if isinstance(value, str):
        # "in 2024." must still be recognised as a year.
        value = value.strip().rstrip('.,')
    try:
        year = int(value)
    except (TypeError, ValueError):
        # Non-numeric text, or None, is simply not a year.
        return False
    return 1000 <= year <= 3000  # Range typically for years

# Process each filtered text file in the folder
for filename in os.listdir(filtered_folder_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(filtered_folder_path, filename)

        # Read the content of the filtered text file
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Extract bank name from the filename
        bank_name = filename.replace('filtered_', '').replace('.txt', '')

        # Search for loan amounts and interest rates in the content
        loan_amounts = re.findall(loan_amount_pattern, content, re.IGNORECASE)
        interest_rates = re.findall(interest_rate_pattern, content, re.IGNORECASE)

        # Filter out invalid loan amounts (those less than 10 lakhs or values resembling years)
        valid_loan_amounts = []
        for loan_amount, unit in loan_amounts:
            if is_year(loan_amount):
                continue  # Skip values that look like years
            converted_amount = convert_to_lakhs(loan_amount, unit)
            if converted_amount and converted_amount >= 10:
                valid_loan_amounts.append(f"{converted_amount} lakhs")

        # Filter interest rates that are between 1% and 50%
        valid_interest_rates = [rate[0] for rate in interest_rates if 1 <= float(rate[0]) <= 50]

        # Store unique combinations of loan amounts and interest rates
        for loan_amount_value in valid_loan_amounts:
            for interest_rate_value in valid_interest_rates:
                extracted_data.add((bank_name, loan_amount_value, interest_rate_value))

# Convert the set of unique data to a list for DataFrame creation
extracted_data_list = [{'Bank Name': bank, 'Loan Amount': loan_amount, 'Interest Rate (%)': interest_rate}
                        for bank, loan_amount, interest_rate in extracted_data]

# Create a DataFrame from the extracted data
extracted_df = pd.DataFrame(extracted_data_list)

# Save the extracted data to a CSV file for easier review
extracted_df.to_csv('/content/extracted_loan_data.csv', index=False)

# Display the extracted data
print(extracted_df)


             Bank Name    Loan Amount Interest Rate (%)
0    idfcfirstbank.com    888.0 lakhs               1.5
1      bankofbaroda.in   8500.0 lakhs                 2
2      bankofbaroda.in     12.0 lakhs                 2
3      bankofbaroda.in   2024.0 lakhs                 1
4         axisbank.com     50.0 lakhs                 8
..                 ...            ...               ...
177      icicibank.com   2500.0 lakhs                 2
178    bankofbaroda.in     15.0 lakhs                 2
179    bankofbaroda.in     10.0 lakhs              1.00
180  idfcfirstbank.com  10000.0 lakhs                 1
181       axisbank.com  11920.0 lakhs              6.50

[182 rows x 3 columns]


Loan Amount: 9, Interest Rate: 0%
Loan Amount: 1, Interest Rate: 5%
Loan Amount: 0.5, Interest Rate: 5%
Loan Amount: 0.0, Interest Rate: 0%
Loan Amount: 0.5, Interest Rate: 5%
Loan Amount: 0.0, Interest Rate: 0%
Loan Amount: 2.0, Interest Rate: 0%
Loan Amount: 0.0, Interest Rate: 0%
Loan Amount: 1.0, Interest Rate: 5%
Loan Amount: 0.0, Interest Rate: 0%
Loan Amount: 1.0, Interest Rate: 5%
Loan Amount: 0.0, Interest Rate: 0%
Loan Amount: 2.0, Interest Rate: 0%
Loan Amount: 0.0, Interest Rate: 0%
Loan Amount: 2.5, Interest Rate: 5%
Loan Amount: 0.0, Interest Rate: 0%
Loan Amount: 0.2, Interest Rate: 0%
Loan Amount: 2.0, Interest Rate: 0%
Loan Amount: 0.0, Interest Rate: 0%
Loan Amount: 2.5, Interest Rate: 5%
Loan Amount: 0.0, Interest Rate: 0%
Loan Amount: 2.0, Interest Rate: 0%
Loan Amount: 0.0, Interest Rate: 0%
Loan Amount: 2.5, Interest Rate: 5%
Loan Amount: 0.0, Interest Rate: 0%
Loan Amount: 2.0, Interest Rate: 0%
Loan Amount: 0.0, Interest Rate: 0%
Loan Amount: 2.5, Interest Rate: