In [3]:
import pandas as pd
import re
import pdfplumber

def split_rows(row):
    # Split the row by '\n' if any element contains '\n'
    if any(isinstance(val, str) and '\n' in val for val in row):
        split_values = [str(val).split('\n') if isinstance(val, str) else [val] for val in row]
        # Determine the maximum length of sublists in the original data
        max_length = max(len(sublist) for sublist in split_values)
        # Fill shorter sublists with empty strings to match the maximum length
        filled_data = [sublist + [''] * (max_length - len(sublist)) for sublist in split_values]
        # Use zip to transpose the list and then convert it to a list of lists
        restructured_data = [list(row) for row in zip(*filled_data)]
        return restructured_data
    else:
        return [list(row)]

def correct_df_list(df_list, rows):
    corrected_df_list = []
    i = 0
    while i < len(df_list):
        current_row = df_list[i]
        
        # Check if the next row exists and if it should be merged
        if i + 1 < len(df_list) and pd.isna(df_list[i+1].iloc[0, 0]):
            next_row = df_list[i+1]
            
            # Combine the two rows
            combined_row = pd.concat([current_row.iloc[:, :2], next_row.iloc[:, 2:]], axis=1)
            
            # Find the corresponding row in the extracted text
            date = current_row.iloc[0, 1]
            matched_row = next((row for row in rows if row.startswith(date)), None)
            
            if matched_row:
                # Split the matched row and update the combined row
                parts = re.split(r'\s{2,}', matched_row.strip())
                if len(parts) >= 4:
                    combined_row.iloc[0, 0] = parts[0]  # Date
                    combined_row.iloc[0, 1] = ' '.join(parts[1:-2])  # Description
                    combined_row.iloc[0, 2] = parts[-2]  # Deposit/Withdrawal
                    combined_row.iloc[0, 3] = parts[-1]  # Balance
            
            corrected_df_list.append(combined_row)
            i += 2
        else:
            corrected_df_list.append(current_row)
            i += 1
    
    return pd.concat(corrected_df_list, ignore_index=True)

file_path = 'C:/Users/User/Downloads/Merged stmt (1).pdf'

pdf = pdfplumber.open(file_path)

df_list = []  # Use a list to store DataFrames

for i in range(len(pdf.pages)):
    page = pdf.pages[i]
    table = page.extract_table(table_settings={"horizontal_strategy": "text"})

    if table:
        table_df = pd.DataFrame(table[2:])
        if len(table_df.columns) == 5:
            for _, row in table_df.iterrows():
                split_row_list = split_rows(row)
                for split_row in split_row_list:
                    df_list.append(pd.DataFrame([split_row], columns=table_df.columns))
page_num = 0
with pdfplumber.open(file_path) as pdf:
    bal = []
    text = ""
    for page in pdf.pages:
        page_num += 1
        text = f'{text} \n{page.extract_text()}'
rows = text.split('\n')
# Usage
corrected_df = correct_df_list(df_list, rows)

In [59]:
def check_and_fill(df_list):
    df = pd.concat(df_list, ignore_index=True)
    AMOUNT_REGEX = r'\.\d{2}'
    
    for i in range(1, len(df) - 1):
        try:
            # Check if first column is empty string and any of columns 2, 3, 4 match the regex
            if df.iloc[i, 0] == "" and any(isinstance(df.iloc[i, col], str) and 
                                           re.search(AMOUNT_REGEX, str(df.iloc[i, col])[-3:]) 
                                           for col in [2, 3, 4]):
                
                matching_columns = [col for col in [2, 3, 4] 
                                    if isinstance(df.iloc[i, col], str) and 
                                    re.search(AMOUNT_REGEX, str(df.iloc[i, col])[-3:])]
                
                if matching_columns:
                    print(f"Row {i}: Matching columns: {matching_columns}")
                    print(f"Values: {[df.iloc[i, col] for col in matching_columns]}")
                    
                    prev_amt_check = all(pd.isna(df.iloc[i-1, j]) or df.iloc[i-1, j] == "" for j in matching_columns)
                    next_amt_check = all(pd.isna(df.iloc[i+1, j]) or df.iloc[i+1, j] == "" for j in matching_columns)
                    prev_date_check = all(df.iloc[i-1, j] != "" for j in [0,1] if pd.notna(df.iloc[i-1, j]))
                    next_date_check = all(df.iloc[i+1, j] != "" for j in [0,1] if pd.notna(df.iloc[i+1, j]))
                    
                    if next_amt_check and next_date_check:
                        for col in matching_columns:
                            df.iloc[i+1, col] = df.iloc[i, col]
                            df.iloc[i, col] = ""
                    elif prev_amt_check and prev_date_check:
                        for col in matching_columns:
                            df.iloc[i-1, col] = df.iloc[i, col]
                            df.iloc[i, col] = ""
        
        except Exception as e:
            print(f"Error processing row {i}: {str(e)}")
            print(f"Row data: {df.iloc[i]}")
    
    return df

In [60]:
file_path = 'C:/Users/User/Downloads/Merged stmt (1).pdf'

pdf = pdfplumber.open(file_path)

df_list = []  # Use a list to store DataFrames

for i in range(len(pdf.pages)):
    page = pdf.pages[i]
    table = page.extract_table(table_settings={"horizontal_strategy": "text"})

    if table:
        table_df = pd.DataFrame(table[2:])
        if len(table_df.columns) == 5:
            for _, row in table_df.iterrows():
                split_row_list = split_rows(row)
                for split_row in split_row_list:
                    df_list.append(pd.DataFrame([split_row], columns=table_df.columns))
# Usage
corrected_df = check_and_fill(df_list)

Row 1: Matching columns: [4]
Values: ['7,609.38']
Row 24: Matching columns: [2, 4]
Values: ['1,220.00', '8,340.88']
Row 52: Matching columns: [4]
Values: ['6,910.38']
Row 117: Matching columns: [2]
Values: ['1,350.00']
Row 147: Matching columns: [4]
Values: ['12,965.88']


In [61]:
corrected_df

Unnamed: 0,0,1,2,3,4
0,,,,,
1,,Balance from previous statement,,,7609.38
2,,,,,
3,30-03-2024,Instant Transfer at KLM,1195.00,,8804.38
4,,Cake Payment,,,
...,...,...,...,...,...
246,,20240625CIBBMYKL010ORM46906467,,,
247,,,,,
248,Total Withdrawals / Jumlah Pengeluaran : 8 3 6...,,,,
249,,,,,


In [62]:
corrected_df.to_csv("test.csv")

In [6]:
from pdfminer.high_level import extract_pages
import os
from pdfminer.layout import LTTextContainer, LTChar
from python.font_check import text_extraction, process_fonts, extract_fonts, draw_rectangles

file_path = "C:/Users/User/OneDrive/EVERAI/WEB APP/OCR/SG BANK STATEMENT/DBS/Meat Bs.pdf"


results = []
for pagenum, page in enumerate(extract_pages(file_path)):

    # Iterate the elements that composed a page
    for element in page:

        # Check if the element is a text element
        if isinstance(element, LTTextContainer):
            result = text_extraction(element)
            results.append((pagenum, result))

font_data = extract_fonts(file_path)

print(font_data)

[{'/R31': 'YGCBRD+OpenSans-BoldItalic', '/R27': 'MCCQYA+OpenSans-Regular', '/R35': 'OWTPJK+OpenSans-Bold'}, {'/R27': 'MCCQYA+OpenSans-Regular', '/R35': 'OWTPJK+OpenSans-Bold'}, {'/R27': 'MCCQYA+OpenSans-Regular', '/R35': 'OWTPJK+OpenSans-Bold'}, {'/R27': 'MCCQYA+OpenSans-Regular', '/R35': 'OWTPJK+OpenSans-Bold'}, {'/R27': 'MCCQYA+OpenSans-Regular', '/R35': 'OWTPJK+OpenSans-Bold'}, {'/R27': 'MCCQYA+OpenSans-Regular', '/R35': 'OWTPJK+OpenSans-Bold'}, {'/R173': 'YGCBRD+OpenSans-BoldItalic', '/R170': 'MCCQYA+OpenSans-Regular', '/R175': 'OWTPJK+OpenSans-Bold'}, {'/R170': 'MCCQYA+OpenSans-Regular', '/R175': 'OWTPJK+OpenSans-Bold'}, {'/R170': 'MCCQYA+OpenSans-Regular', '/R175': 'OWTPJK+OpenSans-Bold'}, {'/R170': 'MCCQYA+OpenSans-Regular', '/R175': 'OWTPJK+OpenSans-Bold'}, {'/R170': 'MCCQYA+OpenSans-Regular', '/R175': 'OWTPJK+OpenSans-Bold'}, {'/R170': 'MCCQYA+OpenSans-Regular', '/R175': 'OWTPJK+OpenSans-Bold'}, {'/R312': 'YGCBRD+OpenSans-BoldItalic', '/R309': 'WHHKAL+OpenSans-Regular', '/R314

In [None]:
def check_and_fill(df_list):
    df = pd.concat(df_list, ignore_index=True)
    AMOUNT_REGEX = r'\.\d{2}'
    
    for i in range(1, len(df) - 1):
        try:
            # Check if first column is empty string and any of columns 2, 3, 4 match the regex
            if df.iloc[i, 0] == "" and df.iloc[i, 1] != "Balance from previous statement" and any(isinstance(df.iloc[i, col], str) and 
                                           re.search(AMOUNT_REGEX, str(df.iloc[i, col])[-3:]) 
                                           for col in [2, 3, 4]):
                
                matching_columns = [col for col in [2, 3, 4] 
                                    if isinstance(df.iloc[i, col], str) and 
                                    re.search(AMOUNT_REGEX, str(df.iloc[i, col])[-3:])]
                print(matching_columns)
                if matching_columns:
                    prev_amt_check = all(pd.isna(df.iloc[i-1, j]) or df.iloc[i-1, j] == "" for j in matching_columns)
                    next_amt_check = all(pd.isna(df.iloc[i+1, j]) or df.iloc[i+1, j] == "" for j in matching_columns)
                    prev_date_check = all(df.iloc[i-1, j] != "" for j in [0,1] if pd.notna(df.iloc[i-1, j]))
                    next_date_check = all(df.iloc[i+1, j] != "" for j in [0,1] if pd.notna(df.iloc[i+1, j]))
                    
                    if next_amt_check and next_date_check:
                        for col in matching_columns:
                            df.iloc[i+1, col] = df.iloc[i, col]
                            df.iloc[i, col] = ""
                    elif prev_amt_check and prev_date_check:
                        for col in matching_columns:
                            df.iloc[i-1, col] = df.iloc[i, col]
                            df.iloc[i, col] = ""
        
        except Exception as e:
            print(f"Error processing row {i}: {str(e)}")
            print(f"Row data: {df.iloc[i]}")
    
    return df

In [38]:
import pdfplumber
import pandas as pd
import re

def split_rows(row):
    # Split the row by '\n' if any element contains '\n'
    if any(isinstance(val, str) and '\n' in val for val in row):
        split_values = [str(val).split('\n') if isinstance(val, str) else [val] for val in row]
        # Determine the maximum length of sublists in the original data
        max_length = max(len(sublist) for sublist in split_values)
        # Fill shorter sublists with empty strings to match the maximum length
        filled_data = [sublist + [''] * (max_length - len(sublist)) for sublist in split_values]
        # Use zip to transpose the list and then convert it to a list of lists
        restructured_data = [list(row) for row in zip(*filled_data)]
        return restructured_data
    else:
        return [list(row)]

pdf = pdfplumber.open("C:\\Users\\User\\Downloads\\HLB_Receipt_166593[1].pdf")

df_list = []  # Use a list to store DataFrames

for i in range(len(pdf.pages)):
    page = pdf.pages[i]
    table = page.extract_table(table_settings={"horizontal_strategy": "text"})

    if table:
        table_df = pd.DataFrame(table[2:])
        if len(table_df.columns) == 5:
            for _, row in table_df.iterrows():
                split_row_list = split_rows(row)
                for split_row in split_row_list:
                    df_list.append(pd.DataFrame([split_row], columns=table_df.columns))

# Concatenate all DataFrames in the list into a single DataFrame
all_data_df = pd.concat(df_list, ignore_index=True)

# Create boolean masks for non-null, non-empty string checks
mask_col3_valid = all_data_df[all_data_df.columns[2]].notnull() & (all_data_df[all_data_df.columns[2]].str.strip() == '')
mask_col4_valid = all_data_df[all_data_df.columns[3]].notnull() & (all_data_df[all_data_df.columns[3]].str.strip() == '')

# Apply the masks to filter the DataFrame
filtered_df = all_data_df[mask_col3_valid & mask_col4_valid]

# Define the regex pattern for the date
DATE_REGEX = r'\d{2}-\d{2}-\d{4}'

# Filter rows based on the date pattern in the first column
date_filtered_df = filtered_df[filtered_df[filtered_df.columns[0]].str.match(DATE_REGEX, na=False)]

# Select only the first two columns
first_two_columns_df = date_filtered_df.iloc[:, :2]

# Convert the selected columns to a list of strings
list_of_strings = first_two_columns_df.apply(lambda row: ' '.join(row.astype(str)), axis=1).tolist()


print(list_of_strings)


['31-01-2024 Fund Trf fr CA to CA-Internet']


In [39]:
with pdfplumber.open("C:\\Users\\User\\Downloads\\HLB_Receipt_166593[1].pdf") as pdf:
    bal = []
    text = ""
    for page in pdf.pages:
        text = f'{text} \n{page.extract_text()}'
alltext = text.split('\n')
print(alltext)

[' ', '1505ae80d0edaecbec7d67087934f83e', 'CURRENT ACCOUNT-i STATEMENT', 'Protected by PIDM up to RM250,000 for each depositor.', 'Dilindungi oleh PIDM setakat RM250,000 bagi setiap pendeposit.', 'STM', 'Page No / No Mukasurat : 1 of 11', 'PASARAYA UPSANA', 'Date / Tarikh : 13-02-2024', 'PT 1618 JALAN TERUNTUM 12/KU8 BATU 5 1/2', 'A/C No /No Akaun : 34901008042 MYR', 'JALAN MERU Statement Period / : 14/01/24 - 13/02/24', '41050 KLANG Tempoh Penyataan', 'Branch / Cawangan : SETIA ALAM, SHAH ALAM', 'Tel No / No Tel : 03-3344 6888', 'Date Transaction Description Deposit Withdrawal Balance', 'Tarikh Deskripsi Transaksi Simpanan Pengeluaran Baki', 'Balance from previous statement 16,254.95', '14-01-2024 HLConnect DuitNow-previously Inst 1,200.00', 'Fund transfer', 'NORLIZA BINTI ARIF@N', '20240114HLBBMYKL010ORM43662012', '14-01-2024 HLConnect DuitNow-previously Inst 1,655.00', 'Fund transfer', 'NORLIZA BINTI ARIF@N', '20240114HLBBMYKL010ORM44453434', '14-01-2024 Fund Trf fr CA to CA-Interne

In [42]:
for s in list_of_strings:
    for r in alltext:
        if s in r:
            print(r)
            amt = r.split(" ")[-1]
            print(amt)

31-01-2024 Fund Trf fr CA to CA-Internet 8,164.00
8,164.00


In [44]:
import pdfplumber
import pandas as pd
import re

def split_rows(row):
    if any(isinstance(val, str) and '\n' in val for val in row):
        split_values = [str(val).split('\n') if isinstance(val, str) else [val] for val in row]
        max_length = max(len(sublist) for sublist in split_values)
        filled_data = [sublist + [''] * (max_length - len(sublist)) for sublist in split_values]
        restructured_data = [list(row) for row in zip(*filled_data)]
        return restructured_data
    else:
        return [list(row)]

pdf_path = "C:\\Users\\User\\Downloads\\HLB_Receipt_166593[1].pdf"
pdf = pdfplumber.open(pdf_path)

df_list = []

for i in range(len(pdf.pages)):
    page = pdf.pages[i]
    table = page.extract_table(table_settings={"horizontal_strategy": "text"})

    if table:
        table_df = pd.DataFrame(table[2:])
        if len(table_df.columns) == 5:
            for _, row in table_df.iterrows():
                split_row_list = split_rows(row)
                for split_row in split_row_list:
                    df_list.append(pd.DataFrame([split_row], columns=table_df.columns))

all_data_df = pd.concat(df_list, ignore_index=True)

mask_col3_valid = all_data_df[all_data_df.columns[2]].notnull() & (all_data_df[all_data_df.columns[2]].str.strip() == '')
mask_col4_valid = all_data_df[all_data_df.columns[3]].notnull() & (all_data_df[all_data_df.columns[3]].str.strip() == '')

filtered_df = all_data_df[mask_col3_valid & mask_col4_valid]

DATE_REGEX = r'\d{2}-\d{2}-\d{4}'
date_filtered_df = filtered_df[filtered_df[filtered_df.columns[0]].str.match(DATE_REGEX, na=False)]

first_two_columns_df = date_filtered_df.iloc[:, :2]
list_of_strings = first_two_columns_df.apply(lambda row: ' '.join(row.astype(str)), axis=1).tolist()

def find_surrounding_rows(s, all_data_df):
    for idx, row in all_data_df.iterrows():
        row_str = ' '.join(row.astype(str))
        if s in row_str:
            prev_row = all_data_df.iloc[idx - 1] if idx > 0 else None
            current_row = row
            next_row = all_data_df.iloc[idx + 1] if idx < len(all_data_df) - 1 else None
            return prev_row, current_row, next_row, idx
    return None, None, None, None

# Find the specific value and update the current row
value_to_find = amt
for s in list_of_strings:
    prev_row, current_row, next_row, current_idx = find_surrounding_rows(s, all_data_df)
    if prev_row is not None and current_row is not None:
        # Find the row containing the value "8,164.00"
        value_row_idx = all_data_df.apply(lambda row: row.astype(str).str.contains(value_to_find).any(), axis=1)
        value_row_idx = all_data_df[value_row_idx].index

        if len(value_row_idx) > 0:
            # Assuming you want the first occurrence
            value_row_idx = value_row_idx[0]

            # Update the current row with the found value
            updated_row = all_data_df.iloc[value_row_idx]
            all_data_df.iloc[current_idx] = updated_row

            # Remove the value from the rows (set to empty or NaN)
            all_data_df.replace(value_to_find, '', inplace=True)
            
            print(f"Previous row:\n{prev_row}")
            print(f"Updated current row:\n{all_data_df.iloc[current_idx]}")
            print(f"Next row:\n{next_row}")


Previous row:
0     30-01-2024
1    CDM Deposit
2         210.00
3               
4      28,437.83
Name: 283, dtype: object
Updated current row:
0                       15-01-2024
1    Fund Trf fr CA to CA-Internet
2                                 
3                                 
4                                 
Name: 284, dtype: object
Next row:
0                 
1    Fund transfer
2                 
3                 
4                 
Name: 285, dtype: object


In [54]:
import pdfplumber
import pandas as pd
import re

def split_rows(row):
    if any(isinstance(val, str) and '\n' in val for val in row):
        split_values = [str(val).split('\n') if isinstance(val, str) else [val] for val in row]
        max_length = max(len(sublist) for sublist in split_values)
        filled_data = [sublist + [''] * (max_length - len(sublist)) for sublist in split_values]
        restructured_data = [list(row) for row in zip(*filled_data)]
        return restructured_data
    else:
        return [list(row)]

pdf_path = "C:\\Users\\User\\Downloads\\HLB_Receipt_166593[1].pdf"
pdf = pdfplumber.open(pdf_path)

df_list = []

for i in range(len(pdf.pages)):
    page = pdf.pages[i]
    table = page.extract_table(table_settings={"horizontal_strategy": "text"})

    if table:
        table_df = pd.DataFrame(table[2:])
        if len(table_df.columns) == 5:
            for _, row in table_df.iterrows():
                split_row_list = split_rows(row)
                for split_row in split_row_list:
                    df_list.append(pd.DataFrame([split_row], columns=table_df.columns))

all_data_df = pd.concat(df_list, ignore_index=True)

mask_col3_valid = all_data_df[all_data_df.columns[2]].notnull() & (all_data_df[all_data_df.columns[2]].str.strip() == '')
mask_col4_valid = all_data_df[all_data_df.columns[3]].notnull() & (all_data_df[all_data_df.columns[3]].str.strip() == '')

filtered_df = all_data_df[mask_col3_valid & mask_col4_valid]

DATE_REGEX = r'\d{2}-\d{2}-\d{4}'
date_filtered_df = filtered_df[filtered_df[filtered_df.columns[0]].str.match(DATE_REGEX, na=False)]

first_two_columns_df = date_filtered_df.iloc[:, :2]
list_of_strings = first_two_columns_df.apply(lambda row: ' '.join(row.astype(str)), axis=1).tolist()

def find_surrounding_rows(s, all_data_df):
    for idx, row in all_data_df.iterrows():
        row_str = ' '.join(row.astype(str))
        if s in row_str:
            prev_row = all_data_df.iloc[idx - 1] if idx > 0 else None
            current_row = row
            next_row = all_data_df.iloc[idx + 1] if idx < len(all_data_df) - 1 else None
            return prev_row.tolist(), current_row.tolist(), next_row.tolist(), idx
    return None, None, None, None

value_to_find = "8,164.00"
for s in list_of_strings:
    prev_row, current_row, next_row, current_idx = find_surrounding_rows(s, all_data_df)
    print(prev_row)
    print(next_row)
    if prev_row is not None:
        try:
            # Get the index of value_to_find in prev_row
            value_index = prev_row.index(value_to_find)
            print(f"Value '{value_to_find}' found at index {value_index} in previous row.")
            prev_row[value_index] = ''
            current_row[value_index] = value_to_find
        except ValueError:
            print(f"Value '{value_to_find}' not found in previous row.")
    
    if next_row is not None:
        try:
            # Get the index of value_to_find in next_row
            value_index = next_row.index(value_to_find)
            print(f"Value '{value_to_find}' found at index {value_index} in next row.")
            next_row[value_index] = ''
            current_row[value_index] = value_to_find
        except ValueError:
            print(f"Value '{value_to_find}' not found in next row.")
print(prev_row, current_row, next_row)

if prev_row is not None:
    all_data_df.iloc[current_idx - 1] = prev_row
all_data_df.iloc[current_idx] = current_row
if next_row is not None:
    all_data_df.iloc[current_idx + 1] = next_row

all_data_df.to_csv('test2.csv')

['30-01-2024', 'CDM Deposit', '210.00', '8,164.00', '28,437.83']
['', 'Fund transfer', '', '', '']
Value '8,164.00' found at index 3 in previous row.
Value '8,164.00' not found in next row.
['30-01-2024', 'CDM Deposit', '210.00', '', '28,437.83'] ['31-01-2024', 'Fund Trf fr CA to CA-Internet', '', '8,164.00', ''] ['', 'Fund transfer', '', '', '']


In [65]:
import pdfplumber
import pandas as pd
import re

def split_rows(row):
    if any(isinstance(val, str) and '\n' in val for val in row):
        split_values = [str(val).split('\n') if isinstance(val, str) else [val] for val in row]
        max_length = max(len(sublist) for sublist in split_values)
        filled_data = [sublist + [''] * (max_length - len(sublist)) for sublist in split_values]
        restructured_data = [list(row) for row in zip(*filled_data)]
        return restructured_data
    else:
        return [list(row)]

pdf_path = "C:\\Users\\User\\Downloads\\HLB_Receipt_166593[1].pdf"
pdf = pdfplumber.open(pdf_path)

df_list = []

for i in range(len(pdf.pages)):
    page = pdf.pages[i]
    table = page.extract_table(table_settings={"horizontal_strategy": "text"})

    if table:
        table_df = pd.DataFrame(table[2:])
        if len(table_df.columns) == 5:
            for _, row in table_df.iterrows():
                split_row_list = split_rows(row)
                for split_row in split_row_list:
                    df_list.append(pd.DataFrame([split_row], columns=table_df.columns))

all_data_df = pd.concat(df_list, ignore_index=True)

mask_col3_valid = all_data_df[all_data_df.columns[2]].notnull() & (all_data_df[all_data_df.columns[2]].str.strip() == '')
mask_col4_valid = all_data_df[all_data_df.columns[3]].notnull() & (all_data_df[all_data_df.columns[3]].str.strip() == '')

filtered_df = all_data_df[mask_col3_valid & mask_col4_valid]

DATE_REGEX = r'\d{2}-\d{2}-\d{4}'
date_filtered_df = filtered_df[filtered_df[filtered_df.columns[0]].str.match(DATE_REGEX, na=False)]

first_two_columns_df = date_filtered_df.iloc[:, :2]
list_of_strings = first_two_columns_df.apply(lambda row: ' '.join(row.astype(str)), axis=1).tolist()

with pdfplumber.open(pdf_path) as pdf:
    alltext = []
    for page in pdf.pages:
        alltext.extend(page.extract_text().split('\n'))

# Find value_to_find in alltext
value_to_find = None
for s in list_of_strings:
    for r in alltext:
        if s in r:
            print(f"Matching text: {r}")
            value_to_find = r.split(" ")[-1]
            print(f"Value to find: {value_to_find}")
            if value_to_find:
                prev_row, current_row, next_row, current_idx = find_surrounding_rows(s, all_data_df)
                
                if prev_row is not None:
                    try:
                        # Get the index of value_to_find in prev_row
                        value_index = prev_row.index(value_to_find)
                        prev_row[value_index] = ''
                        current_row[value_index] = value_to_find
                    except ValueError:
                        print(f"Value '{value_to_find}' not found in previous row.")
                
                if next_row is not None:
                    try:
                        # Get the index of value_to_find in next_row
                        value_index = next_row.index(value_to_find)
                        next_row[value_index] = ''
                        current_row[value_index] = value_to_find
                    except ValueError:
                        print(f"Value '{value_to_find}' not found in next row.")
                
                # Update the DataFrame with modified rows
                if prev_row is not None:
                    all_data_df.iloc[current_idx - 1] = prev_row
                all_data_df.iloc[current_idx] = current_row
                if next_row is not None:
                    all_data_df.iloc[current_idx + 1] = next_row
                all_data_df.to_csv('test2.csv')
                print("Updated list_of_strings:", list_of_strings)
            else:
                print("Value to find not found.")


Matching text: 31-01-2024 Fund Trf fr CA to CA-Internet 8,164.00
Value to find: 8,164.00
Value '8,164.00' not found in next row.
Updated list_of_strings: ['31-01-2024 Fund Trf fr CA to CA-Internet']
