<a href="https://colab.research.google.com/github/Coderdivine/UniqueWordDataset/blob/main/BankDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import os
import pandas as pd

def extract_fields_from_xlsx(filename):
    # Read all sheets of the Excel file into a dictionary of pandas DataFrames
    all_sheets = pd.read_excel(filename, sheet_name=None)

    # Initialize variables to store extracted fields
    reference = []
    remarks = []
    transaction_details = []
    ref_no = []
    Details = []

    for sheet_name, df in all_sheets.items():
        # Iterate over the columns in the DataFrame
        print(f"*************** First Files {sheet_name} ***************")
        for column in df.columns:
            # Remove leading/trailing spaces and convert column name to lowercase
            cleaned_column = column.strip().lower()

            # Check if the cleaned column header matches the desired fields
            if 'reference' in cleaned_column:
                reference.extend(df[column].tolist())
            elif 'remarks' in cleaned_column:
                remarks.extend(df[column].tolist())
            elif 'transaction details' in cleaned_column:
                transaction_details.extend(df[column].tolist())
            elif 'ref no' in cleaned_column:
                ref_no.extend(df[column].tolist())
            elif 'details' in cleaned_column:
                Details.extend(df[column].tolist())


    # Return the extracted fields as a dictionary
    extracted_fields = {
        'Reference': reference,
        'Remarks': remarks,
        'Transaction Details': transaction_details,
        'Ref No': ref_no,
        'Details':Details
    }

    return extracted_fields

# Specify the path to the folder containing the XLSX files
folder_path = '/content/BankDatasets'

# Iterate over the files in the folder
for filename in os.listdir(folder_path):
    # Check if the file is an XLSX file
    if filename.endswith('.xlsx'):
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)

        # Call the function to extract the fields
        extracted_data = extract_fields_from_xlsx(file_path)
        #print(extracted_data)

        # Access the extracted fields
        reference_list = extracted_data['Reference']
        remarks_list = extracted_data['Remarks']
        transaction_details_list = extracted_data['Transaction Details']
        ref_no_list = extracted_data['Ref No']
        details = extracted_data["Details"]
        print(extracted_data)

        # Process the extracted fields as needed
        # ...

        # Print or perform any other operations
        print(f"Processed file: {filename}")


*************** First Files Zenith Collections ***************
*************** First Files Bank Stmt ***************
{'Reference': [], 'Remarks': [], 'Transaction Details': [], 'Ref No': [], 'Details': []}
Processed file: Zenith - July (1).xlsx
*************** First Files Fidelity Collections ***************
*************** First Files Bank Stmt ***************
{'Reference': [], 'Remarks': [], 'Transaction Details': [], 'Ref No': [], 'Details': ['TAS255054/19612/ OZB COMMUNICATION', 'STAMP DUTY CHARGE - M15528 - 5020112970', 'TAS255149/19612/ OZB COMMUNICATION', 'STAMP DUTY CHARGE - M15787 - 5020112970', 'TAS255181/19612/ OZB COMMUNICATION', 'STAMP DUTY CHARGE - M16066 - 5020112970', 'TAS255203/19612/ OZB COMMUNICATION', 'STAMP DUTY CHARGE - M16218 - 5020112970', '19473/TAS255596/ZEALSTAR NIGERIA LIMITED', 'STAMP DUTY CHARGE - M30930 - 5020112970', 'COB TRANSFER FROM MARYABIK G **0272 C57792 19583MA', 'COB TRANSFER FROM GEETEE ROY **2077 C5782419833gee', 'TAS255180/19612/ OZB COMMUNICA

In [16]:
import pandas as pd
import numpy as np
import json

# Assuming you have the extracted fields stored in separate lists
transaction_details_list = extracted_data['Transaction Details']
reference_list = extracted_data['Reference']

# Create a DataFrame from the lists
df = pd.DataFrame({'sentence': transaction_details_list, 'reference': reference_list})
#df = pd.DataFrame({'transaction_details': transaction_details_list, 'reference': reference_list})

# Remove np.nan values and replace them with empty strings in transaction_details and reference columns
df['sentence'].fillna('', inplace=True)
df['reference'].fillna('', inplace=True)

# Initialize an empty list to store the unique patterns
unique_patterns = []

# Iterate over the DataFrame rows
for index, row in df.iterrows():
    reference = row['reference']
    transaction_details = row['sentence']

    # Check if the previous and current transaction_details are different
    if index > 0 and transaction_details != df.loc[index - 1, 'sentence']:
        # Check the type of the transaction_details
        if isinstance(df.loc[index - 1, 'sentence'], str) and isinstance(transaction_details, str):
            # Find unique patterns between the previous and current transaction_details
            unique_words = list(set(df.loc[index - 1, 'sentence'].split()) - set(transaction_details.split()))
            if not unique_words:
                unique_words = [""]  # Set empty list to [""] if no unique words found
            unique_patterns.append({'sentence': transaction_details, 'unique_words': unique_words})
        else:
            # Handle non-string transaction_details by returning an empty string
            unique_patterns.append({'sentence': transaction_details, 'unique_words': [""]})
    else:
        # Handle the first row where there is no previous transaction_details to compare with
        if isinstance(transaction_details, str):
            unique_words = list(set(transaction_details.split()))
            if not unique_words:
                unique_words = [""]  # Set empty list to [""] if no unique words found
            unique_patterns.append({'sentence': transaction_details, 'unique_words': unique_words})

# Convert the list of unique patterns to JSON format
new_df_json = json.dumps(unique_patterns)
new_df = pd.DataFrame(unique_patterns)

# Print the JSON data
print(new_df_json)
print(new_df)


[{"sentence": "", "unique_words": [""]}, {"sentence": "ELEC MONEY TRSF LEVY-9 TXNS: 30-04-22 TO 30-04-22", "unique_words": [""]}, {"sentence": "FIP:FTS/TRF/MTN NIGERIA AUTOTOPUP SETTLEMENT", "unique_words": ["TO", "TRSF", "TXNS:", "MONEY", "ELEC", "LEVY-9", "30-04-22"]}, {"sentence": "FIP:/NELLOBYTE SYSTEMS /TAS220287 NELLOBYTE SME002", "unique_words": ["AUTOTOPUP", "NIGERIA", "FIP:FTS/TRF/MTN", "SETTLEMENT"]}, {"sentence": "FIP:/NELLOBYTE SYSTEMS /TAS220286 21214 NELLOBYTE/", "unique_words": ["SME002", "NELLOBYTE", "/TAS220287"]}, {"sentence": "FIP:/TRF/045993697775:MTN NIGERIA AUT", "unique_words": ["SYSTEMS", "NELLOBYTE/", "/TAS220286", "21214", "FIP:/NELLOBYTE"]}, {"sentence": "FT from SEAMAN VENTURES/TAS220937/19386/SEAMAN/ANA", "unique_words": ["NIGERIA", "FIP:/TRF/045993697775:MTN", "AUT"]}, {"sentence": "SWEEP BALANCE TRANSFER", "unique_words": ["FT", "VENTURES/TAS220937/19386/SEAMAN/ANA", "from", "SEAMAN"]}, {"sentence": "ELEC MONEY TRSF LEVY-5 TXNS: 02-05-22 TO 02-05-22", "un

In [None]:
!pip install scikit-multilearn transformers

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch


print(new_df['unique_words'])

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    new_df['sentence'], new_df['unique_words'], test_size=0.2, random_state=42
)

# Transforming text into numerical features using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test).toarray()

# Converting the target variable to a binary indicator matrix
mlb = MultiLabelBinarizer()
y_train_binary = mlb.fit_transform(y_train)
y_test_binary = mlb.transform(y_test)

# Fine-tuning BERT on our specific task
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length, labels):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data.iloc[index]
        text = item['sentence']
        label = self.labels[index]

        # Tokenize the text
        inputs = self.tokenizer(
            text=text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'][0],  # Access the tokenized input_ids
            'attention_mask': inputs['attention_mask'][0],  # Access the attention mask
            'labels': torch.tensor(label, dtype=torch.float)
        }

train_dataset = CustomDataset(df.iloc[X_train.index], tokenizer, max_length=128, labels=y_train_binary)
test_dataset = CustomDataset(df.iloc[X_test.index], tokenizer, max_length=128, labels=y_test_binary)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(mlb.classes_)).to(device)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Training loop for fine-tuning BERT
model.train()
for epoch in range(5):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")

# Evaluation
model.eval()
with torch.no_grad():
    y_pred_probs = []
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        y_pred_probs.extend(torch.sigmoid(logits).cpu().numpy())

    y_pred_probs = np.array(y_pred_probs)
    y_pred_binary = (y_pred_probs >= 0.5).astype(int)

# Calculating accuracy
accuracy = accuracy_score(y_test_binary, y_pred_binary)
print("Accuracy:", accuracy)

# Calculating loss (log loss)
loss = log_loss(y_test_binary, y_pred_probs)
print("Loss:", loss)


In [None]:
# Example sentence for prediction
new_sentence = "TAS247636/19650/INTERSWITCH"

# Tokenize the new sentence
inputs = tokenizer(
    text=new_sentence,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

# Move inputs to the device
input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)

# Make prediction
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predicted_probs = torch.sigmoid(logits).cpu().numpy()

# Convert probabilities to binary predictions (using threshold 0.5)
predicted_binary = (predicted_probs >= 0.5).astype(int)

# Inverse transform the binary predictions to obtain the predicted unique words
predicted_unique_words = mlb.inverse_transform(predicted_binary)

print("Predicted unique words:", predicted_unique_words)
