<a href="https://colab.research.google.com/github/Arunkarthik-K/Assessment_FinacPlus/blob/main/Assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !unzip data.zip

In [2]:
import re
import nltk
import string
from nltk.corpus import stopwords

# Download the stopwords from nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    # Remove numbers
    text = re.sub(r'\d+(\.\d+)?', '', text)
    # Remove punctuations and symbols
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stop words
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    cleaned_text = ' '.join(filtered_words)
    return cleaned_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
import os
import pandas as pd
from bs4 import BeautifulSoup

# Path to your data
data_path = "/content/data"

# Initialize an empty list to store data
data = []

# Function to get the number of columns and rows
def get_table_dimensions(soup):
    tables = soup.find_all('table')
    if tables:
        table = tables[0]
        rows = table.find_all('tr')
        no_of_rows = len(rows)
        if rows:
            no_of_columns = len(rows[0].find_all(['th', 'td']))
        else:
            no_of_columns = 0
    else:
        no_of_rows = no_of_columns = 0
    return no_of_rows, no_of_columns

# Iterate through each folder
for folder in os.listdir(data_path):
    folder_path = os.path.join(data_path, folder)
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            if file_name.endswith(".html"):
                with open(file_path, 'r', encoding='utf-8') as file:
                    soup = BeautifulSoup(file, 'html.parser')
                    text = soup.get_text(separator=' ')
                    no_of_rows, no_of_columns = get_table_dimensions(soup)

                    # Clean the extracted text
                    # cleaned_text = clean_text(text)

                    data.append({
                        "folder": folder,
                        "file_name": file_name,
                        "no_of_rows": no_of_rows,
                        "no_of_columns": no_of_columns,
                        "text": text
                    })

# Convert to DataFrame
df = pd.DataFrame(data)


In [4]:
df.head()

Unnamed: 0,folder,file_name,no_of_rows,no_of_columns,text
0,Cash Flow,18630222_table_137.html,20,3,\n \n Particulars \n Year ended \n Year ended ...
1,Cash Flow,18661964_table_39.html,40,1,\n \n CASH FLOW STATEMENT FOR THE YEAR ENDED 3...
2,Cash Flow,18630219_table_52.html,14,1,\n \n B. CASH FLOW FROM INVESTING ACTIVITIES: ...
3,Cash Flow,18646845_table_53.html,11,3,\n \n 31 December 2017 INR in Lacs \n 31 Decem...
4,Cash Flow,18646845_table_52.html,49,1,\n \n For the year ended 31 st December 2017 ...


In [5]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load tokenizer and model for embeddings
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
embedding_model = AutoModel.from_pretrained("distilbert-base-uncased")

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embedding_model.to(device)

# Function to get embeddings
def get_embeddings(texts, batch_size=16):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = embedding_model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeddings)
    return embeddings

# Apply to the DataFrame
texts = df['text'].tolist()
df['embeddings'] = get_embeddings(texts)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
from imblearn.over_sampling import SMOTE
import numpy as np

# Prepare data for SMOTE
X = np.array(df['embeddings'].tolist())
y = df['folder']

# Add no_of_rows and no_of_columns to the feature set
additional_features = df[['no_of_rows', 'no_of_columns']].values
X_combined = np.hstack((X, additional_features))

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_combined, y)

# Create a new DataFrame with the resampled data
resampled_df = pd.DataFrame({
    'folder': y_resampled,
    'no_of_rows': X_resampled[:, 768],
    'no_of_columns': X_resampled[:, 769],
    'embeddings': [embedding[:768] for embedding in X_resampled]
})


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    resampled_df[['no_of_rows', 'no_of_columns', 'embeddings']].values,
    resampled_df['folder'],
    test_size=0.2,
    random_state=42,
    stratify=resampled_df['folder']
)

# Split embeddings, no_of_rows, and no_of_columns
X_train_embeddings = np.array([x[2] for x in X_train])
X_train_additional = np.array([[x[0], x[1]] for x in X_train])

X_test_embeddings = np.array([x[2] for x in X_test])
X_test_additional = np.array([[x[0], x[1]] for x in X_test])


In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the additional features
scaler = StandardScaler()
X_train_additional = scaler.fit_transform(X_train_additional)
X_test_additional = scaler.transform(X_test_additional)

# Combine embeddings and additional features
X_train_combined = np.hstack((X_train_embeddings, X_train_additional))
X_test_combined = np.hstack((X_test_embeddings, X_test_additional))

# Initialize the model with class_weight parameter
classifier_model = RandomForestClassifier(random_state=42, class_weight='balanced')
classifier_model_02 = GradientBoostingClassifier(random_state=42)

# Train the classifier model
classifier_model.fit(X_train_combined, y_train)
classifier_model_02.fit(X_train_combined, y_train)

# Predictions
y_pred = classifier_model.predict(X_test_combined)
y_pred_02 = classifier_model_02.predict(X_test_combined)


# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Accuracy_02:", accuracy_score(y_test, y_pred_02))

print("classification_report: ", classification_report(y_test, y_pred))
print("classification_report_02: ", classification_report(y_test, y_pred_02))


Accuracy: 0.9385749385749386
Accuracy_02: 0.9418509418509419
classification_report:                    precision    recall  f1-score   support

  Balance Sheets       1.00      0.98      0.99       244
       Cash Flow       0.99      1.00      1.00       244
Income Statement       0.92      0.98      0.95       244
           Notes       0.88      0.92      0.90       245
          Others       0.90      0.82      0.86       244

        accuracy                           0.94      1221
       macro avg       0.94      0.94      0.94      1221
    weighted avg       0.94      0.94      0.94      1221

classification_report_02:                    precision    recall  f1-score   support

  Balance Sheets       1.00      0.98      0.99       244
       Cash Flow       0.99      1.00      1.00       244
Income Statement       0.94      0.98      0.96       244
           Notes       0.87      0.93      0.90       245
          Others       0.91      0.82      0.86       244

        accur

In [16]:
import pickle

with open('model_pkl', 'wb') as files:
    pickle.dump(classifier_model, files)

with open('model_02_pkl', 'wb') as files:
    pickle.dump(classifier_model_02, files)

In [11]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.36.0-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==1.0.1 (from gradio)
  Downloading gradio_client-1.0.1-py3-none-any.whl (318 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.1/318.1 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [17]:
import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
from bs4 import BeautifulSoup
import numpy as np
import joblib

# Load the trained classifier model
model = joblib.load('/content/model_02_pkl')

# Load tokenizer and model for embeddings
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
embedding_model = AutoModel.from_pretrained("distilbert-base-uncased")

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embedding_model.to(device)

# Function to get embeddings for a single text
def get_single_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy().squeeze()

# Function to get the number of columns and rows
def get_table_dimensions(soup):
    tables = soup.find_all('table')
    if tables:
        table = tables[0]
        rows = table.find_all('tr')
        no_of_rows = len(rows)
        if rows:
            no_of_columns = len(rows[0].find_all(['th', 'td']))
        else:
            no_of_columns = 0
    else:
        no_of_rows = no_of_columns = 0
    return no_of_rows, no_of_columns

# Function to classify the uploaded HTML file
def classify_html_file(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text(separator=' ')
    no_of_rows, no_of_columns = get_table_dimensions(soup)
    embedding = get_single_embedding(text).reshape(1, -1)
    additional_features = np.array([[no_of_rows, no_of_columns]])
    additional_features = scaler.transform(additional_features)
    combined_features = np.hstack((embedding, additional_features))
    prediction = model.predict(combined_features)
    return prediction[0]

# Create the Gradio interface
iface = gr.Interface(
    fn=classify_html_file,
    inputs=gr.File(label="Upload an HTML file"),
    outputs=gr.Textbox(label="Predicted Category")
)

# Launch the Gradio app
iface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://9e72d2e8a526219df9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


