<a href="https://colab.research.google.com/github/Arunkarthik-K/Assessment_FinacPlus/blob/main/Assessment_XGBClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!unzip data.zip

In [2]:
import os
import pandas as pd
from bs4 import BeautifulSoup

# Path to your data
data_path = "/content/data"

# Initialize an empty list to store data
data = []

# Define keywords for each class
keywords = {
    "Income Statements": ["revenue", "net income", "expenses", "profit", "loss"],
    "Balance Sheets": ["assets", "liabilities", "equity", "accounts receivable", "accounts payable"],
    "Cash Flows": ["cash flow", "operating activities", "investing activities", "financing activities"],
    "Notes": ["footnotes", "disclosures", "note", "accounting policies", "financial statements"],
    "Others": ["miscellaneous", "other", "additional", "various"]
}

# Function to get the number of columns and rows
def get_table_dimensions(soup):
    tables = soup.find_all('table')
    if tables:
        table = tables[0]
        rows = table.find_all('tr')
        no_of_rows = len(rows)
        if rows:
            no_of_columns = len(rows[0].find_all(['th', 'td']))
        else:
            no_of_columns = 0
    else:
        no_of_rows = no_of_columns = 0
    return no_of_rows, no_of_columns

# Function to count keyword occurrences
def count_keywords(text, keywords):
    counts = {key: sum(text.count(word) for word in words) for key, words in keywords.items()}
    return counts

# Iterate through each folder
for folder in os.listdir(data_path):
    folder_path = os.path.join(data_path, folder)
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            if file_name.endswith(".html"):
                with open(file_path, 'r', encoding='utf-8') as file:
                    soup = BeautifulSoup(file, 'html.parser')
                    text = soup.get_text(separator=' ')
                    no_of_rows, no_of_columns = get_table_dimensions(soup)
                    keyword_counts = count_keywords(text, keywords)
                    data.append({
                        "text": text,
                        "file_name": file_name,
                        "folder": folder,
                        "no_of_rows": round(no_of_rows, 2),
                        "no_of_columns": round(no_of_columns, 2),
                        **keyword_counts
                    })

# Convert to DataFrame
df = pd.DataFrame(data)


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2510 entries, 0 to 2509
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               2510 non-null   object
 1   file_name          2510 non-null   object
 2   folder             2510 non-null   object
 3   no_of_rows         2510 non-null   int64 
 4   no_of_columns      2510 non-null   int64 
 5   Income Statements  2510 non-null   int64 
 6   Balance Sheets     2510 non-null   int64 
 7   Cash Flows         2510 non-null   int64 
 8   Notes              2510 non-null   int64 
 9   Others             2510 non-null   int64 
dtypes: int64(7), object(3)
memory usage: 196.2+ KB


In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load tokenizer and model for embeddings
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
embedding_model = AutoModel.from_pretrained("distilbert-base-uncased")

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embedding_model.to(device)

# Function to get embeddings
def get_embeddings(texts, batch_size=16):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = embedding_model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeddings)
    return embeddings

# Apply to the DataFrame
texts = df['text'].tolist()
df['embeddings'] = get_embeddings(texts)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2510 entries, 0 to 2509
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               2510 non-null   object
 1   file_name          2510 non-null   object
 2   folder             2510 non-null   object
 3   no_of_rows         2510 non-null   int64 
 4   no_of_columns      2510 non-null   int64 
 5   Income Statements  2510 non-null   int64 
 6   Balance Sheets     2510 non-null   int64 
 7   Cash Flows         2510 non-null   int64 
 8   Notes              2510 non-null   int64 
 9   Others             2510 non-null   int64 
 10  embeddings         2510 non-null   object
dtypes: int64(7), object(4)
memory usage: 215.8+ KB


In [7]:
from imblearn.over_sampling import SMOTE
import numpy as np

# Prepare data for SMOTE
X = np.array(df['embeddings'].tolist())
y = df['folder']

# Add no_of_rows, no_of_columns, and keyword counts to the feature set
additional_features = df[['no_of_rows', 'no_of_columns'] + list(keywords.keys())].values
X_combined = np.hstack((X, additional_features))

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_combined, y)

# Create a new DataFrame with the resampled data
resampled_df = pd.DataFrame({
    'embeddings': [embedding[:768] for embedding in X_resampled[:, :768]],
    'no_of_rows': np.ceil(X_resampled[:, 768]).astype(int),
    'no_of_columns': np.ceil(X_resampled[:, 769]).astype(int),
    **{keyword: np.ceil(X_resampled[:, 770 + i]).astype(int) for i, keyword in enumerate(keywords.keys())},
    'folder': y_resampled
})


In [9]:
from sklearn.preprocessing import LabelEncoder

# Encode labels
label_encoder = LabelEncoder()
resampled_df['folder_encoded'] = label_encoder.fit_transform(resampled_df['folder'])


In [10]:
import pickle

with open('label_model_pkl', 'wb') as files:
    pickle.dump(label_encoder, files)

In [11]:
resampled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6105 entries, 0 to 6104
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   embeddings         6105 non-null   object
 1   no_of_rows         6105 non-null   int64 
 2   no_of_columns      6105 non-null   int64 
 3   Income Statements  6105 non-null   int64 
 4   Balance Sheets     6105 non-null   int64 
 5   Cash Flows         6105 non-null   int64 
 6   Notes              6105 non-null   int64 
 7   Others             6105 non-null   int64 
 8   folder             6105 non-null   object
 9   folder_encoded     6105 non-null   int64 
dtypes: int64(8), object(2)
memory usage: 477.1+ KB


In [12]:
resampled_df.head()

Unnamed: 0,embeddings,no_of_rows,no_of_columns,Income Statements,Balance Sheets,Cash Flows,Notes,Others,folder,folder_encoded
0,"[0.006729502230882645, 0.026193059980869293, 0...",11,3,0,0,2,0,0,Cash Flow,1
1,"[-0.1101217269897461, -0.290446400642395, 0.41...",46,3,6,7,3,0,2,Cash Flow,1
2,"[-0.04318363592028618, -0.020249444991350174, ...",23,3,0,1,2,3,1,Cash Flow,1
3,"[-0.09732430428266525, 0.1360766738653183, 0.5...",32,3,0,3,0,0,1,Cash Flow,1
4,"[-0.008356556296348572, -0.07625875622034073, ...",11,3,0,0,2,0,0,Cash Flow,1


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    resampled_df[['no_of_rows', 'no_of_columns', 'Income Statements', 'Balance Sheets', 'Cash Flows', 'Notes', 'Others', 'embeddings']].values,
    resampled_df['folder_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=resampled_df['folder_encoded']
)

# Split embeddings, no_of_rows, no_of_columns, and keyword counts
X_train_embeddings = np.array([x[7] for x in X_train])
X_train_additional = np.array([[x[0], x[1], x[2], x[3], x[4], x[5], x[6]] for x in X_train])

X_test_embeddings = np.array([x[7] for x in X_test])
X_test_additional = np.array([[x[0], x[1], x[2], x[3], x[4], x[5], x[6]] for x in X_test])


In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the additional features
scaler = StandardScaler()
X_train_additional = scaler.fit_transform(X_train_additional)
X_test_additional = scaler.transform(X_test_additional)

# Combine embeddings and additional features
X_train_combined = np.hstack((X_train_embeddings, X_train_additional))
X_test_combined = np.hstack((X_test_embeddings, X_test_additional))

# Initialize the model with class_weight parameter
classifier_model = RandomForestClassifier(random_state=42, class_weight='balanced')
classifier_model_02 = GradientBoostingClassifier(random_state=42)
classifier_model_03 = XGBClassifier(random_state=42, scale_pos_weight='balanced')

# Train the classifier model
classifier_model.fit(X_train_combined, y_train)
classifier_model_02.fit(X_train_combined, y_train)
classifier_model_03.fit(X_train_combined, y_train)


Parameters: { "scale_pos_weight" } are not used.



In [16]:
with open('model_pkl', 'wb') as files:
    pickle.dump(classifier_model, files)

with open('model_02_pkl', 'wb') as files:
    pickle.dump(classifier_model_02, files)

with open('model_03_pkl', 'wb') as files:
    pickle.dump(classifier_model_03, files)

In [17]:
# load the model
load_model = pickle.load(open('model_pkl', 'rb'))
load_model_02 = pickle.load(open('model_02_pkl', 'rb'))
load_model_03 = pickle.load(open('model_03_pkl', 'rb'))

# Predictions
y_pred = load_model.predict(X_test_combined)
y_pred_02 = load_model_02.predict(X_test_combined)
y_pred_03 = load_model_03.predict(X_test_combined)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Accuracy_02:", accuracy_score(y_test, y_pred_02))
print("Accuracy_03:", accuracy_score(y_test, y_pred_03))

print("classification_report: ", classification_report(y_test, y_pred))
print("classification_report_02: ", classification_report(y_test, y_pred_02))
print("classification_report_03: ", classification_report(y_test, y_pred_03))


Accuracy: 0.9533169533169533
Accuracy_02: 0.9574119574119574
Accuracy_03: 0.9680589680589681
classification_report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       244
           1       1.00      1.00      1.00       244
           2       0.95      0.99      0.97       244
           3       0.93      0.91      0.92       245
           4       0.90      0.86      0.88       244

    accuracy                           0.95      1221
   macro avg       0.95      0.95      0.95      1221
weighted avg       0.95      0.95      0.95      1221

classification_report_02:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       244
           1       0.99      1.00      1.00       244
           2       0.97      0.99      0.98       244
           3       0.91      0.93      0.92       245
           4       0.91      0.87      0.89       244

    accuracy                           0.

In [18]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.36.0-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==1.0.1 (from gradio)
  Downloading gradio_client-1.0.1-py3-none-any.whl (318 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.1/318.1 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [19]:
import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
from bs4 import BeautifulSoup
import numpy as np
import joblib

# Load the trained classifier model
model = joblib.load('/content/model_03_pkl')

# Load tokenizer and model for embeddings
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
embedding_model = AutoModel.from_pretrained("distilbert-base-uncased")

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embedding_model.to(device)

# Load the label encoder
label_encoder_model = joblib.load('/content/label_model_pkl')

# Function to get embeddings for a single text
def get_single_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy().squeeze()

# Function to get the number of columns and rows
def get_table_dimensions(soup):
    tables = soup.find_all('table')
    if tables:
        table = tables[0]
        rows = table.find_all('tr')
        no_of_rows = len(rows)
        if rows:
            no_of_columns = len(rows[0].find_all(['th', 'td']))
        else:
            no_of_columns = 0
    else:
        no_of_rows = no_of_columns = 0
    return no_of_rows, no_of_columns

# Function to count keyword occurrences
def count_keywords(text, keywords):
    counts = [sum(text.count(word) for word in words) for words in keywords.values()]
    return counts

# Function to classify the uploaded HTML file
def classify_html_file(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text(separator=' ')
    no_of_rows, no_of_columns = get_table_dimensions(soup)

    text_embedding = get_single_embedding(text).reshape(1, -1)
    additional_features = np.array([[round(no_of_rows, 2), round(no_of_columns, 2)] + count_keywords(text, keywords)])

    # Combine embeddings and additional features
    combined_features = np.hstack((text_embedding, additional_features))

    # Predict class
    prediction = model.predict(combined_features)[0]

    # Decode class label
    predicted_class = label_encoder_model.inverse_transform([prediction])[0]

    return predicted_class

# Interface
html_file_input = gr.File(label="Upload HTML File")
output_text = gr.Textbox(label="Predicted Class")

# Create Gradio interface
gr.Interface(fn=classify_html_file, inputs=html_file_input, outputs=output_text, title="HTML File Classifier").launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://e4aad61ce8e2bf01ad.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


