<a href="https://colab.research.google.com/github/GDharan10/Project8_ClassificationModelForFinancialStatements/blob/main/Classification_Model_For_Financial_Statementsipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Libraries**

In [67]:
import os
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Data collection**

In [38]:
# Function to clean text
def clean_text(text):
    # Remove non-alphanumeric characters, except spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove lowercase letters if they appear separately
    text = re.sub(r'\b[a-z]\b', '', text)
    # Remove uppercase letters if they appear separately
    text = re.sub(r'\b[A-Z]\b', '', text)
    # Remove Roman numerals
    roman_numerals = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'IX', 'X', 'XIII', 'XIV', 'XV', 'XVI', 'XVII',
                      'XVIII', 'XIX', 'XX', 'XXI', 'XXII', 'XXIII', 'XXIV', 'XXV', 'XXVI', 'i', 'ii', 'iii',
                      'iv', 'v', 'vi']
    for numeral in roman_numerals:
        text = re.sub(r'\b{}\b'.format(numeral), '', text)
    # Remove extra spaces
    cleaned_text = re.sub(r'\s+', ' ', text)
    return cleaned_text.strip()

In [39]:
# Folder path containing HTML files
folder_path = "/content/drive/MyDrive/Task/FinacPlus/data/data/Balance Sheets"

# Initialize an empty list to store DataFrames
dfs_list = []

# Process each HTML file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".html"):
        file_path = os.path.join(folder_path, filename)
        # Read HTML tables into a list of DataFrame objects
        dfs = pd.read_html(file_path)
        # Assuming the desired table is the first one
        table_df = dfs[0]
        # Extract text from the first two columns of the first table
        extracted_text = ' '.join(str(cell) for cell in table_df.iloc[:, :2].values.flatten() if pd.notnull(cell))
        # Clean the extracted text
        cleaned_text = clean_text(extracted_text)
        # Create a DataFrame with cleaned text and document name
        df = pd.DataFrame({'Extracted_text': [cleaned_text], 'document_name': "Balance Sheets"})
        # Append the DataFrame to the list
        dfs_list.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
BalanceSheets_df = pd.concat(dfs_list, ignore_index=True)

# Display the final DataFrame
print(BalanceSheets_df)


                                        Extracted_text   document_name
0    SI Particulars No No ASSETS NonCurrent Assets ...  Balance Sheets
1    STANDALONE March Particulars Audited ASSETS No...  Balance Sheets
2    GRINDWELL NORTON LIMITED Statement of Standalo...  Balance Sheets
3    ASSETS NonCurrent Assets Property Plant and Eq...  Balance Sheets
4    SIGNED FOR IDENTIFICATIO CD CD SRBCCO LLP SRBC...  Balance Sheets
..                                                 ...             ...
265  AUDITED CONSOLIDATED STATEMENT OF ASSETS AND L...  Balance Sheets
266  Notes to standalone ml AS financial results fo...  Balance Sheets
267  Sr No Particulars ASSETS NonCurrent Assets Pro...  Balance Sheets
268  Particulars Particulars ASSETS Noncurrent asse...  Balance Sheets
269  Sr No Particulars Assets NonCurrent Assets Pro...  Balance Sheets

[270 rows x 2 columns]


In [40]:
# Define the folder path
folder_path = "/content/drive/MyDrive/Task/FinacPlus/data/data/Cash Flow"

# Initialize an empty list to store DataFrames
dfs_list = []

# Process each HTML file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".html"):
        # Read HTML file into a DataFrame
        table_df = pd.read_html(os.path.join(folder_path, filename))[0]

        # Extract text from the first two columns of the first table
        extracted_text = ' '.join(str(cell) for cell in table_df.iloc[:, :2].values.flatten() if pd.notnull(cell))

        # Clean the extracted text
        cleaned_text = clean_text(extracted_text)

        # Create a DataFrame with the extracted text and document name
        new_df = pd.DataFrame({'Extracted_text': [cleaned_text], 'document_name': "CashFlow"})

        # Append the DataFrame to the list
        dfs_list.append(new_df)

# Concatenate all DataFrames in the list into a single DataFrame
CashFlow_df = pd.concat(dfs_list, ignore_index=True)

# Display the final DataFrame
print(CashFlow_df)

                                       Extracted_text document_name
0   PARTICULARS As on Net profit before tax ADD De...      CashFlow
1   Cash flows from financing activities Decrease ...      CashFlow
2   Year ended st March CASH FLOW FROM OPERATING A...      CashFlow
3   DESCRIPTION DESCRIPTION Cash Inflow Outflow Fr...      CashFlow
4   INR in Crores Particulars For the year ended M...      CashFlow
5   Particulars Particulars Cash Flow from Operati...      CashFlow
6   Cash flow from operating activities Profit bef...      CashFlow
7   CASH FLOW FROM OPERATING ACTIVITIES Net Profit...      CashFlow
8   Cash Flow from Operating Activities Net Profit...      CashFlow
9   Particulars Year ended December Cash flow From...      CashFlow
10  Particulars Cash Flow from operating activitie...      CashFlow
11  As at December CASH FLOW FROM FINANCING ACTIVI...      CashFlow
12  CASH FLOW FROM INVESTING ACTIVITIES CASH FLOW ...      CashFlow
13  Year ended st March CASH FLOW FROM FINANCING

In [41]:
# Folder path
folder_path = "/content/drive/MyDrive/Task/FinacPlus/data/data/Income Statement"

# Initialize an empty list to store dataframes
dfs = []

# Iterate over all files in the folder
for filename in os.listdir(folder_path):
    # Check if the file is an HTML file
    if filename.endswith(".html"):
        # Read HTML file into a DataFrame
        table_df = pd.read_html(os.path.join(folder_path, filename))[0]

        # Extract text from the first two columns of the first table
        extracted_text = ' '.join(str(cell) for cell in table_df.iloc[:, :2].values.flatten() if pd.notnull(cell))

        # Clean the extracted text
        cleaned_text = clean_text(extracted_text)

        # Create a new DataFrame
        new_df = pd.DataFrame({'Extracted_text': [cleaned_text], 'document_name': "Income Statement"})

        # Append the new DataFrame to the list
        dfs.append(new_df)

# Concatenate all DataFrames in the list
IncomeStatement_df = pd.concat(dfs, ignore_index=True)

# Display the final DataFrame
print(IncomeStatement_df)

                                        Extracted_text     document_name
0    Particulars Income Revenue from Operations Oth...  Income Statement
1    Sr Particular No Income Revenue from operation...  Income Statement
2    Particulars Income Income from operations refe...  Income Statement
3    QUvfttr muled Particulars wined Rrrfi Income R...  Income Statement
4    Particulars Audited Particulars Consolidated R...  Income Statement
..                                                 ...               ...
300  SI Particulars No Particulars Revenue from ope...  Income Statement
301  KPlT Technologies Limited Registered Corporate...  Income Statement
302  Finolex Cables Limited Registered Office Mumba...  Income Statement
303  QFV Revenue Established Service Areas Revenue ...  Income Statement
304  Hi Hn Hi Hn member of member of Registered Off...  Income Statement

[305 rows x 2 columns]


In [42]:
# Define the folder path
folder_path = "/content/drive/MyDrive/Task/FinacPlus/data/data/Notes"

# Initialize an empty list to store DataFrames
dfs_list = []

# Process each HTML file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".html"):
        # Read HTML file into a DataFrame
        table_df = pd.read_html(os.path.join(folder_path, filename))[0]

        # Extract text from the first two columns of the first table
        extracted_text = ' '.join(str(cell) for cell in table_df.iloc[:, :2].values.flatten() if pd.notnull(cell))

        # Clean the extracted text
        cleaned_text = clean_text(extracted_text)

        # Create a DataFrame with the extracted text and document name
        new_df = pd.DataFrame({'Extracted_text': [cleaned_text], 'document_name': "Notes"})

        # Append the DataFrame to the list
        dfs_list.append(new_df)

# Concatenate all DataFrames in the list into a single DataFrame
Notes_df = pd.concat(dfs_list, ignore_index=True)

# Display the final DataFrame
print(Notes_df)

                                        Extracted_text document_name
0    Particulars Year ended December Balance at beg...         Notes
1    As at st March Salaries wages bonus and other ...         Notes
2    in Crores in Crores Refer Note Financial liabi...         Notes
3    As at Rs Ps Profit After Tax Cr Weighted Avera...         Notes
4    No Particulars No Particulars RESERVES AND SUR...         Notes
..                                                 ...           ...
685  Freehold land Cost Cost At April Addition on a...         Notes
686  December INR in Lacs Excise duty on sale of goods         Notes
687  Particulars December Profit for the year Rs in...         Notes
688  NOTE OTHER FINANCIAL ASSETS LONG TERM NONCURRE...         Notes
689  Cash and cash equivalents Cash and cash equiva...         Notes

[690 rows x 2 columns]


In [43]:
# Define the folder path
folder_path = "/content/drive/MyDrive/Task/FinacPlus/data/data/Others"

# Initialize an empty list to store DataFrames
dfs_list = []

# Process each HTML file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".html"):
        # Read HTML file into a DataFrame
        table_df = pd.read_html(os.path.join(folder_path, filename))[0]

        # Extract text from the first two columns of the first table
        extracted_text = ' '.join(str(cell) for cell in table_df.iloc[:, :2].values.flatten() if pd.notnull(cell))

        # Clean the extracted text
        cleaned_text = clean_text(extracted_text)

        # Create a DataFrame with the extracted text and document name
        new_df = pd.DataFrame({'Extracted_text': [cleaned_text], 'document_name': "Others"})

        # Append the DataFrame to the list
        dfs_list.append(new_df)

# Concatenate all DataFrames in the list into a single DataFrame
Others_df = pd.concat(dfs_list, ignore_index=True)

# Display the final DataFrame
print(Others_df)

                                         Extracted_text document_name
0     Consolidated Results Consolidated Results Year...        Others
1                                            QFY QFY RS        Others
2     Audit Qualification each audit qualification s...        Others
3     The consolidated financial results of Edelweis...        Others
4     Subsidiaries Subsidiaries Wipro LLC Wipro Gall...        Others
...                                                 ...           ...
1219  Exceptional Items continuing operations Amount...        Others
1220  Particulars Quarter ended Particulars Revenue ...        Others
1221  yL Firms Registration Number Rajiv Singhi Part...        Others
1222    Symbol Typeof security BSE Equity NSE GRINDWELL        Others
1223  Quarter Ended Particulars Mar Total revenues E...        Others

[1224 rows x 2 columns]


In [44]:
df = pd.concat([BalanceSheets_df, CashFlow_df, IncomeStatement_df, Notes_df, Others_df], ignore_index=True)
df

Unnamed: 0,Extracted_text,document_name
0,SI Particulars No No ASSETS NonCurrent Assets ...,Balance Sheets
1,STANDALONE March Particulars Audited ASSETS No...,Balance Sheets
2,GRINDWELL NORTON LIMITED Statement of Standalo...,Balance Sheets
3,ASSETS NonCurrent Assets Property Plant and Eq...,Balance Sheets
4,SIGNED FOR IDENTIFICATIO CD CD SRBCCO LLP SRBC...,Balance Sheets
...,...,...
2520,Exceptional Items continuing operations Amount...,Others
2521,Particulars Quarter ended Particulars Revenue ...,Others
2522,yL Firms Registration Number Rajiv Singhi Part...,Others
2523,Symbol Typeof security BSE Equity NSE GRINDWELL,Others


# **EDA (Exploratory Data Analysis)**

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2525 entries, 0 to 2524
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Extracted_text  2525 non-null   object
 1   document_name   2525 non-null   object
dtypes: object(2)
memory usage: 39.6+ KB


In [11]:
df.describe().T

Unnamed: 0,count,unique,top,freq
Extracted_text,2525,2413,,24
document_name,2525,5,Others,1224


In [12]:
df.isnull().sum()

Extracted_text    0
document_name     0
dtype: int64

In [47]:
df.duplicated().sum()

110

In [14]:
df.document_name.value_counts()

document_name
Others              1224
Notes                690
Income Statement     305
Balance Sheets       270
CashFlow              36
Name: count, dtype: int64

# **Data_Preprocessing**

In [48]:
df.drop_duplicates(inplace = True)

In [53]:
df.columns

Index(['Extracted_text', 'document_name'], dtype='object')

In [49]:
df.document_name.value_counts()

document_name
Others              1138
Notes                668
Income Statement     305
Balance Sheets       268
CashFlow              36
Name: count, dtype: int64

In [51]:
# Apply the cleaning function to the Extracted_text column
df['Extracted_text'] = df['Extracted_text'].apply(clean_text)

In [60]:
# stop words removal
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [63]:
ps = PorterStemmer()
nltk.download('punkt')
def preprocess_text(text):
    words = word_tokenize(text)
    words = [ps.stem(word) for word in words if word.lower() not in stop_words]
    return ' '.join(words)

df['Extracted_text'] = df['Extracted_text'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [65]:
tfidf = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X = tfidf.fit_transform(df['Extracted_text']).toarray()
Y = label_encoder.fit_transform(df['document_name'])

In [66]:
X, y

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 0       Balance Sheets
 1       Balance Sheets
 2       Balance Sheets
 3       Balance Sheets
 4       Balance Sheets
              ...      
 2519            Others
 2520            Others
 2522            Others
 2523            Others
 2524            Others
 Name: document_name, Length: 2415, dtype: object)

# end