In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import re

In [4]:
file = "AccountingInTheFinanceWorld.html"
document = ""

with open(file, 'r', encoding = 'utf-8') as file:
    document = file.read()

In [5]:
parser = BeautifulSoup(document, 'html.parser')

In [6]:
text = parser.get_text()

In [7]:
combinedQA = text.split("Question: ")
combinedQA.pop(0) #remove table of contents and other content before questions

separateQA = []
failed_splits = []

for QA in combinedQA:
    data = QA.split("Answer: ") #split on the answer

    # only process split if it correctly split into 2 parts
    if len(data) == 2:
        separateQA.append(data[0])
        separateQA.append(data[1]) #if len() is not > 1 it doesn't have a question-answer pair
    else:
        failed_splits.append(QA)  # Store failed split

questions = separateQA[0::2]
answers = separateQA[1::2]

df = pd.DataFrame(zip(questions, answers), columns=["Question","Answer"])
# Create DataFrame for the failed splits
failed_df = pd.DataFrame(failed_splits, columns=["Failed QA"])

In [8]:
df.tail()

Unnamed: 0,Question,Answer
262,After all noncash and nonoperating items are r...,Although the procedures appear to be different...
263,When reporting cash flows from operating activ...,Authoritative pronouncements that create U.S. ...
264,"For the year ended September 27, 2008, The Wal...","In most cases, an accountant takes the ledger ..."
265,"For the year ended December 28, 2008, Johnson ...","As has been indicated, financing activities re..."
266,The three sections of the statement of cash fl...,"In both the direct and indirect methods, cash ..."


In [9]:
failed_df.tail()

Unnamed: 0,Failed QA
31,"""Authorized,"" ""issued,"" and ""par value"" are te..."
32,Investors in the United States seem to have an...
33,Liberto has one revenue and three expenses lef...
34,Any company that follows U.S. GAAP and issues ...
35,Many investors watch the movement of a company...


In [10]:
df["QuestionWordCount"] = df.Question.apply(lambda q: len(q.split()))
df["AnswerWordCount"] = df.Answer.apply(lambda a: len(a.split()))
df.describe()

Unnamed: 0,QuestionWordCount,AnswerWordCount
count,267.0,267.0
mean,94.58427,482.213483
std,65.866545,1124.398932
min,4.0,56.0
25%,50.0,215.5
50%,78.0,356.0
75%,117.5,544.0
max,458.0,18221.0


In [None]:
df.to_excel("question-answering-data.xlsx")

In [None]:
failed_df.to_excel("Failed_Questions_Answers.xlsx")

In [11]:
def clean_text(text):
    # phrases to remove
    remove = [
        r'In Figure', r'Chapter', r'Previously', r'Transaction',
        r'This textbook', r'Thus far in this textbook', r'Throughout this textbook',
        r'As stated in Chapter', r'In previous', r'The previous section'
    ]

    # Creating a regex pattern by joining all phrases with '|', meaning "or"
    pattern = r'(' + '|'.join(remove) + r').*?[.!?]'

    # Remove sentences that start with any of the specified phrases
    text = re.sub(pattern, '', text)

    # Remove references like "[number]", "(ref)", or variants like "Ref. [1]"
    text = re.sub(r'\[\d+\]|\(ref\)|Ref\. \[\d+\]', '', text)

    # Return the cleaned text
    return text
df["New_Question"] = df["Question"].apply(clean_text)
df["New_Answer"] = df["Answer"].apply(clean_text)

In [12]:
df.head()

Unnamed: 0,Question,Answer,QuestionWordCount,AnswerWordCount,New_Question,New_Answer
0,This textbook professes to be an introduction ...,"In simplest terms, financial accounting is the...",28,176,A logical place to begin such an exploration ...,"In simplest terms, financial accounting is the..."
1,"Every semester, most college students are enro...",Many possible benefits can be gained from acqu...,60,281,"Every semester, most college students are enro...",Many possible benefits can be gained from acqu...
2,Knowledge of financial accounting assists indi...,The number of possible judgments that an indiv...,62,586,Knowledge of financial accounting assists indi...,The number of possible judgments that an indiv...
3,A great number of possible decisions could be ...,Organizational decisions such as these are ext...,75,352,A great number of possible decisions could be ...,Organizational decisions such as these are ext...
4,Financial accounting refers to the conveyance ...,"As indicated, financial accounting is designed...",69,474,Financial accounting refers to the conveyance ...,"As indicated, financial accounting is designed..."


In [13]:
col = ['New_Question', 'New_Answer']
new_df= df[col]

In [14]:
new_df.head()

Unnamed: 0,New_Question,New_Answer
0,A logical place to begin such an exploration ...,"In simplest terms, financial accounting is the..."
1,"Every semester, most college students are enro...",Many possible benefits can be gained from acqu...
2,Knowledge of financial accounting assists indi...,The number of possible judgments that an indiv...
3,A great number of possible decisions could be ...,Organizational decisions such as these are ext...
4,Financial accounting refers to the conveyance ...,"As indicated, financial accounting is designed..."


In [15]:
new_df.loc[:, "NewQuestionWordCount"] = new_df["New_Question"].apply(lambda q: len(q.split()))
new_df.loc[:, "NewAnswerWordCount"] = new_df["New_Answer"].apply(lambda a: len(a.split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.loc[:, "NewQuestionWordCount"] = new_df["New_Question"].apply(lambda q: len(q.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.loc[:, "NewAnswerWordCount"] = new_df["New_Answer"].apply(lambda a: len(a.split()))


In [None]:
new_df.to_excel("new-question-answering-data1.xlsx")

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np

In [17]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [18]:
# Function to extract sentences containing highest TF-IDF words
def extract_relevant_text(text, top_n=2):
    # Tokenize text into sentences
    sentences = sent_tokenize(text)

    if len(sentences) == 0:
        return ''

    # Apply TF-IDF vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(sentences)

    # Get the mean TF-IDF score for each sentence
    sentence_scores = np.asarray(tfidf_matrix.mean(axis=1)).flatten()  # Flatten matrix to get an array of scores

    # Get indices of top n sentences
    top_sentence_indices = np.argsort(sentence_scores)[-top_n:]  # Get indices of top sentences

    # Extract the relevant sentences
    relevant_sentences = [sentences[i] for i in top_sentence_indices]

    return ' '.join(relevant_sentences)

In [19]:
 # Assuming your DataFrame is called new_df and columns are 'New_Question' and 'New_Answer'
new_df['Relevant_Question'] = new_df['New_Question'].apply(lambda x: extract_relevant_text(x, top_n=1))
new_df['Relevant_Answer'] = new_df['New_Answer'].apply(lambda x: extract_relevant_text(x, top_n=2))

# Show the DataFrame with the relevant parts
new_df[['New_Question', 'Relevant_Question', 'New_Answer', 'Relevant_Answer']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Relevant_Question'] = new_df['New_Question'].apply(lambda x: extract_relevant_text(x, top_n=1))


Unnamed: 0,New_Question,Relevant_Question,New_Answer,Relevant_Answer
0,A logical place to begin such an exploration ...,A logical place to begin such an exploration ...,"In simplest terms, financial accounting is the...",Whether it is gathering financial information ...
1,"Every semester, most college students are enro...",Why should a student invest valuable time to l...,Many possible benefits can be gained from acqu...,"1 Thus, the ultimate purpose of this book is t..."
2,Knowledge of financial accounting assists indi...,"For example, assume that a former student—one ...",The number of possible judgments that an indiv...,"Many economic choices, such as those described..."
3,A great number of possible decisions could be ...,Should a business buy a building to serve as i...,Organizational decisions such as these are ext...,Accounting is then further subdivided into (a)...
4,Financial accounting refers to the conveyance ...,Is there any reason for a person who is employ...,"As indicated, financial accounting is designed...",Individuals who attain a proper level of knowl...


In [None]:
new_df.to_excel('relevant_questions_answers.xlsx', index=False)