In [1]:
import pke
from pke.lang import stopwords
import string
import re

In [2]:
def remove_markdown_formatting(text):
    # Remove code blocks enclosed in triple backticks
    pattern_code_blocks = r"```[^\\S\\r\\n]*[a-z]*(?:\\n(?!```$).*)*\\n```"
    text_without_code_blocks = re.sub(pattern_code_blocks, '', text, 0, re.DOTALL)

    # Remove asterisks and hash symbols
    pattern_asterisks = r"\*+"
    pattern_hashes = r"#"
    text_without_formatting = re.sub(pattern_asterisks, '', text_without_code_blocks)
    text_without_formatting = re.sub(pattern_hashes, '', text_without_formatting)

    return text_without_formatting

In [3]:
import os
import markdown

data = dict()

for root, dirs, files in os.walk('../data'):
    for file in files:
        with open(os.path.join(root, file), 'r') as f:
            raw_text = remove_markdown_formatting(f.read())
            data[file] = raw_text

In [4]:
import pandas as pd
df = pd.DataFrame(data.items(), columns=['File', 'Text'])
df

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,File,Text
0,Business Proposal.md,\n Company Name: Instagram\n Category of Docum...
1,Marketing Plan.md,\n Company Name: Instagram\n Category of Docum...
2,Progress Report.md,Instagram Progress Report\n\n| Project Name ...
3,About Instagram.md,About Instagram\n\nIntroduction\n\nInstagram ...
4,Board of Directors.md,Board of Directors\n\nThe Instagram Board of ...
...,...,...
75,Employee Stock Ownership Plan (ESOP).md,Employee Stock Ownership Plan (ESOP)\n\nPlan S...
76,Employment Contract.md,Employment Contract\n\nEmployee: John Doe\nEmp...
77,employee_payslip_template.md,Employee Payslip\n\n Employee Information:\n-...
78,employee_stock_ownership_plan_template.md,Employee Stock Ownership Plan (ESOP)\n\n Empl...


In [5]:
def keyword_yake(text):
    extractor = pke.unsupervised.YAKE()
    stoplist = stopwords.get('english')
    extractor.load_document(input=text,
                            language='en',
                            stoplist=stoplist,
                            normalization=None)
    extractor.candidate_selection(n=3)
    window = 2
    use_stems = False
    extractor.candidate_weighting(window=window,
                                use_stems=use_stems)
    threshold = 0.8
    keyphrases = extractor.get_n_best(n=10, threshold=threshold)
    keyphrases = [keyphrase for keyphrase, score in keyphrases]
    return keyphrases

In [6]:
import spacy
def extract_metadata(text):
    nlp1 = spacy.load(R"output/model-best")
    doc = nlp1(text)
    tags = []
    label_dict = {}
    for ent in doc.ents:
        tags.append(ent.text)
        if ent.label_ not in label_dict:
            label_dict[ent.label_] = [ent.text]
        else:
            label_dict[ent.label_].append(ent.text)
    return label_dict

In [32]:
def date_extraction(text):
    # Updated regex pattern
    pattern = r'(\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}\b|\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b|\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{4}\b)'
    dates = re.findall(pattern, text)
    return dates

In [8]:
def get_uniqueset(df):
    for i in range(len(df)):
        for col in df.columns:
            try:
                df[col][i] = list(set(df[col][i]))
            except:
                pass
    return df   

In [9]:
def metadata_extract(df):
    metadata = pd.DataFrame()
    # initialize keyphrase extraction model, here YAKE
    keywords = df['Text'].apply(lambda x: keyword_yake(x))
    # keyword = [x[0] for x in keywords]
    # NER Metadata Generation
    labels = df['Text'].apply(lambda x: extract_metadata(x))
    metadata['Keywords'] = keywords
    metadata['Person'] = labels.apply(lambda x: x.get('Person'))
    metadata['Organization'] = labels.apply(lambda x: x.get('Organization'))
    metadata['Money'] = labels.apply(lambda x: x.get('Money'))
    metadata['Contact_Number'] = labels.apply(lambda x: x.get('Contact_Number'))
    metadata['Email Id'] = labels.apply(lambda x: x.get('Email Id'))
    metadata['Date'] = df['Text'].apply(lambda x: date_extraction(x))
    metadata = get_uniqueset(metadata)
    return metadata

In [33]:
metadata = metadata_extract(df)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df[col][i] = list(set(df[col][i]))
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Seri

In [34]:
metadata.to_csv('metadata.csv', index=False)