### import libraries

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import random
from pathlib import Path

### xml to df helper functions 

In [None]:
def get_xml_soup(file_path):
    """Get the root element of an XML file into a soup object.

    Args:   
        file_path (str): The path to the XML file.

    Returns:
        Element: the soup element of the XML file.
    """
    with open(file_path, 'r') as file:
        xml_data = file.read()
    
    soup = BeautifulSoup(xml_data, 'xml')
    record = soup.find('RECORD')
    return record


def get_article_text(soup):
    """get a soup object and return the a list of paragraphs.

    Args:
        record (a soup object with an HTML parser): The soup object to be parsed.

    Returns:
        _str_: a string of the text.
    """
    text = soup.find('Text').get_text() # Get the text content of the XML file
    if text is None:
        return None
    html_soup = BeautifulSoup(text, 'html.parser') # Parse the text content as HTML
    
    paragraphs = html_soup.find_all('p') # Find all paragraph tags in the HTML content
    paragraphs_str = '' # Initialize an empty string to store the paragraphs
    for p in paragraphs:
        if '@' in p.text:
            if len(p.text.split(' ')) > 2:
                if not 'Credit:' in p.text:
                    paragraphs_str += p.text.strip().lower()
            
    return paragraphs_str


def get_xml_to_dict(file_path, property_list, property_names):
    """Parse an XML file and return a dictionary of its contents.

    Args:
        file_path (str): The path to the XML file.
    """
    record = get_xml_soup(file_path)  # Get the root element of the XML file
    contect_dict = {} # Initialize an empty dictionary to store the details
    
    # Iterate over the properties to extract
    for i in range(len(property_names)):
        prop = record.find(property_list[i])
        if prop is not None:
            contect_dict[property_names[i]] = prop.text.strip().lower()
        else:
            contect_dict[property_names[i]] = None
            #print(f"Property {property_list[i]} not found")

    text = get_article_text(record)
    if text:
        contect_dict['Text'] = text
            
    return contect_dict


def get_properties():
    """Get the properties to extract from the XML files.

    Returns:
        _tuple_: _return a tuple of two list of propertiy names and tags_
    """
    property_tags = ['GOID', 'SortTitle','Title', 
                    'NumericDate', 'Language', 
                    'StartPage', 'DocSection', 
                    'mstar', 'DocEdition', 'GenSubjTerm', 
                    'CompanyName', 'Personal',   
                    'LastName', 'FirstName', 
                    'LexileScore',
                    ]
    
    property_names = ['GOID', 'Publisher', 'Title', 
                  'Date', 'Language', 
                  'Page', 'Section', 
                  'Type', 'Edition', 'Tags',
                  'Company Name', 'Personal',
                  'Author Last Name', 'Author First Name', 
                  'Lexile Score', 
                  ]

    return property_tags, property_names

### method helper functions 

In [5]:

def is_economic_article(dict):
    """_classify if article is economic using topic analysis_

    Args:
        dict (_dict_): _article content_
    Returns:
        _bool_: _return True if article is economic_
    """
    return True # TODO: Implement this function


def get_article_sentiment(paragraph_list):
    """_get the sentiment of a list of paragraphs_

    Args:
        paragraph_list (_list_): _a list of paragraphs_
    Returns:
        _float__: _the sentiment of the paragraphs, a number between -1 and 1_
    """
    # genarate a rendom number between -1 and 1
    sentimet_score = random.uniform(-1, 1)
    return sentimet_score # TODO: Implement this function


def get_article_weight(dict):
    """_get the weight of an article_

    Args:
        dict (_dict_): _article content_
    Returns:
        _float_: _the weight of the article_
    """
    return 1 # TODO: Implement this function
    

### execution 

In [None]:
# choose the dataset to work with

#TheNewYorkTimes_sample20
#USAToday_sample20
#LosAngelesTimes_sample20
#TheWashingtonPost_sample25

dataset_name_list = ['TheNewYorkTimes_sample20', 'USAToday_sample20', 'LosAngelesTimes_sample20', 'TheWashingtonPost_sample25']
project_path = Path("c:/Users/pc/Documents/work/bank of israel/financial division/yossi/tdm-sentiment")
data_path = project_path / 'data'
corpus_name = 'LosAngelesTimes_sample20'

# get list of files in the data path
corpus_path = data_path / 'corpuses' / corpus_name
file_name_list = file_list = [f.name for f in corpus_path.glob('*.xml')]


# Initialize an empty list to hold the content dictionaries
content_list = []
non_economic_articles = []

# get the properties to extract
property_tags, property_names = get_properties()

# Iterate over the list of file names
for file_name in file_name_list:
    file_path = corpus_path / file_name
    content_dict = get_xml_to_dict(file_path, property_tags, property_names) # get content of the file
    
    # check if the article is an economic article
    economic = is_economic_article(content_dict)
    if not economic:
        non_economic_articles.append(file_name)
        continue 
    
    # add the content_dict to the content_list
    content_list.append(content_dict)



# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(content_list)

KeyError: 'Text'

In [None]:
def xmls_to_df(file_name_list, property_tags, property_names):
    """
    Args:
        file_name_list (list): _a list of file names_
        property_tags (list): _description_
        property_names (_type_): _description_
    return: a df 
    """

In [None]:
df1 = df.copy()
# get Date column in datetime format from format 'YYYY-MM-DD'
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
#df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d', errors='coerce')
# sort the dataframe by Date
df = df.sort_values(by='Date')
# reset the index
df = df.reset_index(drop=True)
# save the dataframe to a csv file
#df.to_csv(f'{dataset_name}.csv', index=False)
df


In [6]:
tags_list = []
for file_name in file_name_list:    
    file_path = os.path.join(data_path, file_name)
    soup = get_xml_soup(file_path)

    # Initialize an empty set to hold the tag names
    
    tags = set()

    # Iterate through all elements in the soup object
    for element in soup.find_all(True):
        tags.add(element.name)

    # Convert the set to a list (if desired) and add it to the tags_list
    tags_list.append(tags)
    
    #tags_list = tags_list.append(tags))
    
# Iterate over the list and union each set with the merged_set
merged_set = set()
for s in tags_list:
    merged_set = merged_set.union(s)

# Print the merged set
#print(merged_set)
#print(tags_list)
for tags in merged_set:
    print(tags)
    #print("\n")

Obj
Pagination
CompanyNameAtt
PrintLocation
ZipCode
PubHistory
CoverImageType
LexileScore
PublisherName
EndDate
DateLine
PublisherAddress
StartIssue
ObjectIDs
Copyright
HasGaps
CreatedBy
Address1
Short
ContribPersonName
GenSubjTerm
ColumnHeader
AlphaDate
Contributors
RawLang
CompanyNAIC
MSTARLegacyID
URL
mstar
Abstract
Personal
NumericDate
CompanyName
publisher
CompanyDUNS
GroupFrosting
Medium
Name
Title
EndIssueDate
CoverageRange
DOCID
TextInfo
Term
FlexTerm
PMID
Text
Flags
NormalizedDisplayForm
CompanyDUNSAtt
StartDate
StartVolume
EmbargoDays
ObjectTypes
Qualifier
Languages
ISO
Subject
ObjectID
ContentModel
ISOExpansion
DocEdition
History
Locator
PubFrequencies
LastNameAtt
AlphaStartDate
City
PublisherXID
Geographic
NumericStartDate
MpubId
Subjects
DisplayForm
TitleAtt
Flag
other
SortTitle
Locators
SourceType
ISOCode
CopyrightData
SubTitle
MiddleNameAtt
GOID
MappingVersion
AbsText
OriginalForm
AlphaEndDate
JournalCode
Language
PageCount
SubTitleAtt
NumericEndDate
Author
FirstName
Fir