### Helper functions and configuration

In [1]:
from tdm_parser import TdmXmlParser
from train_model import EconomicClassifier
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
import random
import pandas as pd
from pathlib import Path


def concatenate_text(text_list):
    """
    Collapse a text field into one plain string.

    Args:
        text_list (list | str | object): A list of strings, an already-joined
            string, or any other value.

    Returns:
        str: The list items joined with single spaces, the input unchanged when
            it is already a string, or an empty string for any other type.
    """
    # Strings pass through untouched.
    if isinstance(text_list, str):
        return text_list
    # Lists are flattened into one space-separated string.
    if isinstance(text_list, list):
        return ' '.join(text_list)
    # Anything else (None, NaN, tuples, ...) is treated as "no text".
    return ""




def dataset_into_df(dataset_path, xml_file_list, parser, choose_set):
    """
    Reads XML files from a specified directory, extracts relevant data, and converts it into a DataFrame.

    Args:
        dataset_path (str): Path to the directory containing the dataset of articles in XML format.
        xml_file_list (iterable): File names (relative to ``dataset_path``) of the articles to read.
            The value is only iterated once, so any iterable of names works.
        parser (object): An object with methods to parse XML into soup (``get_xml_soup``) and
            extract data into dictionaries (``get_xml_to_dict``).
        choose_set (set): A set of sections to include in the DataFrame.

    Returns:
        pd.DataFrame: One row per article whose ``Section`` is in ``choose_set``. When a
            ``Text`` column is present its values are flattened with ``concatenate_text``.
            If no article qualifies, an empty DataFrame is returned instead of raising.
    """
    content_list = []  # Dictionaries of the articles that pass the section filter

    # Iterate over XML files in the provided list
    for file_name in xml_file_list:
        file_path = f'{dataset_path}/{file_name}'  # Construct the full file path
        soup = parser.get_xml_soup(file_path)  # Parse the XML file into soup
        content_dict = parser.get_xml_to_dict(soup)  # Extract content of the file into a dictionary

        # Keep the article only if it has a section we care about. `.get` is used so that
        # an article without a 'Section' entry is skipped rather than aborting the batch.
        if content_dict and content_dict.get('Section') in choose_set:
            content_list.append(content_dict)

    # Convert the list of dictionaries into a DataFrame
    df = pd.DataFrame(content_list)

    # An empty result has no columns at all, so only post-process 'Text' when it exists.
    if 'Text' in df.columns:
        # Concatenates a list of strings into a single string
        df['Text'] = df['Text'].apply(concatenate_text)
    return df


def read_file_names_in_chunks(input_file, chunk_size):
    """
    Open a text file and read each non-empty row as a file name, in chunks.

    Blank lines are ignored and do not count towards the chunk size, so every
    yielded chunk except possibly the last one holds exactly ``chunk_size``
    names and no empty chunk is ever yielded.

    Args:
        input_file (str): The path to the input text file WITHOUT the ``.txt``
            extension; ``.txt`` is appended here.
        chunk_size (int): The number of file names per chunk. Must be at least 1.

    Yields:
        list: A list of file names read from the text file, chunk by chunk.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be at least 1, got {chunk_size!r}')

    with open(f'{input_file}.txt', 'r') as f:
        chunk = []
        for line in f:
            file_name = line.strip()  # Remove any leading/trailing whitespace/newlines
            if not file_name:
                continue  # Blank lines carry no file name and must not affect chunking

            chunk.append(file_name)
            # Chunk boundaries depend on collected names, not on the raw line number.
            if len(chunk) >= chunk_size:
                yield chunk
                chunk = []  # Start a fresh list; the yielded one belongs to the caller

        # Yield any remaining names in the last, possibly shorter, chunk
        if chunk:
            yield chunk


def save_file_names_to_txt(file_names, output_file):
    """
    Append file names to a text file, one name per line.

    The file is opened in append mode, so existing content is kept and the file
    is created when it does not exist yet.

    Args:
        file_names (list): The file names (strings) to write.
        output_file (str): Path of the output file WITHOUT the ``.txt``
            extension; ``.txt`` is appended here.
    """
    target = f'{output_file}.txt'
    with open(target, 'a') as handle:
        # One newline-terminated entry per file name.
        handle.writelines(name + '\n' for name in file_names)
 

# Section labels that make up the positive set.
positive_set = {
    'money',
    'business',
    'finance/business',
    'business; part b; business desk',
    'financial',
}
# Section labels that make up the negative set ('sports' is currently left out).
negative_set = {
    'outlook',
    'arts & entertainment',
    'style',
    'movies',
    'arts',
}
# Only articles whose section is in this union are kept by dataset_into_df.
choose_set = positive_set.union(negative_set)

# Sub-directories of the sample data, one per publisher.
dataset_name_list = [
    'TheNewYorkTimes_sample20',
    'USAToday_sample20',
    'LosAngelesTimes_sample20',
    'TheWashingtonPost_sample25',
]
# TODO: machine-specific absolute path; replace with a portable location.
project_path = Path('c:/Users/97253/OneDrive/Documents/work/bank of israel/financial division/yossi/tdm sentiment/tdm-sentiment/')
# Input and output currently share the same data directory.
input_data_path = project_path.joinpath('data/')
output_data_path = project_path.joinpath('data/')

### Collect all data file names, prefixed with the name of their dataset

In [2]:
# Build the index of every sample file as "<dataset>/<file name>" and store it in
# all_dataset_file_names.txt (written only if that file does not exist yet).
dataset_file_names_list = []

for dataset in dataset_name_list:
    dataset_path = input_data_path / 'data_sample' / dataset
    # Regular files only; sub-directories are skipped.
    xml_file_list = [entry.name for entry in dataset_path.iterdir() if entry.is_file()]
    dataset_file_names_list.extend(f'{dataset}/{name}' for name in xml_file_list)

path = output_data_path / 'all_dataset_file_names.txt'
if not path.exists():
    # save_file_names_to_txt appends the '.txt' suffix itself, hence the bare stem here.
    save_file_names_to_txt(dataset_file_names_list, output_data_path / 'all_dataset_file_names')

# The list can be large; release it once it has been persisted.
del dataset_file_names_list

### Load the data into a DataFrame

In [3]:
parser = TdmXmlParser()
data_chunks = []
# Process the file-name index chunk by chunk (at most 10**5 names per chunk)
for file_chunk in read_file_names_in_chunks(output_data_path / 'all_dataset_file_names', chunk_size=10**5):
    # Sample a maximum of 10,000 files from each chunk or the total number in the chunk if fewer
    file_chunk = random.sample(file_chunk, min(10**4, len(file_chunk)))

    # Wrap the sampled names in tqdm so the bar advances once per parsed file.
    # dataset_into_df only iterates over the names, so the wrapper is transparent to it.
    # (Previously the bar was updated a single time after the whole chunk had been
    # processed, so it stayed at 0% for the entire run.)
    df = dataset_into_df(input_data_path / 'data_sample/', tqdm(file_chunk), parser, choose_set)  #TODO
    data_chunks.append(df)  # Append the resulting DataFrame to the list

# Concatenate all DataFrame chunks into one DataFrame after the loop
data = pd.concat(data_chunks, ignore_index=True)

# Display the first row of the collected DataFrame
print(data.head(1))

  0%|          | 0/85 [00:00<?, ?it/s]

        GOID            Publisher                                 Title  \
0  894525606  washington post the  it's not your typical in-law problem   

         Date Language Page Section                 Type             Edition  \
0  2011-09-29  english  c.3   style  general information  final - every zone   

   Tags Company Name Personal Author Last Name Author First Name Lexile Score  \
0  None         None     None              hax           carolyn         1260   

                                                Text  
0  adapted from a recent online discussion. hi ca...  


### build model

In [4]:
# Train a classifier on the DataFrame built in the previous cell and persist it.
# NOTE(review): `initialize=True` presumably creates a fresh, untrained model instead of
# loading a saved one - confirm in train_model.EconomicClassifier.
classifier = EconomicClassifier(initialize=True)
# train_classifier returns two values, kept here as X_test / y_test and reused by the
# evaluation cell (presumably the held-out split - TODO confirm).
X_test, y_test = classifier.train_classifier(data)
# No destination is passed, so where the model is stored is decided inside save_model().
classifier.save_model()

### evaluate model

In [5]:
# Evaluate the trained classifier on the X_test / y_test pair returned by train_classifier
# and print the result (the recorded output is a per-class precision/recall/F1 table).
evaluation = classifier.evaluate_model(X_test, y_test)
print(evaluation)

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### use model

In [6]:
# Create a classifier without `initialize=True`.
# NOTE(review): presumably this loads the previously saved model and vectorizer - confirm
# in train_model.EconomicClassifier.
classifier = EconomicClassifier()
article = "economy is expected to grow by 3% this year."
# Classify a single raw text string; the recorded run printed 0 for this sentence.
result = classifier.is_economic(article)
print(result)

0
