In [1]:
from tdm_parser.tdm_parser import TdmXmlParser
from sentiment_model.sentiment_score import TextAnalysis
from pathlib import Path
import pandas as pd
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import logging



# Configure logging at the start of your script or notebook
logging.basicConfig(
    level=logging.INFO,          # Set logging level to INFO or DEBUG as needed
    format='%(asctime)s %(name)s %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    force=True
    
)
logger = logging.getLogger(__name__)


#dataset_name_list = ['Newyork20042023']#, 'LosAngelesTimesDavid', 'TheWashingtonPostDavid','ChicagoTribune', 'USATodayDavid']
project_path = Path('/home/ec2-user/SageMaker/david/tdm-sentiment/')
data_path = project_path / 'data/'
corpus_path = Path('/home/ec2-user/SageMaker/data/')#Newyork20042023_realistic_economy_articles
corpus_name = 'TheWashingtonPostDavid'  # 'Newyork20042023_realistic_economy_articles'  'ChicagoTribune_realistic_economy_articles'
file_names_path = project_path / 'data/file_names' / corpus_name 

model_name = 'sentiment_model/distilbert-base-uncased-finetuned-sst-2-english'
model_path = project_path / 'code' / model_name

In [3]:
def modify_sentiment(file_path, parser, analyzer):
    """
    Modify an XML file by adding sentiment tags.

    Args:
        file_path (str): The path to the XML file to modify.
        parser (object): An object that can parse and manipulate XML (should have `get_xml_soup` and `modify_tag` methods).
        analyzer (object): An object that can analyze sentiment (should have `analyze_article_sentiment` method).

    Returns:
        None or any optional value: Returns None if sentiment tag is already set or in case of error.
    """
    try:
        # Parse the XML
        soup = parser.get_xml_soup(file_path)
        texts = parser.get_art_text(soup)

        # Check if 'bert_sentiment' tag already exists and skip if appropriate
        bert_sentiment = soup.find('bert_sentiment')
        if bert_sentiment:
            current_value = float(bert_sentiment.text)
            if current_value != 0:
                #logger.info(f"File '{file_path}' already has a non-zero bert_sentiment ({current_value}). Skipping.")
                soup = None
                return None
            else:
                logger.info(f"File '{file_path}' has a bert_sentiment tag of zero. Will update.")

        # Use the pre-loaded analyzer to get sentiment
        value = analyzer.analyze_article_sentiment(texts, method='bert')
        logger.info(f"Calculated sentiment for '{file_path}' is {value}.")

        # Modify the XML with the new sentiment tag
        soup = parser.modify_tag(soup, value=value, tag_name='bert_sentiment')

        # Write the modified XML back to the file
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(str(soup))
        logger.info(f"Successfully wrote updated sentiment to '{file_path}'.")

        # Clean up
        soup = None
        return None

    except Exception as e:
        # Logs the exception with traceback
        logger.exception(f"An error occurred while processing '{file_path}': {e}")
        return None


def read_file_names_in_chunks(input_file, chunk_size):
    with open(f'{input_file}.txt', 'r') as f:
        chunk = []
        for i, line in enumerate(f, 1):
            file_name = line.strip()
            if file_name:
                chunk.append(file_name)
            if i % chunk_size == 0:
                yield chunk
                chunk = []
        if chunk:
            yield chunk

### modify sentiment

In [None]:
# Initialize parser and analyzer once
parser = TdmXmlParser()
analyzer = TextAnalysis(model_path)  # Preload the model only once here


chunk_size = 2000
counter = 0
for i, file_chunk in enumerate(read_file_names_in_chunks(file_names_path, chunk_size)):
    counter += 1
    if counter < 126:
        continue
    print("Processing chunk:", counter)
    file_paths = [corpus_path / path for path in file_chunk]
    inner_counter = 0
    for i in range(0, len(file_paths), 20):
        inner_counter += 1
        #if inner_counter < 98: #TODO
            #continue #TODO
        chunk_path = file_paths[i: i+20]
    
        with tqdm(total=len(chunk_path)) as pbar:
            Parallel(n_jobs=-1, backend='loky')(
                delayed(modify_sentiment)(path, parser, analyzer) for path in chunk_path
            )
            pbar.update(len(chunk_path))
#['loky', 'multiprocessing', 'sequential', 'threading']

  state_dict = torch.load(resolved_archive_file, map_location="cpu")
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


BERT model loaded successfully from '/home/ec2-user/SageMaker/david/tdm-sentiment/code/sentiment_model/distilbert-base-uncased-finetuned-sst-2-english' on device -1.
Processing chunk: 126


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]



  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (736 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]