In [1]:
import json
import os
import sys
import warnings
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

# Resolve the project root. When the TDM_SENTIMENT_ROOT environment variable is
# set (and non-empty) it overrides the machine-specific default below, so the
# notebook can run from another checkout without editing this cell.
project_path = Path(
    os.environ.get('TDM_SENTIMENT_ROOT')
    or 'c:/Users/97253/OneDrive/Documents/work/bank of israel/financial division/yossi/tdm sentiment/tdm-sentiment/'
)

# Add the 'code' directory to sys.path. The membership check keeps re-running
# this cell from stacking duplicate entries onto sys.path.
code_dir = str(project_path / 'code')
if code_dir not in sys.path:
    sys.path.append(code_dir)

# Import custom modules (resolvable only after the sys.path update above)
from is_economic_model.tdm_parser import TdmXmlParser

def read_file_names_in_chunks(input_file, chunk_size):
    """Yield lists of file names read from ``<input_file>.txt``.

    The file is read line by line; each line is stripped of surrounding
    whitespace and blank lines are skipped. Chunk boundaries are based on the
    number of names collected (not on the raw line number), so blank lines can
    neither shrink a chunk nor produce an empty one.

    Args:
        input_file: Path (or string) of the listing file WITHOUT its ``.txt``
            suffix; the suffix is appended here.
        chunk_size: Maximum number of file names per yielded chunk. Must be a
            positive integer.

    Yields:
        list[str]: Up to ``chunk_size`` file names. Every chunk except possibly
        the last holds exactly ``chunk_size`` names; no chunk is empty.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size!r}')

    with open(f'{input_file}.txt', 'r') as f:
        chunk = []
        for line in f:
            file_name = line.strip()
            if not file_name:
                # Blank line: contributes nothing and must not count toward
                # the chunk boundary.
                continue
            chunk.append(file_name)
            if len(chunk) >= chunk_size:
                yield chunk
                chunk = []
        # Flush the trailing, partially filled chunk.
        if chunk:
            yield chunk

def xml_to_df(file_path):
    """Parse one XML file into a single-row DataFrame of selected properties.

    A fresh ``TdmXmlParser`` loads the file and is asked (with ``text=False``)
    for the properties listed below. Whatever ``get_xml_to_dict`` returns is
    wrapped in a one-element list to build the DataFrame.

    Args:
        file_path: Location of the XML file, passed straight to
            ``TdmXmlParser.get_xml_soup``.

    Returns:
        pandas.DataFrame: One row built from the parser's result
        (presumably a dict keyed by the property names - confirm against
        ``TdmXmlParser.get_xml_to_dict``).
    """
    # Source XML tag -> name requested for it; the two lists handed to the
    # parser are the keys and values of this mapping, paired positionally.
    tag_to_name = {
        'GOID': 'GOID',
        'SortTitle': 'Publisher',
        'NumericDate': 'Date',
        'StartPage': 'Page',
        'DocSection': 'Section',
        'mstar': 'Type',
        'GenSubjTerm': 'Tags',
        'is_economic': 'is_economic',
        'text_blob_sentiment': 'text_blob_sentiment',
        'bert_sentiment': 'bert_sentiment',
        'WordCount': 'WordCount',
    }

    xml_parser = TdmXmlParser()
    document_soup = xml_parser.get_xml_soup(file_path)
    record = xml_parser.get_xml_to_dict(
        document_soup,
        text=False,
        property_tags=list(tag_to_name.keys()),
        property_names=list(tag_to_name.values()),
    )
    return pd.DataFrame([record])

# Initialize variables
data_path = project_path / 'data'
chunk_size = 100

# Listing of XML file names; read_file_names_in_chunks appends the '.txt' suffix.
file_names_path = data_path / 'economic_dataset_file_names'

# One CSV per chunk is written here.
results_path = data_path / 'results'
results_path.mkdir(exist_ok=True)

for i, file_chunk in enumerate(read_file_names_in_chunks(file_names_path, chunk_size)):
    chunk_paths = [data_path / 'data_sample' / file_name for file_name in file_chunk]

    # Process files in parallel with a progress bar. tqdm wraps the iterator
    # that feeds joblib, so the bar advances while tasks are being handed out
    # instead of jumping from 0 to 100% only after the whole chunk is done.
    results = Parallel(n_jobs=-1, backend='threading')(
        delayed(xml_to_df)(path)
        for path in tqdm(chunk_paths, desc=f"Processing chunk {i+1}")
    )

    # Concatenate results and persist them as chunk_<n>_data.csv
    if results:
        data = pd.concat(results, ignore_index=True)
        # NOTE: `output_file` stays bound after the loop and is read by the
        # next cell, so it always refers to the last chunk written.
        output_file = results_path / f'chunk_{i+1}_data.csv'
        data.to_csv(output_file, index=False)
        del data  # release the chunk's frame before the next iteration


Processing chunk 1:   0%|          | 0/100 [00:00<?, ?it/s]

Processing chunk 2:   0%|          | 0/100 [00:00<?, ?it/s]

Processing chunk 3:   0%|          | 0/100 [00:00<?, ?it/s]

Processing chunk 4:   0%|          | 0/100 [00:00<?, ?it/s]

Processing chunk 5:   0%|          | 0/25 [00:00<?, ?it/s]

In [2]:
# Load the CSV written for the last chunk processed above. `output_file` is the
# variable left bound by the chunk-processing loop, so only that final chunk is
# read here; it is undefined if the loop never wrote a file.
df = pd.read_csv(output_file)
# Preview the first rows.
df.head()

Unnamed: 0,WordCount,GOID,Publisher,Date,Page,Section,Type,Tags,is_economic,text_blob_sentiment,bert_sentiment
0,744,1022232503,washington post the,2012-06-27,a.17,editorial-opinion,commentary,political campaigns,1,0.023113,-0.849715
1,623,1442899918,washington post the,2013-10-19,a.8,a-section,news,expansion,1,0.143159,0.112491
2,1617,1678440923,washington post the,2015-05-05,e.1,health,general information,salt,1,0.094614,-0.381124
3,775,1771998110,washington post the,2016-03-11,a.15,editorial - opinion,commentary,politics,1,-0.010533,-0.830004
4,234,1939404332,washington post the,2017-09-17,d.12,sports,news,emmy awards,1,0.207219,0.728199


In [3]:
import os
# Debugging aid: show the kernel's current working directory.
print(os.getcwd())

c:\Users\97253\OneDrive\Documents\work\bank of israel\financial division\yossi\tdm sentiment\tdm-sentiment\code\results_process


In [5]:
import pandas as pd

# Ensure the project's 'code' directory is importable. An earlier cell already
# appends it, so only add it when missing to avoid duplicate sys.path entries.
code_dir = str(project_path / 'code')
if code_dir not in sys.path:
    sys.path.append(code_dir)
from salience_index.salience_index import SalienceScorer

# Initialize the scorer (shared by compute_salience below)
scorer = SalienceScorer()


# Define a function to compute salience score
def compute_salience(row):
    """Return the salience score for one row, or None when scoring fails.

    Passes the row's 'Date', 'Page', 'WordCount' and 'Publisher' values to the
    module-level ``scorer.get_salience_score``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) supporting
            ``row['Date']``, ``row['Page']``, ``row['WordCount']`` and
            ``row['Publisher']``.

    Returns:
        Whatever ``scorer.get_salience_score`` returns, or ``None`` if any
        exception was raised while reading the row or computing the score.
        Failures stay best-effort, but a ``RuntimeWarning`` describing the
        error is emitted so that problems such as a missing column do not
        silently turn into a column of missing values.
    """
    try:
        return scorer.get_salience_score(row['Date'], row['Page'], row['WordCount'], row['Publisher'])
    except Exception as exc:
        warnings.warn(
            f'Salience score could not be computed ({type(exc).__name__}: {exc}); returning None',
            RuntimeWarning,
        )
        return None

# Score every row (axis=1 hands compute_salience one row at a time) and store
# the result in a new 'SalienceScore' column.
salience_scores = df.apply(compute_salience, axis=1)
df['SalienceScore'] = salience_scores

# Persisting the result is currently disabled; to enable it, run:
#   df.to_csv('data_with_salience_score.csv', index=False)
#   print("Salience scores added successfully.")

# Display the updated DataFrame
df

Unnamed: 0,WordCount,GOID,Publisher,Date,Page,Section,Type,Tags,is_economic,text_blob_sentiment,bert_sentiment,SalienceScore
0,744,1022232503,washington post the,2012-06-27,a.17,editorial-opinion,commentary,political campaigns,1,0.023113,-0.849715,0.222
1,623,1442899918,washington post the,2013-10-19,a.8,a-section,news,expansion,1,0.143159,0.112491,0.214
2,1617,1678440923,washington post the,2015-05-05,e.1,health,general information,salt,1,0.094614,-0.381124,0.47
3,775,1771998110,washington post the,2016-03-11,a.15,editorial - opinion,commentary,politics,1,-0.010533,-0.830004,0.23
4,234,1939404332,washington post the,2017-09-17,d.12,sports,news,emmy awards,1,0.207219,0.728199,0.068
5,200,1993706424,washington post the,2018-02-04,e.3,arts,news,critics,1,0.147917,-0.010957,0.07
6,1087,2149891732,washington post the,2018-12-02,a.2,a-section,news,public prosecutors,1,0.074623,-0.318708,0.491
7,752,409725169,washington post the,2004-12-10,t.21,weekend,news,,1,0.071644,0.55939,0.16
8,452,409731132,washington post the,2005-01-30,e.06,sports,news,,1,-0.000986,0.487578,0.126
9,400,409882336,washington post the,2005-11-10,t.26,montgomery extra,news,,1,0.160951,0.603183,0.09
