In [1]:
import json
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import pandas as pd
import refinitiv.data as rd
from refinitiv.data.content import symbol_conversion, search
from datetime import timedelta
from IPython.display import HTML

## Step 0: Open Refinitiv API session

In [2]:
rd.open_session()

<refinitiv.data.session.Definition object at 0x15c649e10 {name='workspace'}>

## Step 1: Retrieve data - Get headlines

In [92]:
headline_df = rd.news.get_headlines("2330.TW", 
                                    start="15.03.2025", 
                                    end=timedelta(days=-1500),
                                    count=3000)


print(headline_df.to_string())

                                                                                                                                                                                                                                                                                                                                             headline                                        storyId sourceCode
versionCreated                                                                                                                                                                                                                                                                                                                                                                                                 
2025-03-14 23:54:10.047                                                                                                                                                                                 

## Step 2: Retrieve data - Automize storyID extraction  

In [93]:
# Create a list of tuples (headline, storyId)
id_lst = list(zip(headline_df["headline"], headline_df["storyId"]))

# Filter tuples where storyId starts with "urn:newsml:newsroom:"
filtered_id_lst = [(headline, storyId) for headline, storyId in id_lst if storyId.startswith("urn:newsml:newsroom:")]

# Print the filtered list
print(filtered_id_lst)

[('Patent Issued for Package structure (USPTO 12237276)', 'urn:newsml:newsroom:20250314:nNRAvq5at7:0'), ('Taiwan Semiconductor Manufacturing Company Limited (NYSE:TSM) Shares Acquired by Wellington Shields Capital Management LLC', 'urn:newsml:newsroom:20250314:nNRAvq4tag:0'), ("Taiwan Semiconductor Manufacturing (NYSE:TSM) Stock Price Up 2.6% – What's Next?", 'urn:newsml:newsroom:20250314:nNRAvq4snx:0'), ("Taiwan Semiconductor Manufacturing (NYSE:TSM) Stock Price Up 0.8% – What's Next?", 'urn:newsml:newsroom:20250314:nNRAvq4u39:0'), ('Patent Issued for Cavity resonator for enhancing radio-frequency performance and methods for forming the same (USPTO 12237280)', 'urn:newsml:newsroom:20250314:nNRAvq4hid:0'), ('Patent Issued for Method of forming semiconductor device (USPTO 12237400)', 'urn:newsml:newsroom:20250314:nNRAvq48p0:0'), ("Friday's Preview: Can Taiwan Semiconductor Manufacturing rebound after being down?", 'urn:newsml:newsroom:20250314:nNRAvq3yi3:0'), ('First Eagle Investment Ma

## Step 3: Retrieve data - Automize news content raw HTML extraction 

In [94]:
news_dic = {}

for headline, storyId in filtered_id_lst:  # Ensure we use the correct loop variable
    try:
        # Fetch the news story
        text = rd.news.get_story(storyId, format=rd.news.Format.TEXT)

        if text:
            #print(text)  # Debugging output
            news_dic[headline] = text  # Store the story text in the dictionary
        else:
            print(f"Failed to fetch story for ID: {story_id}")

    except TypeError as e:
        print(f"TypeError for story ID {story_id}: {e}")
    except Exception as e:
        print(f"Unexpected error for story ID {story_id}: {e}")

# Optional: Print the dictionary to verify
print(news_dic)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [95]:
# Convert dictionary to DataFrame
news_df = pd.DataFrame(list(news_dic.items()), columns=['Headline', 'Story'])

# Display the DataFrame
news_df

Unnamed: 0,Headline,Story
0,Patent Issued for Package structure (USPTO 122...,Patent Issued for Package structure (USPTO 122...
1,Taiwan Semiconductor Manufacturing Company Lim...,Taiwan Semiconductor Manufacturing Company Lim...
2,Taiwan Semiconductor Manufacturing (NYSE:TSM) ...,Taiwan Semiconductor Manufacturing (NYSE:TSM) ...
3,Taiwan Semiconductor Manufacturing (NYSE:TSM) ...,Taiwan Semiconductor Manufacturing (NYSE:TSM) ...
4,Patent Issued for Cavity resonator for enhanci...,Patent Issued for Cavity resonator for enhanci...
...,...,...
556,"Trump: China's invasion of Taiwan would be ""ca...","Trump: China's invasion of Taiwan would be ""ca..."
557,TSMC To Invest $100 Billion To Expand Chip Man...,TSMC To Invest $100 Billion To Expand Chip Man...
558,Trump announces $100 billion investment in U.S...,Trump announces $100 billion investment in U.S...
559,TSMC (TSM) Pre-Market Gains on $100 Billion U....,TSMC (TSM) Pre-Market Gains on $100 Billion U....


In [96]:
news_df.to_csv("news_data_large.csv", index=False)

In [97]:
news_df.to_excel("news_data_large.xlsx", index=False)

In [98]:
news_df.to_json("news_data_large.json", orient="records", indent=4)

In [None]:
rd.close_session()