In [3]:
import os
import requests
from dotenv import load_dotenv
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
from bs4 import BeautifulSoup

In [5]:
load_dotenv()
NEWS_API_KEY = os.getenv('NEWS_API')

NEWS_API_URL = 'https://newsapi.org/v2/everything'

In [None]:

def fetch_news():
    params = {
        'q': 'IBM',
        'from': '2024-11-20',
        'to': '2024-11-27',
        'sortBy': 'relevancy',
        'apiKey': NEWS_API_KEY,
    }
    response = requests.get(NEWS_API_URL, params=params)
    if response.status_code == 200:
        return response.json().get('articles', [])
    else:
        print(f"Error fetching news: {response.status_code}")
        return []

articles = fetch_news()

if articles:
    for article in articles:
        title = article.get('title', 'No Title')
        description = article.get('description', 'No Description')
        content = f"{title}. {description}"
else:
    print("No articles found.")

In [69]:
articles

[{'source': {'id': None, 'name': 'Ibm.com'},
  'author': None,
  'title': 'IBM Quantum delivers on 2022 100x100 performance challenge',
  'description': 'At IBM Quantum Developer Conference 2024, IBM is enabling algorithm discovery with high-performance quantum computers and easy-to-use quantum software.',
  'url': 'https://www.ibm.com/quantum/blog/qdc-2024',
  'urlToImage': 'https://research-website-prod-cms-uploads.s3.us.cloud-object-storage.appdomain.cloud/QDC_Character_Loop_16x9_1920_f0ee4d9893.gif',
  'publishedAt': '2024-11-22T09:59:23Z',
  'content': 'Two years ago, IBM® set an ambitious challenge for the quantum computing community: Develop quantum algorithms incorporating circuits with 100 qubits and gate depths of 100, while IBM would build a q… [+11633 chars]'},
 {'source': {'id': None, 'name': 'Theregister.com'},
  'author': 'Thomas Claburn',
  'title': "Kyndryl insiders say there's little new business",
  'description': 'IT giant aims to end revenue slide by March, former 

In [7]:
len(articles)

100

In [9]:
data = []
for article in articles:
    data.append({
        "Date": article["publishedAt"].split("T")[0],
        "Title": article["title"],
        "Description": article["description"],
        "Url": article["url"],
    })

In [10]:
articles_df = pd.DataFrame(data)

In [11]:
import requests
from bs4 import BeautifulSoup
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

descriptions = []

for u in articles_df['Url']:
    try:
        r = requests.get(u, headers=headers, timeout=10)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        full_description = " ".join(p.get_text(strip=True) for p in soup.find_all('p'))
        descriptions.append(full_description)
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error for {u}: {e}")
        descriptions.append("")
    except requests.exceptions.RequestException as e:
        print(f"Failed to scrape {u}: {e}")
        descriptions.append("")
    time.sleep(2)  # Avoid overloading the server

articles_df['Full_Description'] = descriptions



HTTP error for https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_5aee54cb-7afe-4561-b2bb-8e5e7a0778d7: 401 Client Error: Unauthorized for url: https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_5aee54cb-7afe-4561-b2bb-8e5e7a0778d7


In [12]:
len(descriptions)

100

In [108]:
articles_df.head()

Unnamed: 0,Date,Title,Description,Url,Full_Description
0,2024-11-20,Researchers Claim to Pinpoint Exact Date When ...,New research suggests that there was a very sp...,https://gizmodo.com/researchers-claim-to-pinpo...,"For a while now, researchers have been trying ..."
1,2024-11-20,La incertidumbre que ha desencadenado la inmin...,Las declaraciones que ha hecho Donald Trump ac...,https://www.xataka.com/empresas-y-economia/inc...,Juan Carlos López Las declaraciones que ha hec...
2,2024-11-20,Microsoft Flight Simulator 2024 is an unplayab...,Launch day bugs and relentless crashes join th...,https://www.windowscentral.com/gaming/microsof...,"When you purchase through links on our site, w..."
3,2024-11-20,Lo despiden de Amazon y acaba teletrabajando a...,"Al comienzo de la pandemia, Bryan Roque perdió...",https://www.genbeta.com/a-fondo/despiden-amazo...,"Bárbara Bécares Al comienzo de la pandemia,Bry..."
4,2024-11-20,EEUU prepara un nuevo 'proyecto Manhattan'. Es...,EEUU y China se han lanzado a una carrera para...,https://www.xataka.com/robotica-e-ia/eeuu-prep...,Juan Carlos López EEUU y China se han lanzado ...


In [106]:
articles_df = articles_df.sort_values("Date", ascending=True)
articles_df = articles_df.reset_index(drop=True)

In [107]:
articles_df.tail()

Unnamed: 0,Date,Title,Description,Url,Full_Description
95,2024-11-27,超混「幽靈工程師」正啃食科技巨頭！全球每年浪費 900 億美元,研究分析超過 5 萬名工程師程式碼，發現「幽靈工程師」現象。,https://www.inside.com.tw/article/36890-,史丹佛大學一項最新研究指出，全球軟體產業存在著大量的「幽靈工程師」，這些工程師幾乎沒有任何實...
96,2024-11-27,Weiter Weg zur Open-Source-KI: Red Hat prescht...,Protagonisten von Open Source wollen eine leis...,https://www.heise.de/news/Weiter-Weg-zur-Open-...,Protagonisten von Open Source wollen eine le...
97,2024-11-27,L’incredibile storia dei sorgenti di IBM CP/67...,"Negli anni '60, IBM rivoluzionò l'informatica ...",https://www.ilsoftware.it/lincredibile-storia-...,Hardware Applicativi Sicurezza Reti Cloud OS S...
98,2024-11-27,EYストラテジー・アンド・コンサルティングと日本IBM、日本企業のDXの加速を目指し協業を強化,[日本IBM]\n- AIエージェント製品の「IBM watsonx Orchestrate...,https://prtimes.jp/main/html/rd/p/000000579.00...,PR TIMESのご利用について 資料をダウンロード 日本IBM AIエージェント製品の「I...
99,2024-11-27,～万博が未来をみちびく～企画展「万博で夢見たサイエンス展」を開催します！前期：12月６日(金...,[大阪市博物館機構]\n【会期中の関連イベント】\n\n■全国巡回展示「イトカワの石、リュウ...,https://prtimes.jp/main/html/rd/p/000000065.00...,PR TIMESのご利用について 資料をダウンロード 大阪市博物館機構 【会期中の関連イベン...


In [53]:
text = articles_df['Full_Description'][0]

In [54]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [55]:
tokenizer = tiktoken.get_encoding('cl100k_base')

def tiktoken_len(text):
    """Measures the number of tokens in a text."""
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)

In [56]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,        
    chunk_overlap=0,       
    length_function=tiktoken_len,  
    separators=["\n\n", "\n", " ", ""]
)

In [57]:
def split_into_chunks(text, chunk_size=1000):
    """Splits text into chunks using RecursiveCharacterTextSplitter."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0,
        length_function=tiktoken_len,
        separators=["\n\n", "\n", " ", ""]
    )
    return splitter.split_text(text)

In [117]:
USER_PROMPT_TEMPLATE = """
I'm analyzing the IBM stock market by identifying trends from articles and news. For each article, I'll provide the title and document, and I need you to analyze the sentiment and provide a score between 0 (negative) and 1 (positive). 

Your output must always be a valid JSON string, strictly adhering to the format below:

{{
  "sentiment_score": float
}}

Here’s the format I will provide you:

{{
  "Title": "{title}",
  "Document": "{document}"
}}
"""



In [118]:
# Chain of Thought Prompt
system_prompt = """
To analyze trends in stock market articles, the following steps will be taken:

Article Input: The user will provide the title and content (document) of the article.

Sentiment Analysis: The model will analyze the sentiment of the article based on the tone of the text, identifying whether it conveys a positive, neutral, or negative sentiment towards the market or a specific stock.

Impact Prediction: The model will assess the potential future impact of the article on stock prices. This includes identifying market trends, factors influencing stock performance (such as company performance, market conditions, or industry news), and making a prediction about whether the stock price will increase, decrease, or remain stable.

Response Format: The model will output the sentiment and predicted impact in the specified JSON format.

Iterative Process: For every new article, the same process will be applied to extract the sentiment and predict the impact on stock prices.

Expected JSON format:
{{
  "sentiment_score": float
}}
"""

In [112]:
from openai import OpenAI

In [113]:
OPENAI_KEY = os.getenv("OPENAI_KEY")
client = OpenAI(
    api_key=OPENAI_KEY, 
)

In [None]:

def analyze_with_gpt(title, document):

    chunks = split_into_chunks(document, chunk_size=1000)
    sentiment_scores = []
    
    for i, chunk in enumerate(chunks):
        user_prompt = USER_PROMPT_TEMPLATE.format(title=title, document=chunk)
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",  
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                response_format={"type": "json_object"}
            )
            result = response.choices[0].message.content
            result_dict = json.loads(result)
            sentiment_score = float(result_dict.get("sentiment_score", 0))
            sentiment_scores.append(sentiment_score)
        except json.JSONDecodeError as json_error:
            print(f"JSON decode error on chunk {i + 1}: {json_error}")
            sentiment_scores.append(0)  
        except Exception as e:
            print(f"Error with OpenAI API on chunk {i + 1}: {e}")
            sentiment_scores.append(0)  
    
    return sum(sentiment_scores) / len(sentiment_scores) if sentiment_scores else 0

In [121]:
from tqdm import tqdm

def process_dataframe(df):

    sentiment_scores = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing articles"):
        title = row["Title"]
        full_description = row["Full_Description"]
        avg_score = analyze_with_gpt(title, full_description)
        sentiment_scores.append(avg_score)
    
    df["Sentiment_Score"] = sentiment_scores
    return df

articles_df_cp = process_dataframe(articles_df)

Processing articles: 100%|██████████| 100/100 [07:27<00:00,  4.47s/it]


In [122]:
articles_df_cp

Unnamed: 0,Date,Title,Description,Url,Full_Description,Sentiment_Score
0,2024-11-20,Researchers Claim to Pinpoint Exact Date When ...,New research suggests that there was a very sp...,https://gizmodo.com/researchers-claim-to-pinpo...,"For a while now, researchers have been trying ...",0.400000
1,2024-11-20,La incertidumbre que ha desencadenado la inmin...,Las declaraciones que ha hecho Donald Trump ac...,https://www.xataka.com/empresas-y-economia/inc...,Juan Carlos López Las declaraciones que ha hec...,0.300000
2,2024-11-20,Microsoft Flight Simulator 2024 is an unplayab...,Launch day bugs and relentless crashes join th...,https://www.windowscentral.com/gaming/microsof...,"When you purchase through links on our site, w...",0.050000
3,2024-11-20,Lo despiden de Amazon y acaba teletrabajando a...,"Al comienzo de la pandemia, Bryan Roque perdió...",https://www.genbeta.com/a-fondo/despiden-amazo...,"Bárbara Bécares Al comienzo de la pandemia,Bry...",0.500000
4,2024-11-20,EEUU prepara un nuevo 'proyecto Manhattan'. Es...,EEUU y China se han lanzado a una carrera para...,https://www.xataka.com/robotica-e-ia/eeuu-prep...,Juan Carlos López EEUU y China se han lanzado ...,0.400000
...,...,...,...,...,...,...
95,2024-11-27,超混「幽靈工程師」正啃食科技巨頭！全球每年浪費 900 億美元,研究分析超過 5 萬名工程師程式碼，發現「幽靈工程師」現象。,https://www.inside.com.tw/article/36890-,史丹佛大學一項最新研究指出，全球軟體產業存在著大量的「幽靈工程師」，這些工程師幾乎沒有任何實...,0.200000
96,2024-11-27,Weiter Weg zur Open-Source-KI: Red Hat prescht...,Protagonisten von Open Source wollen eine leis...,https://www.heise.de/news/Weiter-Weg-zur-Open-...,Protagonisten von Open Source wollen eine le...,0.475000
97,2024-11-27,L’incredibile storia dei sorgenti di IBM CP/67...,"Negli anni '60, IBM rivoluzionò l'informatica ...",https://www.ilsoftware.it/lincredibile-storia-...,Hardware Applicativi Sicurezza Reti Cloud OS S...,0.550000
98,2024-11-27,EYストラテジー・アンド・コンサルティングと日本IBM、日本企業のDXの加速を目指し協業を強化,[日本IBM]\n- AIエージェント製品の「IBM watsonx Orchestrate...,https://prtimes.jp/main/html/rd/p/000000579.00...,PR TIMESのご利用について 資料をダウンロード 日本IBM AIエージェント製品の「I...,0.833333


In [None]:
articles_df['Sentiment_Category'] = articles_df['Sentiment_Score'].apply(
    lambda x: 'positive' if x >= 0.5 else 'negative'
)

sentiment_counts = (
    articles_df.groupby(['Date', 'Sentiment_Category'])
    .size()  # Count occurrences
    .unstack(fill_value=0)  # Pivot categories into columns
    .reset_index()  # Reset index to make it a DataFrame
)

sentiment_counts.columns = ['Date', 'Negative_Articles', 'Positive_Articles']

sentiment_summary_df = pd.DataFrame(sentiment_counts)


In [124]:
sentiment_summary_df

Unnamed: 0,Date,Negative_Articles,Positive_Articles
0,2024-11-20,13,11
1,2024-11-21,8,12
2,2024-11-22,4,6
3,2024-11-23,0,4
4,2024-11-24,0,4
5,2024-11-25,3,5
6,2024-11-26,5,9
7,2024-11-27,9,7
