In [42]:
!pip install feedparser



# Theory: HKSE is easily swayed with China and US news

# Data Gathering

## News Data: SCMP RSS Feed

In [43]:
import feedparser
import pandas as pd

In [44]:
def fetch_articles_from_rss(urls):
    articles = []

    for url in urls:
        feed = feedparser.parse(url)
        for entry in feed.entries:
            articles.append({
                'title': entry.title,
                'published': entry.published,
                'link': entry.link,
                'summary': entry.summary
            })

    return pd.DataFrame(articles)

In [45]:
hk_url = ['https://www.scmp.com/rss/318206/feed','https://www.scmp.com/rss/318210/feed','https://www.scmp.com/rss/318208/feed','https://www.scmp.com/rss/318217/feed','https://www.scmp.com/rss/318207/feed']
china_url = ['https://www.scmp.com/rss/318198/feed','https://www.scmp.com/rss/318199/feed','https://www.scmp.com/rss/318200/feed','https://www.scmp.com/rss/318421/feed','https://www.scmp.com/rss/318202/feed']
business_url = ['https://www.scmp.com/rss/92/feed','https://www.scmp.com/rss/10/feed','https://www.scmp.com/rss/96/feed','https://www.scmp.com/rss/7/feed','https://www.scmp.com/rss/12/feed','https://www.scmp.com/rss/318421/feed','https://www.scmp.com/rss/318200/feed']
world_url =['https://www.scmp.com/rss/322262/feed','https://www.scmp.com/rss/3/feed','https://www.scmp.com/rss/5/feed','https://www.scmp.com/rss/322262/feed']

In [46]:
# Fetch news articles for each category
hk_df = fetch_articles_from_rss(hk_url)
china_df = fetch_articles_from_rss(china_url)
business_df = fetch_articles_from_rss(business_url)
world_df = fetch_articles_from_rss(world_url)

# Concatenate the DataFrames for all categories
df = pd.concat([hk_df, china_df, business_df, world_df], ignore_index=True)

In [47]:
# Drop duplicates based on the 'title' column
df_no_duplicates = df.drop_duplicates(subset=['title'])

In [48]:
data = df_no_duplicates.copy()

In [49]:
# Convert 'published' column to datetime format
data['published'] = pd.to_datetime(data['published'])

In [50]:
# Sort the DataFrame by the 'published' column
df_sorted = data.sort_values(by='published')

In [51]:
# Extract the first unique value after 'www.scmp.com/' from each link
df_sorted['first_unique_value'] = df_sorted['link'].str.extract(r'www\.scmp\.com/(\w+)')

In [52]:
df_sorted['first_unique_value'].unique()

array(['sport', 'lifestyle', 'business', 'comment', 'native', 'news',
       'presented', 'tech', 'economy', 'property', 'opinion', 'week',
       'magazines'], dtype=object)

In [53]:
# Define the values to retain
retain_values = ['business', 'comment', 'native', 'news', 'presented', 'tech', 'economy', 'property', 'opinion', 'week']

# Filter the DataFrame to retain only the specified values
df_filtered = df_sorted[df_sorted['first_unique_value'].isin(retain_values)]

In [54]:
# Get the maximum and minimum dates
max_date = df_filtered['published'].max()
min_date = df_filtered['published'].min()

# Display the maximum and minimum dates
print("Maximum Date:", max_date)
print("Minimum Date:", min_date)

Maximum Date: 2024-05-01 10:30:16+08:00
Minimum Date: 2023-09-07 10:26:22+08:00


In [55]:
df_filtered.to_csv('scmp_final.csv')

## Stock Data: Hong Kong Stock Exchange Yahoo Finance!

In [56]:
!pip install yfinance



In [57]:
import yfinance as yf
import pandas as pd

In [58]:
# Define the ticker symbol for Hang Seng Index (HSI)
ticker_symbol = '^HSI'

# Define the start and end dates for the desired date range
start_date = '2023-09-01'
end_date = '2024-04-30'

# Fetch historical data for HSI from Yahoo Finance
hsi_data = yf.download(ticker_symbol, start=start_date, end=end_date)

[*********************100%%**********************]  1 of 1 completed


In [59]:
hsi_data.to_csv('hsi.csv')

In [60]:
hsi_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-09-04,18592.529297,18899.490234,18575.449219,18844.160156,18844.160156,4040296000
2023-09-05,18717.419922,18725.5,18431.720703,18456.910156,18456.910156,2315663900
2023-09-06,18405.089844,18492.589844,18256.099609,18449.980469,18449.980469,2352887700
2023-09-07,18437.480469,18437.480469,18173.439453,18202.070312,18202.070312,2123306900
2023-09-11,17941.650391,18164.609375,17842.369141,18096.449219,18096.449219,2522422900


# Data Cleaning

In [62]:
file_path = "/kaggle/working/scmp_final.csv"

df_filtered = pd.read_csv(file_path)

In [63]:
file_path = "/kaggle/working/hsi.csv"

hsi_data = pd.read_csv(file_path)

In [64]:
hsi_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       160 non-null    object 
 1   Open       160 non-null    float64
 2   High       160 non-null    float64
 3   Low        160 non-null    float64
 4   Close      160 non-null    float64
 5   Adj Close  160 non-null    float64
 6   Volume     160 non-null    int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 8.9+ KB


In [65]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 688 entries, 0 to 687
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          688 non-null    int64 
 1   title               688 non-null    object
 2   published           688 non-null    object
 3   link                688 non-null    object
 4   summary             687 non-null    object
 5   first_unique_value  688 non-null    object
dtypes: int64(1), object(5)
memory usage: 32.4+ KB


In [66]:
news = df_filtered[['title','published','summary']].copy()

In [67]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 688 entries, 0 to 687
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      688 non-null    object
 1   published  688 non-null    object
 2   summary    687 non-null    object
dtypes: object(3)
memory usage: 16.2+ KB


In [68]:
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [69]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [70]:
import spacy
# Load English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm")

In [71]:
# Function to preprocess text using SpaCy
def preprocess_text_spacy(text):
    # Tokenize the text
    doc = nlp(text)
    # Lemmatize words and remove stopwords and punctuation
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [72]:
# Clean the 'title' column using SpaCy
news['clean_title'] = news['title'].apply(preprocess_text_spacy)


In [73]:
type(news['title'][0])

str

In [74]:
def preprocess_text_spacy(text):
    # Check if the input is a valid string
    if not isinstance(text, str):
        return None

    # Tokenize the text
    doc = nlp(text)

    # Lemmatize words and remove stopwords and punctuation
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]

    return " ".join(tokens)


In [75]:
def clean_news(news):
    # Drop or replace NaN and non-string entries in 'summary'
    news['summary'] = news['summary'].apply(lambda x: "" if pd.isna(x) else str(x))

    # Apply the preprocess_text_spacy function to clean text
    news['clean_summary'] = news['summary'].apply(preprocess_text_spacy)

    return news

In [76]:
news['summary'] = news['summary'].fillna("")  # Replace NaN with an empty string
news['clean_summary'] = news['summary'].apply(preprocess_text_spacy)

In [77]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 688 entries, 0 to 687
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          688 non-null    object
 1   published      688 non-null    object
 2   summary        688 non-null    object
 3   clean_title    688 non-null    object
 4   clean_summary  688 non-null    object
dtypes: object(5)
memory usage: 27.0+ KB


# LLM Sentiment Analysis

Reference: https://www.kaggle.com/code/lucamassaron/fine-tune-gemma-7b-it-for-sentiment-analysis

In [78]:
!pip install -q -U torch --index-url https://download.pytorch.org/whl/cu117
!pip install -q -U transformers=="4.38.2"
!pip install -q accelerate
!pip install -q -i https://pypi.org/simple/ bitsandbytes
!pip install -q -U datasets
!pip install -q -U git+https://github.com/huggingface/trl
!pip install -q -U git+https://github.com/huggingface/peft

In [79]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [80]:
import warnings
warnings.filterwarnings("ignore")

In [81]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import torch
import torch.nn as nn

import transformers
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from datasets import Dataset
from peft import LoraConfig, PeftConfig
import bitsandbytes as bnb
from trl import SFTTrainer

from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

2024-05-01 02:37:43.692416: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-01 02:37:43.692534: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-01 02:37:43.815912: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [82]:
import kagglehub

# Download latest version
path = kagglehub.model_download("google/gemma/transformers/7b-it")

print("Path to model files:", path)

Attaching model 'google/gemma/transformers/7b-it' to your Kaggle notebook...


Path to model files: /kaggle/input/gemma/transformers/7b-it/3


In [83]:

model_name = "/kaggle/input/gemma/transformers/7b-it/3"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

max_seq_length = 2048
tokenizer = AutoTokenizer.from_pretrained(model_name, max_seq_length=max_seq_length)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [84]:
import transformers
from transformers import XLNetConfig, XLNetModel, XLNetTokenizer,XLNetForSequenceClassification
from transformers import AlbertConfig, AlbertModel, AlbertTokenizer
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer
from transformers import AdamW, AdamWeightDecay, get_linear_schedule_with_warmup

In [85]:
'''from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Initialize the pipeline with the financial sentiment model
pipe = pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
'''

'from transformers import AutoTokenizer, AutoModelForSequenceClassification\n\nmodel_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"\n\ntokenizer = AutoTokenizer.from_pretrained(model_name)\nmodel = AutoModelForSequenceClassification.from_pretrained(model_name)\n\n# Initialize the pipeline with the financial sentiment model\npipe = pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")\n'

In [86]:
'''# Example usage
text = "Tech stocks rise as market stabilizes."
sentiment = pipe(text)

print(f"Text: {text}")
print(f"Sentiment: {sentiment}")'''

'# Example usage\ntext = "Tech stocks rise as market stabilizes."\nsentiment = pipe(text)\n\nprint(f"Text: {text}")\nprint(f"Sentiment: {sentiment}")'

In [87]:
def generate_test_prompt_title(data_point):
    return f"""
            You are an experienced financial advisor based in Hong Kong. With the recent 
            tension between Hong Kong and China, the Hong Kong Stock Exchange is susceptible 
            to volatility due to news headlines. Identify if the news title enclosed in square brackets will 
            have a positive or negative impact on the Hong Kong Stock Exchange Index. Give the corresponding labels, 
            "positive" or "neutral" or "negative"

            [{data_point["clean_title"]}] = 

            """.strip()

def generate_test_prompt_summary(data_point):
    return f"""
            You are an experienced financial advisor based in Hong Kong. With the recent 
            tension between Hong Kong and China, the Hong Kong Stock Exchange is susceptible 
            to volatility due to news headlines. Identify if the news title enclosed in square brackets will 
            have a positive or negative impact on the Hong Kong Stock Exchange Index. Give the corresponding labels, 
            "positive" or "neutral" or "negative"

            [{data_point["clean_summary"]}] = 

            """.strip()

In [88]:
news['title_prompt'] = news.apply(lambda row: generate_test_prompt_title({"clean_title": row['clean_title']}), axis=1)
news['summary_prompt'] = news.apply(lambda row: generate_test_prompt_summary({"clean_summary": row['clean_summary']}), axis=1)

In [89]:
news.head()

Unnamed: 0,title,published,summary,clean_title,clean_summary,title_prompt,summary_prompt
0,"SMIC, Tencent, JD.com lead Hong Kong stock los...",2023-09-07 10:26:22+08:00,The Hang Seng Index weakened for a third day a...,smic tencent jd.com lead hong kong stock loss ...,hang seng index weaken day month weak china tr...,You are an experienced financial advisor based...,You are an experienced financial advisor based...
1,Cheap and cheaper: stock buy-backs approaching...,2023-09-19 19:37:59+08:00,Hong Kong-listed companies have spent a combin...,cheap cheap stock buy back approach us$ 10 bil...,hong kong list company spend combine hk$178 bi...,You are an experienced financial advisor based...,You are an experienced financial advisor based...
2,Hong Kong’s IPO environment toughest in over a...,2023-09-22 18:15:24+08:00,Hong Kong’s public fundraising environment is ...,hong kong ipo environment toughest decade rise...,hong kong public fundraising environment tough...,You are an experienced financial advisor based...,You are an experienced financial advisor based...
3,Shanghai to hasten expansion of deep water por...,2023-09-22 21:51:23+08:00,The expansion of deepwater terminals at Yangsh...,shanghai hasten expansion deep water port yang...,expansion deepwater terminal yangshan island g...,You are an experienced financial advisor based...,You are an experienced financial advisor based...
4,China Evergrande applies for trading to resume...,2023-10-02 22:12:09+08:00,China Evergrande Group has applied for trading...,china evergrande apply trading resume hong kon...,china evergrande group apply trading share res...,You are an experienced financial advisor based...,You are an experienced financial advisor based...


In [90]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 688 entries, 0 to 687
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           688 non-null    object
 1   published       688 non-null    object
 2   summary         688 non-null    object
 3   clean_title     688 non-null    object
 4   clean_summary   688 non-null    object
 5   title_prompt    688 non-null    object
 6   summary_prompt  688 non-null    object
dtypes: object(7)
memory usage: 37.8+ KB


In [91]:

#GEMMA

def predict_title(news, model, tokenizer):
    y_pred_title = []

    for i in tqdm(range(len(news))):
        # Generate prompt from title
        prompt = news.iloc[i]["title_prompt"]
        input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

        # Predict response
        with torch.no_grad():
            outputs = model.generate(**input_ids, max_new_tokens=1, temperature=0.0)

        # Parse response
        result = tokenizer.decode(outputs[0])
        answer = result.split("=")[-1].lower()

        # Map to sentiment
        if "positive" in answer:
            y_pred_title.append("positive")
        elif "negative" in answer:
            y_pred_title.append("negative")
        elif "neutral" in answer:
            y_pred_title.append("neutral")
        else:
            y_pred_title.append("none")

    # Add predictions as a new column
    news['pred_title'] = y_pred_title

def predict_summary(news, model, tokenizer):
    y_pred_summary = []

    for i in tqdm(range(len(news))):
        # Generate prompt from summary
        prompt = news.iloc[i]["summary_prompt"]
        input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

        # Predict response
        with torch.no_grad():
            outputs = model.generate(**input_ids, max_new_tokens=1, temperature=0.0)

        # Parse response
        result = tokenizer.decode(outputs[0])
        answer = result.split("=")[-1].lower()

        # Map to sentiment
        if "positive" in answer:
            y_pred_summary.append("positive")
        elif "negative" in answer:
            y_pred_summary.append("negative")
        elif "neutral" in answer:
            y_pred_summary.append("neutral")
        else:
            y_pred_summary.append("none")

    # Add predictions as a new column
    news['pred_summary'] = y_pred_summary

In [92]:
'''
#ROBERTA TRAINED
# Prediction functions
def predict_title(news, model, tokenizer):
    y_pred_title = []

    for i in tqdm(range(len(news))):
        # Get the title directly from the DataFrame
        prompt = news.iloc[i]['title']
        input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

        # Get the model's prediction
        with torch.no_grad():
            outputs = model(**input_ids)

        # Convert logits to probabilities
        probs = torch.nn.functional.softmax(outputs.logits[0], dim=0)

        # Interpret the probabilities to determine sentiment
        classes = ["negative", "neutral", "positive"]
        sentiment = classes[torch.argmax(probs).item()]
        y_pred_title.append(sentiment)

    return y_pred_title

def predict_summary(news, model, tokenizer):
    y_pred_summary = []

    for i in tqdm(range(len(news))):
        # Get the summary directly from the DataFrame
        prompt = news.iloc[i]['summary']
        input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

        # Get the model's prediction
        with torch.no_grad():
            outputs = model(**input_ids)

        # Convert logits to probabilities
        probs = torch.nn.functional.softmax(outputs.logits[0], dim=0)

        # Interpret the probabilities to determine sentiment
        classes = ["negative", "neutral", "positive"]
        sentiment = classes[torch.argmax(probs).item()]
        y_pred_summary.append(sentiment)

    return y_pred_summary
    
'''

'\n#ROBERTA TRAINED\n# Prediction functions\ndef predict_title(news, model, tokenizer):\n    y_pred_title = []\n\n    for i in tqdm(range(len(news))):\n        # Get the title directly from the DataFrame\n        prompt = news.iloc[i][\'title\']\n        input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")\n\n        # Get the model\'s prediction\n        with torch.no_grad():\n            outputs = model(**input_ids)\n\n        # Convert logits to probabilities\n        probs = torch.nn.functional.softmax(outputs.logits[0], dim=0)\n\n        # Interpret the probabilities to determine sentiment\n        classes = ["negative", "neutral", "positive"]\n        sentiment = classes[torch.argmax(probs).item()]\n        y_pred_title.append(sentiment)\n\n    return y_pred_title\n\ndef predict_summary(news, model, tokenizer):\n    y_pred_summary = []\n\n    for i in tqdm(range(len(news))):\n        # Get the summary directly from the DataFrame\n        prompt = news.iloc[i][\'summary\'

In [93]:
y_pred_title = predict_title(news, model, tokenizer)

100%|██████████| 688/688 [08:19<00:00,  1.38it/s]


In [95]:
y_pred_summary = predict_summary(news, model, tokenizer)

100%|██████████| 688/688 [08:19<00:00,  1.38it/s]


In [97]:
news.head()

Unnamed: 0,title,published,summary,clean_title,clean_summary,title_prompt,summary_prompt,pred_title,pred_summary
0,"SMIC, Tencent, JD.com lead Hong Kong stock los...",2023-09-07 10:26:22+08:00,The Hang Seng Index weakened for a third day a...,smic tencent jd.com lead hong kong stock loss ...,hang seng index weaken day month weak china tr...,You are an experienced financial advisor based...,You are an experienced financial advisor based...,negative,negative
1,Cheap and cheaper: stock buy-backs approaching...,2023-09-19 19:37:59+08:00,Hong Kong-listed companies have spent a combin...,cheap cheap stock buy back approach us$ 10 bil...,hong kong list company spend combine hk$178 bi...,You are an experienced financial advisor based...,You are an experienced financial advisor based...,negative,negative
2,Hong Kong’s IPO environment toughest in over a...,2023-09-22 18:15:24+08:00,Hong Kong’s public fundraising environment is ...,hong kong ipo environment toughest decade rise...,hong kong public fundraising environment tough...,You are an experienced financial advisor based...,You are an experienced financial advisor based...,negative,negative
3,Shanghai to hasten expansion of deep water por...,2023-09-22 21:51:23+08:00,The expansion of deepwater terminals at Yangsh...,shanghai hasten expansion deep water port yang...,expansion deepwater terminal yangshan island g...,You are an experienced financial advisor based...,You are an experienced financial advisor based...,negative,positive
4,China Evergrande applies for trading to resume...,2023-10-02 22:12:09+08:00,China Evergrande Group has applied for trading...,china evergrande apply trading resume hong kon...,china evergrande group apply trading share res...,You are an experienced financial advisor based...,You are an experienced financial advisor based...,negative,negative


In [98]:
news.to_csv('pred_scmp.csv')

In [99]:
news_summary = news[['published','summary','pred_summary']]
news_title = news[['published','title','pred_title']]

In [100]:
news_summary.to_csv('news_summary.csv')
news_title.to_csv('news_title.csv')

In [101]:
news_summary.head()

Unnamed: 0,published,summary,pred_summary
0,2023-09-07 10:26:22+08:00,The Hang Seng Index weakened for a third day a...,negative
1,2023-09-19 19:37:59+08:00,Hong Kong-listed companies have spent a combin...,negative
2,2023-09-22 18:15:24+08:00,Hong Kong’s public fundraising environment is ...,negative
3,2023-09-22 21:51:23+08:00,The expansion of deepwater terminals at Yangsh...,positive
4,2023-10-02 22:12:09+08:00,China Evergrande Group has applied for trading...,negative


In [102]:
def compare_columns(news):
    """Compare the 'pred_summary' and 'pred_title' columns for exact matches."""
    return news['pred_summary'].equals(news['pred_title'])


In [103]:
compare_columns(news)

False