In [23]:
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.language import Language
nlp = spacy.load('en_core_web_sm')
from functools import lru_cache

In [24]:

df = pd.read_excel('stocksGood.xlsx')
firms = list(df['Column3'])

In [311]:
import pandas as pd
import numpy as np
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
from transformers import BertTokenizer, BertModel
import spacy
from spacy.matcher import PhraseMatcher

######################################
# 1. Define Mapping Dictionaries
######################################
# Big event dictionary: each key is an event with its industry-level effects.
big_events = {
    "war": {"defense": 1, "supply_chain": -1},
    "drought": {"agriculture": -1},
    "chip shortage": {"tech": -1},
    "cyclone": {"insurance": -1, "infrastructure": -1},
    "terror attack": {"defense": 1},
    "high interest rates": {"finance": -1, "real_estate": -1},
    "low interest rates": {"finance": 1, "real_estate": 1}
}

# List of industries (sectors) we care about.
sectors = ["defense", "supply_chain", "agriculture", "tech", "insurance", "infrastructure", "finance", "real_estate"]

# Industry to stock mapping.
industry_to_stock = {
    "defense": {"LMT": 1, "BA": 1},             # e.g., Lockheed Martin, Boeing
    "supply_chain": {"UPS": -1, "FDX": -1},       # e.g., UPS, FedEx
    "agriculture": {"DE": -1},                    # e.g., Deere & Co.
    "tech": {"AAPL": -1, "GOOGL": -1, "MSFT": -1}, # e.g., Apple, Google, Microsoft
    "insurance": {"AIG": -1},                     # e.g., AIG
    "infrastructure": {"CAT": -1},                # e.g., Caterpillar
    "finance": {"JPM": -1},                       # e.g., JPMorgan
    "real_estate": {"AMT": 1}                     # e.g., American Tower
}

######################################
# 2. Event Extraction via spaCy
######################################
# Load spaCy English model and build a PhraseMatcher for our event keywords.
nlp_event = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp_event.vocab, attr="LOWER")
event_phrases = list(big_events.keys())
patterns = [nlp_event(text) for text in event_phrases]
matcher.add("BIG_EVENT", patterns)

def extract_events(text):
    """
    Extract all big event keywords found in the text.
    Returns a list of lowercase event strings.
    """
    doc = nlp_event(text)
    matches = matcher(doc)
    detected = set()
    for match_id, start, end in matches:
        span = doc[start:end]
        detected.add(span.text.lower())
    return list(detected)
big_events = {
    "war": {"defense": 1, "supply_chain": -1},
    "drought": {"agriculture": -1},
    "chip shortage": {"tech": -1},
    "cyclone": {"insurance": -1, "infrastructure": -1},
    "terror attack": {"defense": 1},
    "high interest rates": {"finance": -1, "real_estate": -1},
    "low interest rates": {"finance": 1, "real_estate": 1}
}

# List of industries (sectors) we care about.
sectors = ["defense", "supply_chain", "agriculture", "tech", "insurance", "infrastructure", "finance", "real_estate"]

# Industry to stock mapping.
industry_to_stock = {
    "defense": {"LMT": 1, "BA": 1},             # e.g., Lockheed Martin, Boeing
    "supply_chain": {"UPS": -1, "FDX": -1},       # e.g., UPS, FedEx
    "agriculture": {"DE": -1},                    # e.g., Deere & Co.
    "tech": {"AAPL": -1, "GOOGL": -1, "MSFT": -1}, # e.g., Apple, Google, Microsoft
    "insurance": {"AIG": -1},                     # e.g., AIG
    "infrastructure": {"CAT": -1},                # e.g., Caterpillar
    "finance": {"JPM": -1},                       # e.g., JPMorgan
    "real_estate": {"AMT": 1}                     # e.g., American Tower
}

######################################
# 2. Event Extraction via spaCy
######################################
# Load spaCy English model and build a PhraseMatcher for our event keywords.
nlp_event = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp_event.vocab, attr="LOWER")
event_phrases = list(big_events.keys())
patterns = [nlp_event(text) for text in event_phrases]
matcher.add("BIG_EVENT", patterns)

def extract_events(text):
    """
    Extract all big event keywords found in the text.
    Returns a list of lowercase event strings.
    """
    doc = nlp_event(text)
    matches = matcher(doc)
    detected = set()
    for match_id, start, end in matches:
        span = doc[start:end]
        detected.add(span.text.lower())
    return list(detected)

######################################
# 3. Transformer: Map Events to Stock-Level Signal
######################################
class IndustryToStockTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, big_events, sectors, industry_to_stock):
        self.big_events = big_events
        self.sectors = sectors
        self.industry_to_stock = industry_to_stock
        # Get a sorted list of all stocks in the mapping.
        self.stocks = sorted({stock for effects in industry_to_stock.values() for stock in effects})
        # Build a mapping matrix (n_sectors x n_stocks)
        self.mapping_matrix = np.zeros((len(self.sectors), len(self.stocks)))
        for i, sector in enumerate(self.sectors):
            if sector in self.industry_to_stock:
                for j, stock in enumerate(self.stocks):
                    self.mapping_matrix[i, j] = self.industry_to_stock[sector].get(stock, 0)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # X is expected to be a pandas Series (text field, e.g., article title)
        event_effects = []
        for text in X:
            events = extract_events(text)
            # Initialize an effect vector for industries.
            vector = np.zeros(len(self.sectors))
            for event in events:
                if event in self.big_events:
                    effects = self.big_events[event]
                    for i, sector in enumerate(self.sectors):
                        if sector in effects:
                            vector[i] += effects[sector]
            event_effects.append(vector)
        event_effects = np.array(event_effects)  # shape: (n_samples, n_sectors)
        # Map the industry-level signal to stock-level signal:
        # (n_samples, n_sectors) dot (n_sectors, n_stocks) -> (n_samples, n_stocks)
        stock_effects = event_effects.dot(self.mapping_matrix)
        return stock_effects

######################################
# 4. BERT Embedding Transformer for Text
######################################
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

class BERTEmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, tokenizer, model):
        self.tokenizer = tokenizer
        self.model = model

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # X is expected to be a pandas Series.
        inputs = self.tokenizer(X.tolist(), padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state
        # Use the [CLS] token embedding for each sample.
        cls_embeddings = embeddings[:, 0, :]
        return cls_embeddings.numpy()

bert_transformer_title = BERTEmbeddingTransformer(tokenizer=tokenizer, model=model)

######################################
# 5. Data Preparation
######################################
# Here we assume you already have a DataFrame 'df' from your existing pipeline.
# For this example, we'll assume 'df' has the following columns:
# - 'title': Article title or text.
# - 'vol_prop': Numeric volume proportion.
# - 'finbert_score': Numeric sentiment score.
# - 'price_c': The target variable (short-term price change).
#
# Replace the following with your actual DataFrame.
# For demonstration, we use train_data4 that you previously prepared.
df = train_data3.copy()

######################################
# 6. Build the Pipeline
######################################
# Our pipeline will combine:
#  - BERT embedding of the title.
#  - Stock-level event signal extracted from the title.
#  - Other numeric features: vol_prop, finbert_score.
#
# We use a ColumnTransformer to process these different inputs.
preprocessor = ColumnTransformer(
    transformers=[
        ('bert_embedder_title', bert_transformer_title, 'title'),
        ('stock_effect', IndustryToStockTransformer(big_events, sectors, industry_to_stock), 'title'),
        ('passthrough', 'passthrough', ['vol_prop', 'finbert_score'])
    ]
)

# We choose an MLPRegressor (a neural network) for more non-linear modeling.
pipeline = make_pipeline(
    preprocessor,
    StandardScaler(with_mean=False),  # Use with_mean=False to support any sparse or non-dense outputs
    MLPRegressor(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42)
)

######################################
# 7. Training and Evaluation
######################################
X = df[['title', 'vol_prop', 'finbert_score']]
y = df['price_c']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)

# Optionally, save the trained pipeline.
joblib.dump(pipeline, 'end_to_end_stock_event_model.pkl')

######################################
# 8. Inference Example
######################################
# For example, an article about a war event or political win may not mention specific stocks,
# but our pipeline will extract the event, map it to industries and then to stocks.
example_dic = {
    'title': 'Trump wins the election amidst global tension and rising defense concerns',
    'vol_prop': 1.1,      # example numeric value; adjust as needed
    'finbert_score': 0.8  # example sentiment score; adjust as needed
}
example = pd.DataFrame(example_dic, index=[0])
predicted_change = pipeline.predict(example)
print("Predicted Price Change:", predicted_change)


Mean Absolute Error: 0.15502616422467314
Mean Squared Error: 0.04093554079634608
Predicted Price Change: [0.18438376]


In [310]:
example_dic ={
  "title": "Severe drought in California devastates crop yields and hits agricultural stocks hard",
  "vol_prop": 0.9,
  "finbert_score": -0.5
}
example = pd.DataFrame(example_dic, index=[0])
predicted_change = pipeline.predict(example)
print("Predicted Price Change:", predicted_change)

Predicted Price Change: [-0.99457815]


In [26]:
def better_name(firm_name):
  return firm_name.replace("Inc.", "").replace('Holdings', '').replace('Corp.', '').replace('Solutions','').replace('Services','').replace("Corporation", "").replace(",", "").replace("PLC", "").replace("Ltd.", "").replace('Platforms', '').replace('News','').replace('Institution','').replace('research','').replace('Replace','').replace('news','').replace('advantage','').strip().lower()

name_to_ticker = df.assign(Column4 = df['Column3'].apply(better_name)).set_index('Column4')

In [27]:
list(name_to_ticker['Column2']).index('GOOGL')
name_to_ticker.iloc[7]

Column1                 7
Column2              META
Column3    Meta Platforms
Name: meta, dtype: object

In [28]:
def n_to_t(lst):
  return_lst = []
  lst = [x.lower() for x in lst]
  def helper(x):
    try:
      a = name_to_ticker.loc[x]['Column2']
      if (isinstance(a, str)):
        if ((a!='RSSS') & (a!='BPOP') & (a!='PINC') & (a!='AWRE') & (a!='NICE') & (a!='ADV') & (a!='BKNG') & (a!='DALN') & (a!='STHO')):
          return a
        else:
          return False
      else:
        return name_to_ticker.loc[x]['Column2'][-1]
    except:
      return False

  return_lst = list(map(lambda x: helper(x), lst))
  return return_lst
  #except:
   # return None

In [29]:
n_to_t(['Amazon', 'repay', 'liquidity'])

['AMZN', 'RPAY', 'LQDT']

In [30]:
def remove_rubbish(df):
    def remove_false(lst):
        try:
            if (False in lst):
                while False in lst:
                    lst.remove(False)
                return lst
            else:
                return lst
        except:
            return lst
    try:
        df['ticker'] = df['ticker'].apply(remove_false)
        df = df.dropna(axis='index', how='any')
        return df
    except:
        return None


In [31]:
company_names = firms
def preprocess_company_names(company_names):
    processed_names = []
    for name in company_names:
        core_name = name.replace("Inc.", "").replace('Holdings', '').replace('Corp.', '').replace('Solutions','').replace('Services','').replace("Corporation", "").replace(",", "").replace("PLC", "").replace("Ltd.", "").replace('Platforms', '').replace('News','').replace('Institution','').replace('research','').replace('Replace','').replace('news','').strip()
        processed_names.append(core_name)
    try:
        return list(set(processed_names)) 
    except:
        return None
core_company_names = preprocess_company_names(company_names)

In [32]:
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

patterns = list(nlp.pipe(core_company_names))
matcher.add("COMPANY", patterns)

@Language.component("company_ner_component")
def company_ner_component(doc):
    matches = matcher(doc)
    spans = []
    for match_id, start, end in matches:
        span = Span(doc, start, end, label="COMPANY")
        spans.append(span)

    doc.ents = spacy.util.filter_spans(spans)
    return doc

nlp.add_pipe("company_ner_component", last=True)

<function __main__.company_ner_component(doc)>

In [33]:
nlp.to_disk('CompanyNER')

In [34]:
def find_companies(text):
  doc = nlp(text)
  detected_firms = set()
  for ent in doc.ents:
    if (ent.label_ == "COMPANY"):
      detected_firms.add(ent.text)
  if (len(detected_firms) == 0):
    return None
  return list(detected_firms)

In [35]:
find_companies('Alphabet and Apple and Amazon was sued')

['Amazon', 'Apple', 'Alphabet']

Over the scraped data 

In [36]:
financial_news = pd.read_csv("new-complete-scraped-articles.csv")
#b = financial_news[financial_news['title'].str.contains('Amazon')]

financial_news = financial_news.assign(firms = financial_news['title'].apply(find_companies))
financial_news = financial_news.dropna(axis='index', how='any')
financial_news = financial_news.assign(ticker = financial_news['firms'].apply(n_to_t))

  return name_to_ticker.loc[x]['Column2'][-1]


In [37]:
financial_news

Unnamed: 0,title,text,publish_date,url,source_brand,firms,ticker
14,Starboard takes a stake in Qorvo. How the acti...,Qorvo is a global semiconductor company that s...,2025-01-25 11:48:29,https://www.cnbc.com/2025/01/25/starboard-take...,www.cnbc.com,[Qorvo],[QRVO]
17,Alphabet shares close above $200 for first tim...,Alphabet shares closed at $200 per share for t...,2025-01-24 21:06:53,https://www.cnbc.com/2025/01/24/alphabet-share...,www.cnbc.com,[Alphabet],[GOOGL]
20,Zuckerberg sets Meta's AI targets for the year...,Meta CEO Mark Zuckerberg on Friday announced t...,2025-01-24 15:41:18,https://www.cnbc.com/2025/01/24/zuckerberg-set...,www.cnbc.com,[Meta],[META]
23,"Meta to begin testing ads on Threads, its micr...",Meta will begin testing ads on its Threads mic...,2025-01-24 18:54:36,https://www.cnbc.com/2025/01/24/meta-to-begin-...,www.cnbc.com,[Meta],[META]
29,Tesla starts sales of revamped Model Y in U.S....,Tesla will start deliveries of a revamped vers...,2025-01-24 05:26:58,https://www.cnbc.com/2025/01/23/tesla-starts-s...,www.cnbc.com,[Tesla],[TSLA]
...,...,...,...,...,...,...,...
5758,Q4 2024 Northwest Bancshares Inc Earnings Call,"Douglas Schosser\n\nThank you, Lou, and good m...",2025-01-25 08:14:13-05:00,https://finance.yahoo.com/news/q4-2024-northwe...,Yahoo Finance,[Northwest Bancshares],[NWBI]
5759,Northwest Bancshares (NWBI) Q4 Earnings: Takin...,Efficiency Ratio: 61.8% versus 64.6% estimated...,2025-01-24 17:30:09-05:00,https://finance.yahoo.com/news/northwest-bancs...,Yahoo Finance,[Northwest Bancshares],[NWBI]
5760,Northwest Bancshares (NWBI) Q4 Earnings and Re...,Sign in to access your portfolio\n\nSign in,2025-01-24 16:40:05-05:00,https://finance.yahoo.com/news/northwest-bancs...,Yahoo Finance,[Northwest Bancshares],[NWBI]
5761,Northwest Bancshares: Q4 Earnings Snapshot,"In This Article:\n\nCOLUMBUS, Ohio (AP) — COLU...",2025-01-24 15:48:19-05:00,https://finance.yahoo.com/news/northwest-bancs...,Yahoo Finance,[Northwest Bancshares],[NWBI]


In [38]:
train_data = financial_news.explode('ticker')

In [39]:
train_data = train_data.dropna(axis='index', how='any')

In [40]:
train_data['combined'] = train_data.apply(lambda row: [row['ticker'], row['publish_date']], axis=1)

In [41]:
train_data

Unnamed: 0,title,text,publish_date,url,source_brand,firms,ticker,combined
14,Starboard takes a stake in Qorvo. How the acti...,Qorvo is a global semiconductor company that s...,2025-01-25 11:48:29,https://www.cnbc.com/2025/01/25/starboard-take...,www.cnbc.com,[Qorvo],QRVO,"[QRVO, 2025-01-25 11:48:29]"
17,Alphabet shares close above $200 for first tim...,Alphabet shares closed at $200 per share for t...,2025-01-24 21:06:53,https://www.cnbc.com/2025/01/24/alphabet-share...,www.cnbc.com,[Alphabet],GOOGL,"[GOOGL, 2025-01-24 21:06:53]"
20,Zuckerberg sets Meta's AI targets for the year...,Meta CEO Mark Zuckerberg on Friday announced t...,2025-01-24 15:41:18,https://www.cnbc.com/2025/01/24/zuckerberg-set...,www.cnbc.com,[Meta],META,"[META, 2025-01-24 15:41:18]"
23,"Meta to begin testing ads on Threads, its micr...",Meta will begin testing ads on its Threads mic...,2025-01-24 18:54:36,https://www.cnbc.com/2025/01/24/meta-to-begin-...,www.cnbc.com,[Meta],META,"[META, 2025-01-24 18:54:36]"
29,Tesla starts sales of revamped Model Y in U.S....,Tesla will start deliveries of a revamped vers...,2025-01-24 05:26:58,https://www.cnbc.com/2025/01/23/tesla-starts-s...,www.cnbc.com,[Tesla],TSLA,"[TSLA, 2025-01-24 05:26:58]"
...,...,...,...,...,...,...,...,...
5758,Q4 2024 Northwest Bancshares Inc Earnings Call,"Douglas Schosser\n\nThank you, Lou, and good m...",2025-01-25 08:14:13-05:00,https://finance.yahoo.com/news/q4-2024-northwe...,Yahoo Finance,[Northwest Bancshares],NWBI,"[NWBI, 2025-01-25 08:14:13-05:00]"
5759,Northwest Bancshares (NWBI) Q4 Earnings: Takin...,Efficiency Ratio: 61.8% versus 64.6% estimated...,2025-01-24 17:30:09-05:00,https://finance.yahoo.com/news/northwest-bancs...,Yahoo Finance,[Northwest Bancshares],NWBI,"[NWBI, 2025-01-24 17:30:09-05:00]"
5760,Northwest Bancshares (NWBI) Q4 Earnings and Re...,Sign in to access your portfolio\n\nSign in,2025-01-24 16:40:05-05:00,https://finance.yahoo.com/news/northwest-bancs...,Yahoo Finance,[Northwest Bancshares],NWBI,"[NWBI, 2025-01-24 16:40:05-05:00]"
5761,Northwest Bancshares: Q4 Earnings Snapshot,"In This Article:\n\nCOLUMBUS, Ohio (AP) — COLU...",2025-01-24 15:48:19-05:00,https://finance.yahoo.com/news/northwest-bancs...,Yahoo Finance,[Northwest Bancshares],NWBI,"[NWBI, 2025-01-24 15:48:19-05:00]"


In [80]:
from datetime import datetime, timedelta
import yfinance as yf
import pandas as pd
import pytz

def round_to_nearest_past_hour(dt):
    """
    Rounds the given datetime to the nearest past hour.
    If outside market hours (9:30 AM - 4:00 PM EST) or on a weekend, return None.
    Assumes the datetime is in EST.
    """
    # Check if the day is a weekend (Saturday = 5, Sunday = 6)
    if dt.weekday() >= 5:  # Saturday (5) or Sunday (6)
        return None

    # Define market open and close times in EST
    market_open = dt.replace(hour=9, minute=30, second=0, microsecond=0)
    market_close = dt.replace(hour=16, minute=0, second=0, microsecond=0)

    minutes = (dt.minute // 30) * 30
    rounded_dt = dt.replace(minute=minutes, second=0, microsecond=0)

    # If it's before market open or after market close, return None
    if (rounded_dt==None):
    
        df = yf.download(ticker, start=rounded_dt.date() - timedelta(days=7), end=(rounded_dt).date(), interval='60m')

        # If data is available for the rounded date, return the rounded datetime
        if not df.empty:
            return rounded_dt

        # If no data for the rounded date, get last month's data to find the closest day
        start_date = (rounded_dt - timedelta(days=30)).strftime("%Y-%m-%d")
        df_last_month = yf.download(ticker, start=start_date, end=rounded_dt.strftime("%Y-%m-%d"))

        # If no data available for the last month, return None
        if df_last_month.empty:
            return None

        # Find the closest available trading day to the specified date
        closest_day = df_last_month[df_last_month.index <= rounded_dt].tail(1)
        print(closest_day)

        if closest_day.empty:
            return None

        # Return the datetime of the closest available trading day
        return closest_day.index[-1]

    return rounded_dt

def get_stock_data(ticker, start, end, interval="60m"):
    """Fetch stock data from Yahoo Finance."""
    #print(start, end)
    return yf.download(ticker, start=start, end=end, interval=interval)

def calculate_volume_ratio(lst):
    try:
        ticker = lst[0]
        
        dt = datetime.strptime(lst[1][:19], "%Y-%m-%d %H:%M:%S")
        """Calculate today's volume spike ratio based on historical data."""
        rounded_dt = round_to_nearest_past_hour(dt)
        if rounded_dt is None:
            return None
        
        # Define the time range
        start_of_day = rounded_dt.replace(hour=9, minute=30)
        start_date = (rounded_dt - timedelta(days=31)).date()
        end_date = rounded_dt.date()
        
        # Fetch historical data
        df = get_stock_data(ticker, start=start_date, end=end_date)
        #print(df)
        #print(df.head())
        if df.empty:
            return None
        
        # Ensure DatetimeIndex and filter market hours
        #df.index = df.index.tz_localize(None)
        #df = df.between_time("09:30", rounded_dt.strftime("%H:%M"))
        
        
        # Calculate today's cumulative volume till rounded time
        #print('Index dated : ', df.index.date)
        #print(df.index.date)
        #print('Round', rounded_dt.date())
        try:
            today_volume = yf.download(ticker, start=rounded_dt.date(), end=rounded_dt.date() + timedelta(days=1))['Volume'].iloc[0].values[0]
        except:
            return None
        #today_volume = df.loc[df.index.date == rounded_dt.date()]["Volume"].sum()
        #print(today_volume)
        
        # Calculate average volume from 9:30 AM to rounded time over last month
        #print(df)
        #print(df)
        past_volumes = []
        for date in df.index.date:
            if date < rounded_dt.date():
                past_volumes.append(df[df.index.date == date]["Volume"].sum())
            #print(past_volumes)
        
        avg_past_volume = sum(past_volumes) / len(past_volumes) if past_volumes else None
        avg_past_volume = avg_past_volume.iloc[0]
        #print(avg_past_volume)
        
        # Compute volume spike ratio
        if avg_past_volume and avg_past_volume > 0:
            #print('Today Volume : ', today_volume)
            return today_volume / avg_past_volume
        return None
    
    except:
        return None
    
#@lru_cache(maxsize=None)
def price_change(lst):
    try:
        ticker = lst[0]
        dt = datetime.strptime(lst[1][:19], "%Y-%m-%d %H:%M:%S")
        rounded_dt = round_to_nearest_past_hour(dt)
        if rounded_dt is None:
            return None
        dld = yf.download(ticker, start=rounded_dt.date(), end=rounded_dt.date() + timedelta(days=1))
        today_price_change = dld['Close'].iloc[0].values[0] - dld['Open'].iloc[0].values[0]
        return today_price_change / dld['Open'].iloc[0].values[0]
    except:
        return None



ticker = "AMZN"
dt = '2025-01-14 2:41:18'
ratio = calculate_volume_ratio([ticker, dt])
print(price_change([ticker, dt]))
print("Volume Ratio:", ratio)

#print(round_to_nearest_past_hour(datetime(2024, 3, 23, 18, 38)))
#x = round_to_nearest_past_hour(datetime(2024, 3, 23, 18, 38))


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

-0.012157539035060881
Volume Ratio: 1.4409292992235434





In [214]:
train_data = train_data.iloc[:1000].assign(price_c = train_data.iloc[:1000]['combined'].apply(price_change))
train_data1 = train_data.iloc[:1000].assign(vol_prop = train_data.iloc[:1000]['combined'].apply(calculate_volume_ratio))
#train_data2 = train_data1.iloc[:500].assign(price_percent = train_data1.iloc[:500]['combined'].apply(price_change))


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [216]:
sum(train_data1['price_c'].isna())

46

In [217]:
train_data1.dropna()

Unnamed: 0,title,text,publish_date,url,source_brand,firms,ticker,combined,price_c,vol_prop
17,Alphabet shares close above $200 for first tim...,Alphabet shares closed at $200 per share for t...,2025-01-24 21:06:53,https://www.cnbc.com/2025/01/24/alphabet-share...,www.cnbc.com,[Alphabet],GOOGL,"[GOOGL, 2025-01-24 21:06:53]",0.010651,1.764211
20,Zuckerberg sets Meta's AI targets for the year...,Meta CEO Mark Zuckerberg on Friday announced t...,2025-01-24 15:41:18,https://www.cnbc.com/2025/01/24/zuckerberg-set...,www.cnbc.com,[Meta],META,"[META, 2025-01-24 15:41:18]",0.017426,2.814496
23,"Meta to begin testing ads on Threads, its micr...",Meta will begin testing ads on its Threads mic...,2025-01-24 18:54:36,https://www.cnbc.com/2025/01/24/meta-to-begin-...,www.cnbc.com,[Meta],META,"[META, 2025-01-24 18:54:36]",0.017426,2.814496
29,Tesla starts sales of revamped Model Y in U.S....,Tesla will start deliveries of a revamped vers...,2025-01-24 05:26:58,https://www.cnbc.com/2025/01/23/tesla-starts-s...,www.cnbc.com,[Tesla],TSLA,"[TSLA, 2025-01-24 05:26:58]",-0.018989,0.948096
88,Netflix has big ambitions for live sport,"The HOLIDAY season is a time for family, food—...",2025-01-02 14:06:25,https://www.economist.com/business/2025/01/02/...,www.economist.com,[Netflix],NFLX,"[NFLX, 2025-01-02 14:06:25]",-0.009793,1.260126
105,"Tesla, Intel and the fecklessness of corporate...",SITTING ON THE board of a large American compa...,2024-12-12 13:39:21,https://www.economist.com/business/2024/12/12/...,www.economist.com,"[Intel, Tesla]",INTC,"[INTC, 2024-12-12 13:39:21]",0.037962,1.206777
105,"Tesla, Intel and the fecklessness of corporate...",SITTING ON THE board of a large American compa...,2024-12-12 13:39:21,https://www.economist.com/business/2024/12/12/...,www.economist.com,"[Intel, Tesla]",TSLA,"[TSLA, 2024-12-12 13:39:21]",-0.015865,0.996442
108,The PayPal Mafia is taking over America’s gove...,On the night of December 7th San Francisco’s P...,2024-12-10 17:55:20,https://www.economist.com/business/2024/12/10/...,www.economist.com,[PayPal],PYPL,"[PYPL, 2024-12-10 17:55:20]",-0.01312,0.896651
116,"Intel’s troubles deepen, as its boss makes an ...",When Pat GELSINGER took over as Intel’s chief ...,2024-12-02 21:26:20,https://www.economist.com/business/2024/12/02/...,www.economist.com,[Intel],INTC,"[INTC, 2024-12-02 21:26:20]",-0.036634,2.543274
128,Nvidia’s boss dismisses fears that AI has hit ...,"WHEN SAM ALTMAN, boss of OpenAI, posted a gnom...",2024-11-21 10:21:21,https://www.economist.com/business/2024/11/21/...,www.economist.com,[Nvidia],NVDA,"[NVDA, 2024-11-21 10:21:21]",-0.017944,2.177106


In [218]:
from transformers import pipeline
pipe1 = pipeline("text-classification", model="ProsusAI/finbert")


def finbert(text):
        
    text = text[:1500]
    d = pipe1(text)[0]
    if (d['label']=='neutral'):
        return 0
    elif (d['label']=='positive'):
        return d['score']
    else:
        return -d['score']

Device set to use mps:0


In [220]:
train_data3 = train_data1.dropna().assign(finbert_score = train_data1.dropna()['title'].apply(finbert)) 

In [318]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
from transformers import BertTokenizer, BertModel
import joblib 

# Load BERT tokenizer and model **before** creating transformers
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Custom BERT transformer class
class BERTEmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, tokenizer, model):
        
        self.tokenizer = tokenizer
        self.model = model

    def fit(self, X, y=None):
        return self  

    def transform(self, X):
        inputs = self.tokenizer(X.tolist(), padding=True, truncation=True, return_tensors="pt")
        
        
        with torch.no_grad():
            outputs = self.model(**inputs)
        
        embeddings = outputs.last_hidden_state
        
        
        cls_embeddings = embeddings[:, 0, :]
        
        return cls_embeddings.numpy()



df = train_data3.dropna()

X = df[['title', 'vol_prop', 'finbert_score']]
y = df['price_c']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=2)


bert_transformer_title = BERTEmbeddingTransformer(tokenizer=tokenizer, model=model)
bert_transformer_text = BERTEmbeddingTransformer(tokenizer=tokenizer, model=model)

# Define ColumnTransformer
text_preprocessor = ColumnTransformer(
    transformers=[
        ('bert_embedder_title', bert_transformer_title, 'title'),
        #('bert_embedder_text', bert_transformer_text, 'text'),
        #('scaler', StandardScaler(), ['vol_prop'])
    ],
    remainder='passthrough'  
)

# Create the pipeline
pipeline = make_pipeline(
    text_preprocessor, 
    StandardScaler(with_mean=False), 
    LinearRegression() 
)

#Fit and test the pipeline
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error: ", mse)
joblib.dump(pipeline, 'regression_model_bert4.pkl')


Mean Absolute Error: 0.018372694372862833
Mean Squared Error:  0.0006453789109072577


['regression_model_bert4.pkl']

In [261]:
news = """

The Centers for Disease Control and Prevention is pulling back $11.4 billion in funds allocated in response to the pandemic to state and community health departments, nongovernment organizations and international recipients, the Department of Health and Human Services confirmed Tuesday.

"The COVID-19 pandemic is over, and HHS will no longer waste billions of taxpayer dollars responding to a non-existent pandemic that Americans moved on from years ago," HHS Director of Communications Andrew Nixon said in a statement. "HHS is prioritizing funding projects that will deliver on President Trump’s mandate to address our chronic disease epidemic and Make America Healthy Again."

HHS oversees 13 agencies, including the CDC, which is tasked with protecting the nation’s health. Notices began going out Monday, and awardees have 30 days to reconcile their expenditures. Figures are subject to change.

The funding slash comes on the heels of other cuts under new Health and Human Services Secretary Robert F. Kennedy Jr., including the canceling of hundreds of millions of dollars in grants for research into vaccine hesitancy, LGBTQ populations and supporting HIV prevention.

“Now that the pandemic is over, the grants and cooperative agreements are no longer necessary as their limited purpose has run out," read notices of termination sent to grantees Monday and shared with NBC News.

The federal public health emergency ended May 11, 2023, but more than 1.2 million people in the United States have died from Covid, according to the CDC. Though infection has slowed and the disease has become milder, hundreds of people still die every week from Covid, and long Covid symptoms continue to cause debilitating medical problems in some cases. 

The clawed-back funds were largely being used for Covid testing, vaccination, community health workers and initiatives to address Covid health disparities among high-risk and underserved populations, including racial and ethnic minority populations and rural communities, as well as global Covid projects, according to talking points CDC leadership emailed to agency departments Tuesday.

The CDC reviewed a list of HHS-provided Covid grants and cooperative agreements and identified the programs that were no longer needed, according to the talking points.

A list of projects for the rescinded funds was not immediately available.



"""

In [315]:
#example_dic = {'title': 'How Google and NVIDIA are teaming up to solve real-world problems with AI', 'text': news, 'vol_prop' : calculate_volume_ratio(['GOOGL', '2025-03-21 15:08:05']),'finbert_score' : finbert('How Google and NVIDIA are teaming up to solve real-world problems with AI'), 'Fin_high':[0]}#, 'vol_prop' : calculate_volume_ratio(['GOOGL', '2025-03-21 15:08:05'])}
example_dic = {'title': 'CDC is pulling back $11B in Covid funding sent to health departments across the U.S.', 'text': news, 'vol_prop' : 0.5,'finbert_score' : finbert('CDC is pulling back $11B in Covid funding sent to health departments across the U.S.')}#, 'vol_prop' : calculate_volume_ratio(['GOOGL', '2025-03-21 15:08:05'])}
example = pd.DataFrame(example_dic, index=[1])
example

Unnamed: 0,title,text,vol_prop,finbert_score
1,CDC is pulling back $11B in Covid funding sent...,\n\nThe Centers for Disease Control and Preven...,0.5,-0.969065


In [316]:
pipeline.predict(example)

array([-0.00081023])

In [317]:
model = pipeline.named_steps['linearregression']  

print("Model coefficients (weights):", model.coef_)

print("Model intercept:", model.intercept_)

Model coefficients (weights): [ 4.13659972e-05  3.19077170e-05 -1.59666817e-04  4.22299252e-05
 -1.03040594e-04 -2.25063987e-05 -7.82520279e-05  2.57180856e-04
  6.96297660e-05  5.64834218e-05  5.69636594e-06 -3.49561183e-05
 -5.55573738e-06 -1.73456099e-04 -1.33436078e-04 -5.82501501e-05
 -6.48916394e-05 -1.76691354e-06 -1.49201190e-04 -4.89907909e-05
 -1.91898763e-04  1.03540763e-04 -4.43385532e-05 -4.81169858e-06
  7.04232925e-05  1.18782451e-05 -6.86347367e-05 -1.80804950e-04
  1.52547926e-05  2.59482770e-04 -4.01426780e-05  1.31284411e-05
 -5.00428123e-05 -9.27782509e-05 -2.82264654e-05 -1.83523085e-04
 -5.51921974e-06 -9.96110142e-05 -8.38770673e-05 -1.87929923e-05
 -1.55918453e-05  1.77584166e-05  8.47620857e-05 -7.59815302e-05
 -2.06951968e-04  4.04047991e-05 -8.70436535e-05  3.15722950e-05
  9.74047657e-05  7.26470063e-05 -1.12331948e-04  3.79035574e-05
 -2.29523402e-04 -1.40429361e-04  9.12293914e-05  1.45866980e-04
  1.41930853e-04 -2.14381886e-05 -1.23324732e-04  5.75740027