In [None]:
import newspaper
from newspaper import Article
import yfinance as yf
import os
import csv
from datetime import datetime, timedelta
from functools import cache
import pandas as pd
from MasterLogic import master_input

# Persistent storage file for visited links
VISITED_LINKS_FILE = "visited_links.csv"

# Load visited links from file
def load_visited_links():
    if os.path.exists(VISITED_LINKS_FILE):
        with open(VISITED_LINKS_FILE, "r") as file:
            print('done LVL 1')
            return set(row[0] for row in csv.reader(file))
    return set()

# Save a single visited link to file
def save_visited_link(link):
    print('doing LVL 2')
    with open(VISITED_LINKS_FILE, "a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow([link])
        print('done')

# Analyze the article (placeholder for actual implementation)
def analyze_article(title, text, publish_date, url, source_brand):
    # If CSV file is empty, add headers
    if not os.path.exists("news.csv") or os.stat('news.csv').st_size == 0:
        with open('news.csv', 'a') as f:
            writer = csv.writer(f)
            writer.writerow(["title", "text", "publish_date", "url", "source_brand"])
            print('done')
    
    # Save to CSV dataframe
    with open('news.csv', 'a') as f:
        writer = csv.writer(f)
        writer.writerow([title, text, publish_date, url, source_brand])
    
    # When we have 20 articles, call the run_model function
    df = pd.read_csv('news.csv')
    if len(df) >= 20:
        predictions = master_input(df)
        current = pd.read_csv('analysis.csv')
        merged = pd.concat([current, predictions], ignore_index=True)
        merged.to_csv('analysis.csv', index=False)
        # Clear the file
        open('news.csv', 'w').close()
        print('done')
    
def predict(title, text):
    new_data = pd.DataFrame({
    'title': [title],
    'text': [text]
    })
    return np.round(pipeline.predict(new_data),2)

# Process articles from a given source
def process_articles(source, visited_links):
    for article in source.articles:
        if article.url in visited_links:
            continue

        try:
            article.download()
            article.parse()

            # Add to visited links and save immediately
            visited_links.add(article.url)
            save_visited_link(article.url)

            # Call the analyze function
            analyze_article(
                title=article.title,
                text=article.text,
                publish_date=article.publish_date,
                url=article.url,
                source_brand=source.brand
            )
        except Exception as e:
            print(f"Failed to process article {article.url}: {e}")



    

def process_links(links):
    news_item = {
        "title": [],
        "text": [],
        "publish_date": [],
        "url": [],
        "tickers": [],
        "predict": []
    }

    for link in links:
        news_article = Article(link)
        try:
            news_article.download()
            news_article.parse()
            
            # Ensure we always append values for every key
            title = news_article.title if news_article.title else 'Dummy Title'
            text = news_article.text if news_article.text else 'Dummy Text'
            publish_date = news_article.publish_date if news_article.publish_date else 'Dummy Date'
            tickers = find_companies(news_article.text.replace('popular','').replace('advantage', '').replace('root', '').replace('liquidity').replace('repay'))  # Get tickers from the article

            if not tickers:  
                tickers = ['Dummy Co.']  # Ensure at least one placeholder value

            prediction = predict(news_article.title, news_article.text)
            
            # Append values
            for ticker in tickers:
                news_item["title"].append(title)
                news_item["text"].append(text)
                news_item["publish_date"].append(publish_date)
                news_item["url"].append(link)
                news_item["tickers"].append(ticker)  # Now ensures a single ticker per row
                news_item["predict"].append(prediction)  # Ensure one prediction per ticker

        except Exception as e:
            print(f"Failed to process {link}: {e}")

    return news_item

        
        




# List of financial news sources
financial_news_sources = [
    'https://www.cnbc.com',                     # CNBC
    #'https://www.reuters.com/finance',         # Reuters
    #'https://www.bloomberg.com',               # Bloomberg
    #'https://www.marketwatch.com',             # MarketWatch
    #'https://www.ft.com/',                     # Financial Times
    #'https://www.investing.com/',              # Investing.com
    #'https://www.forbes.com/finance/',         # Forbes - Finance Section
    #'https://www.theguardian.com/business',     # The Guardian - Business Section
]

stocks_data = pd.read_csv("stocks.csv", header=None)  # Assuming the file has no headers

# Extract tickers from the second column
stock_tickers = stocks_data.iloc[:, 1].dropna().astype(str).unique()

def main():
    # Adjust the target date as needed (e.g., two days ago)
    today = datetime.today().date() - timedelta(days=2)
    
    # Load visited links to avoid reprocessing
    visited_links = load_visited_links()
    print(len(visited_links))
    print("Starting news scraping...")

    # Process general news sources (top financial news)
    '''
    for source_url in financial_news_sources:
        print(f"Processing source: {source_url}")
        try:
            source = newspaper.build(source_url, memoize_articles=False)
            process_articles(source, visited_links)
        except Exception as e:
            print(f"Failed to process source {source_url}: {e}")
    
    print('--Done Scraping---')'
    '''

    output = process_links(list(visited_links))

    print(visited_links)

    print('Title', len(output['title']))
    print('Text', len(output['text']))
    print('url', len(output['url']))
    print('predict', len(output['predict']))
    print('tickers', len(output['tickers']))

    df = pd.DataFrame(output)
    df.to_csv('dummy_analysis.csv')


if __name__ == "__main__":
    main()


done LVL 1
360
Starting news scraping...
Failed to process https://www.bloomberg.com/news/articles/2025-01-17/spacex-starship-explosion-triggers-faa-mishap-investigation?srnd=homepage-americas: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.bloomberg.com/news/articles/2025-01-17/spacex-starship-explosion-triggers-faa-mishap-investigation?srnd=homepage-americas on URL https://www.bloomberg.com/news/articles/2025-01-17/spacex-starship-explosion-triggers-faa-mishap-investigation?srnd=homepage-americas
Failed to process https://www.forbes.com/sites/antoniopequenoiv/2025/01/24/flood-watch-issued-for-fire-stricken-los-angeles-areas-heres-why-rain-might-be-an-issue/: Article `download()` failed with 403 Client Error: Max restarts limit reached for url: https://www.forbes.com/sites/antoniopequenoiv/2025/01/24/flood-watch-issued-for-fire-stricken-los-angeles-areas-heres-why-rain-might-be-an-issue/ on URL https://www.forbes.com/sites/antoniopequenoiv/2025/01/24

In [90]:

df = pd.read_excel('stocksGood.xlsx')
firms = list(df['Column3'])

def better_name(firm_name):
  return firm_name.replace("Inc.", "").replace('Holdings', '').replace('Corp.', '').replace('Solutions','').replace('Services','').replace("Corporation", "").replace(",", "").replace("PLC", "").replace("Ltd.", "").replace('Platforms', '').replace('News','').replace('Institution','').replace('research','').replace('Replace','').replace('news','').replace('advantage','').strip().lower()

name_to_ticker = df.assign(Column4 = df['Column3'].apply(better_name)).set_index('Column4')

def n_to_t(lst):
  return_lst = []
  lst = [x.lower() for x in lst]
  def helper(x):
    try:
      a = name_to_ticker.loc[x]['Column2']
      if (isinstance(a, str)):
        if ((a!='RSSS') & (a!='BPOP') & (a!='PINC') & (a!='AWRE') & (a!='NICE') & (a!='ADV') & (a!='BKNG') & (a!='DALN') & (a!='STHO')):
          return a
        else:
          return False
      else:
        return name_to_ticker.loc[x]['Column2'][-1]
    except:
      return False

  return_lst = list(map(lambda x: helper(x), lst))
  return_lst = list(filter(lambda x: x!=False, return_lst))
  return return_lst
  #except:
   # return None

In [58]:
n_to_t(['Amazon', 'liquidity'])

['AMZN', 'LQDT']

In [151]:
import pandas as pd

df = pd.read_csv('dummy_analysis.csv')
df = df[df['tickers'] != 'Dummy Co.']
df['percentpricechanges'] = df['predict']
df['headline'] = df['title']
df

Unnamed: 0.1,Unnamed: 0,title,text,publish_date,url,tickers,predict,percentpricechanges,headline
5,5,Best bad credit personal loans of April 2025,Looking to consolidate debt or make home impro...,2021-03-18 21:00:08+00:00,https://www.cnbc.com/select/best-personal-loan...,repay,[1.62567],[1.62567],Best bad credit personal loans of April 2025
6,6,Best bad credit personal loans of April 2025,Looking to consolidate debt or make home impro...,2021-03-18 21:00:08+00:00,https://www.cnbc.com/select/best-personal-loan...,Upstart,[1.62567],[1.62567],Best bad credit personal loans of April 2025
20,20,Costco checks plenty of key boxes for investor...,Costco on Thursday reported another sturdy — i...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/costco-checks-...,Amazon,[-2.4287596],[-2.4287596],Costco checks plenty of key boxes for investor...
21,21,Costco checks plenty of key boxes for investor...,Costco on Thursday reported another sturdy — i...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/costco-checks-...,Costco Wholesale,[-2.4287596],[-2.4287596],Costco checks plenty of key boxes for investor...
24,24,Broadcom shares soar 16% as earnings top estim...,Broadcom reported first-quarter earnings on Th...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/broadcom-avgo-...,Broadcom,[1.9902283],[1.9902283],Broadcom shares soar 16% as earnings top estim...
...,...,...,...,...,...,...,...,...,...
254,254,MongoDB plummets nearly 27% for worst day ever...,MongoDB shares cratered more than 26.9% for th...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/mongodb-shares...,MongoDB,[-6.592944],[-6.592944],MongoDB plummets nearly 27% for worst day ever...
256,256,Analysts are upping bets on these stocks as th...,There is a small cohort of stocks that have dr...,2025-03-29 00:00:00,https://www.cnbc.com/2025/03/29/analysts-are-u...,Nvidia,[-11.728099],[-11.728099],Analysts are upping bets on these stocks as th...
257,257,Analysts are upping bets on these stocks as th...,There is a small cohort of stocks that have dr...,2025-03-29 00:00:00,https://www.cnbc.com/2025/03/29/analysts-are-u...,Warner Bros. Discovery,[-11.728099],[-11.728099],Analysts are upping bets on these stocks as th...
258,258,Analysts are upping bets on these stocks as th...,There is a small cohort of stocks that have dr...,2025-03-29 00:00:00,https://www.cnbc.com/2025/03/29/analysts-are-u...,Intel,[-11.728099],[-11.728099],Analysts are upping bets on these stocks as th...


In [193]:
df2_ = df.groupby('url').agg({
    'headline': lambda x: max(x),
    'text': lambda x: summarize(max(x)),
    'publish_date': lambda x: max(x),
    'tickers': lambda x: ','.join(map(str, n_to_t(x.to_list())[:4])),  # Convert to string
    'predict': lambda x: ','.join(map(lambda y: str(round(y, 4)), x[:4]))  # Convert rounded float to string
}).reset_index()

df2_[:16].to_csv('analysis7.csv')

  return name_to_ticker.loc[x]['Column2'][-1]


In [195]:
df2_[:16].to_csv('analysisFinal.csv')

In [189]:
df2.iloc[:15]

Unnamed: 0,url,headline,text,publish_date,tickers,predict
0,https://www.cnbc.com/2025/03/05/crowdstrike-sh...,CrowdStrike slumps more than 6% on weak earnin...,CrowdStrike shares fell 6.3% due to weak earni...,2025-03-05 00:00:00,CRWD,-0.1544
1,https://www.cnbc.com/2025/03/05/foot-locker-fl...,Foot Locker results show the sneaker industry ...,Foot Locker expects continued deep sneaker dis...,2025-03-05 00:00:00,"CROX,BANR","0.8567,0.8477"
2,https://www.cnbc.com/2025/03/05/mongodb-mdb-q4...,MongoDB shares sink after company issues weak ...,MongoDB shares dropped 16% due to weak guidanc...,2025-03-05 00:00:00,MDB,0.0206
3,https://www.cnbc.com/2025/03/06/broadcom-avgo-...,Broadcom shares soar 16% as earnings top estim...,Broadcom beat earnings and revenue estimates w...,2025-03-06 00:00:00,AVGO,0.2835
4,https://www.cnbc.com/2025/03/06/broadcom-q1-ea...,Broadcom's report gives the battered AI trade ...,Broadcom's strong earnings and AI guidance que...,2025-03-06 00:00:00,"GOOGL,AVGO,INTC,AAPL","0.118,0.1365,0.116,0.1205"
5,https://www.cnbc.com/2025/03/06/costco-checks-...,Costco checks plenty of key boxes for investor...,"Costco's revenue beat expectations, showing lo...",2025-03-06 00:00:00,"AMZN,COST","-0.0476,-0.0558"
6,https://www.cnbc.com/2025/03/06/gap-gap-earnin...,Gap shares spike 17% as retailer blows away ex...,"Gap's turnaround, led by CEO Dickson, exceeded...",2025-03-06 00:00:00,"MAT,BANR","-0.0743,-0.0769"
7,https://www.cnbc.com/2025/03/06/hewlett-packar...,"HPE to cut 2,500 employees as stock slides 19%...",HPE shares fell 19% after weaker-than-expected...,2025-03-06 00:00:00,NVDA,0.7187
8,https://www.cnbc.com/2025/03/06/macys-m-q4-202...,Macy's turnaround hinges on revamping some sto...,Macy's mixed holiday sales fuel turnaround pre...,2025-03-06 00:00:00,BANR,0.4712
9,https://www.cnbc.com/2025/03/06/marvell-shares...,Marvell plunges nearly 20% as outlook falls sh...,Marvell shares dropped almost 20% due to disap...,2025-03-06 00:00:00,"MRVL,AVGO,AMZN,NVDA","-0.0163,0.0017,0.009,0.0089"


In [29]:
import google.generativeai as genai

def summarize(text):
    genai.configure(api_key="AIzaSyD3ooJaVA_oHbfmi2CCIZlbbkPC1vdYmDw")

    prompt = f"Summarize the text to less than 20 words : {text[:400]}"

    try:
        model = genai.GenerativeModel('gemini-2.0-flash')
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"Error generating summary: {str(e)}"

In [92]:
df

Unnamed: 0.1,Unnamed: 0,title,text,publish_date,url,tickers,predict,percentpricechanges,headline
5,5,Best bad credit personal loans of April 2025,Looking to consolidate debt or make home impro...,2021-03-18 21:00:08+00:00,https://www.cnbc.com/select/best-personal-loan...,repay,[1.62567],[1.62567],Best bad credit personal loans of April 2025
6,6,Best bad credit personal loans of April 2025,Looking to consolidate debt or make home impro...,2021-03-18 21:00:08+00:00,https://www.cnbc.com/select/best-personal-loan...,Upstart,[1.62567],[1.62567],Best bad credit personal loans of April 2025
20,20,Costco checks plenty of key boxes for investor...,Costco on Thursday reported another sturdy — i...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/costco-checks-...,Amazon,[-2.4287596],[-2.4287596],Costco checks plenty of key boxes for investor...
21,21,Costco checks plenty of key boxes for investor...,Costco on Thursday reported another sturdy — i...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/costco-checks-...,Costco Wholesale,[-2.4287596],[-2.4287596],Costco checks plenty of key boxes for investor...
24,24,Broadcom shares soar 16% as earnings top estim...,Broadcom reported first-quarter earnings on Th...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/broadcom-avgo-...,Broadcom,[1.9902283],[1.9902283],Broadcom shares soar 16% as earnings top estim...
...,...,...,...,...,...,...,...,...,...
254,254,MongoDB plummets nearly 27% for worst day ever...,MongoDB shares cratered more than 26.9% for th...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/mongodb-shares...,MongoDB,[-6.592944],[-6.592944],MongoDB plummets nearly 27% for worst day ever...
256,256,Analysts are upping bets on these stocks as th...,There is a small cohort of stocks that have dr...,2025-03-29 00:00:00,https://www.cnbc.com/2025/03/29/analysts-are-u...,Nvidia,[-11.728099],[-11.728099],Analysts are upping bets on these stocks as th...
257,257,Analysts are upping bets on these stocks as th...,There is a small cohort of stocks that have dr...,2025-03-29 00:00:00,https://www.cnbc.com/2025/03/29/analysts-are-u...,Warner Bros. Discovery,[-11.728099],[-11.728099],Analysts are upping bets on these stocks as th...
258,258,Analysts are upping bets on these stocks as th...,There is a small cohort of stocks that have dr...,2025-03-29 00:00:00,https://www.cnbc.com/2025/03/29/analysts-are-u...,Intel,[-11.728099],[-11.728099],Analysts are upping bets on these stocks as th...


In [152]:
#df.apply(lambda x: calculate_volume_ratio([n_to_t(x['tickers']), x['publish_date']]), axis=1)
df['ticks'] = df['tickers'].apply(lambda x: [x]).apply(lambda x: str(n_to_t(x))[2:-2])
df['ticks']

ser = df.apply(lambda x: calculate_volume_ratio([x['ticks'], x['publish_date']]), axis=1)

  return name_to_ticker.loc[x]['Column2'][-1]
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['RPAY']: YFPricesMissingError('possibly delisted; no price data found  (60m 2021-02-15 -> 2021-03-18) (Yahoo error = "1h data not available for startTime=1613365200 and endTime=1616040000. The requested range must be within the last 730 days.")')
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['UPST']: YFPricesMissingError('possibly delisted; no price data found  (60m 2021-02-15 -> 2021-03-18) (Yahoo error = "1h data not available for startTime=1613365200 and endTime=1616040000. The requested range must be within the last 730 days.")')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[***

In [118]:
from transformers import pipeline
pipe1 = pipeline("text-classification", model="ProsusAI/finbert")


def finbert(text):
        
    text = text[:1500]
    d = pipe1(text)[0]
    if (d['label']=='neutral'):
        return 0
    elif (d['label']=='positive'):
        return d['score']
    else:
        return -d['score']

Device set to use mps:0


In [153]:
df['vol_prop'] = ser

df = df.dropna()

df['finbert_score'] = df['title'].apply(finbert)

df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['finbert_score'] = df['title'].apply(finbert)


Unnamed: 0.1,Unnamed: 0,title,text,publish_date,url,tickers,predict,percentpricechanges,headline,ticks,vol_prop,finbert_score
20,20,Costco checks plenty of key boxes for investor...,Costco on Thursday reported another sturdy — i...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/costco-checks-...,Amazon,[-2.4287596],[-2.4287596],Costco checks plenty of key boxes for investor...,AMZN,1.462312,0.000000
21,21,Costco checks plenty of key boxes for investor...,Costco on Thursday reported another sturdy — i...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/costco-checks-...,Costco Wholesale,[-2.4287596],[-2.4287596],Costco checks plenty of key boxes for investor...,COST,1.779808,0.000000
24,24,Broadcom shares soar 16% as earnings top estim...,Broadcom reported first-quarter earnings on Th...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/broadcom-avgo-...,Broadcom,[1.9902283],[1.9902283],Broadcom shares soar 16% as earnings top estim...,AVGO,2.899611,0.929990
28,28,MongoDB shares sink after company issues weak ...,MongoDB shares sank 16% in extended trading We...,2025-03-05 00:00:00,https://www.cnbc.com/2025/03/05/mongodb-mdb-q4...,MongoDB,[-0.9561062],[-0.9561062],MongoDB shares sink after company issues weak ...,MDB,3.254478,-0.943950
29,29,Bitcoin miners stockpile coins to ride out pro...,"Roula Khalaf, Editor of the FT, selects her fa...",2025-01-07 04:00:27.594000+00:00,https://www.ft.com/content/0cfbe43c-7c2a-40bf-...,Riot,[-7.6308546],[-7.6308546],Bitcoin miners stockpile coins to ride out pro...,RIOT,0.734256,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
223,223,"American Eagle says consumer is slowing down, ...",American Eagle warned investors on Wednesday t...,2025-03-12 00:00:00,https://www.cnbc.com/2025/03/12/american-eagle...,banner,[4.728269],[4.728269],"American Eagle says consumer is slowing down, ...",BANR,1.492731,-0.950710
232,232,FedEx cuts full-year results forecast on 'unce...,A FedEx plane prepares to leave the FedEx Carg...,2025-03-20 00:00:00,https://www.cnbc.com/2025/03/20/fedex-cuts-ful...,Amazon.com,[-5.312107],[-5.312107],FedEx cuts full-year results forecast on 'unce...,AMZN,1.035861,-0.958023
236,236,Macy's turnaround hinges on revamping some sto...,Macy's delivered another quarter of mixed resu...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/macys-m-q4-202...,banner,[-1.0178545],[-1.0178545],Macy's turnaround hinges on revamping some sto...,BANR,1.732528,0.575262
237,237,CoreWeave CEO says debt is 'the fuel for this ...,CoreWeave CEO Michael Intrator unpacked the cl...,2025-03-28 00:00:00,https://www.cnbc.com/2025/03/28/coreweave-ceo-...,Nvidia,[3.101265],[3.101265],CoreWeave CEO says debt is 'the fuel for this ...,NVDA,0.840527,0.000000


In [182]:
df['predict'] = pipeline.predict(df.loc[:, ['title', 'text', 'vol_prop', 'finbert_score']])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predict'] = pipeline.predict(df.loc[:, ['title', 'text', 'vol_prop', 'finbert_score']])


Unnamed: 0.1,Unnamed: 0,title,text,publish_date,url,tickers,predict,percentpricechanges,headline,ticks,vol_prop,finbert_score
20,20,Costco checks plenty of key boxes for investor...,Costco on Thursday reported another sturdy — i...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/costco-checks-...,Amazon,-0.047647,[-2.4287596],Costco checks plenty of key boxes for investor...,AMZN,1.462312,0.000000
21,21,Costco checks plenty of key boxes for investor...,Costco on Thursday reported another sturdy — i...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/costco-checks-...,Costco Wholesale,-0.055805,[-2.4287596],Costco checks plenty of key boxes for investor...,COST,1.779808,0.000000
24,24,Broadcom shares soar 16% as earnings top estim...,Broadcom reported first-quarter earnings on Th...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/broadcom-avgo-...,Broadcom,0.283536,[1.9902283],Broadcom shares soar 16% as earnings top estim...,AVGO,2.899611,0.929990
28,28,MongoDB shares sink after company issues weak ...,MongoDB shares sank 16% in extended trading We...,2025-03-05 00:00:00,https://www.cnbc.com/2025/03/05/mongodb-mdb-q4...,MongoDB,0.020575,[-0.9561062],MongoDB shares sink after company issues weak ...,MDB,3.254478,-0.943950
29,29,Bitcoin miners stockpile coins to ride out pro...,"Roula Khalaf, Editor of the FT, selects her fa...",2025-01-07 04:00:27.594000+00:00,https://www.ft.com/content/0cfbe43c-7c2a-40bf-...,Riot,-0.255700,[-7.6308546],Bitcoin miners stockpile coins to ride out pro...,RIOT,0.734256,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
223,223,"American Eagle says consumer is slowing down, ...",American Eagle warned investors on Wednesday t...,2025-03-12 00:00:00,https://www.cnbc.com/2025/03/12/american-eagle...,banner,0.311031,[4.728269],"American Eagle says consumer is slowing down, ...",BANR,1.492731,-0.950710
232,232,FedEx cuts full-year results forecast on 'unce...,A FedEx plane prepares to leave the FedEx Carg...,2025-03-20 00:00:00,https://www.cnbc.com/2025/03/20/fedex-cuts-ful...,Amazon.com,-0.855974,[-5.312107],FedEx cuts full-year results forecast on 'unce...,AMZN,1.035861,-0.958023
236,236,Macy's turnaround hinges on revamping some sto...,Macy's delivered another quarter of mixed resu...,2025-03-06 00:00:00,https://www.cnbc.com/2025/03/06/macys-m-q4-202...,banner,0.471154,[-1.0178545],Macy's turnaround hinges on revamping some sto...,BANR,1.732528,0.575262
237,237,CoreWeave CEO says debt is 'the fuel for this ...,CoreWeave CEO Michael Intrator unpacked the cl...,2025-03-28 00:00:00,https://www.cnbc.com/2025/03/28/coreweave-ceo-...,Nvidia,0.234955,[3.101265],CoreWeave CEO says debt is 'the fuel for this ...,NVDA,0.840527,0.000000


In [36]:
from datetime import datetime, timedelta
import yfinance as yf
import pandas as pd
import pytz

def round_to_nearest_past_hour(dt):
    """
    Rounds the given datetime to the nearest past hour.
    If outside market hours (9:30 AM - 4:00 PM EST) or on a weekend, return None.
    Assumes the datetime is in EST.
    """
    # Check if the day is a weekend (Saturday = 5, Sunday = 6)
    if dt.weekday() >= 5:  # Saturday (5) or Sunday (6)
        return None

    # Define market open and close times in EST
    market_open = dt.replace(hour=9, minute=30, second=0, microsecond=0)
    market_close = dt.replace(hour=16, minute=0, second=0, microsecond=0)

    minutes = (dt.minute // 30) * 30
    rounded_dt = dt.replace(minute=minutes, second=0, microsecond=0)

    # If it's before market open or after market close, return None
    if (rounded_dt==None):
    
        df = yf.download(ticker, start=rounded_dt.date() - timedelta(days=7), end=(rounded_dt).date(), interval='60m')

        # If data is available for the rounded date, return the rounded datetime
        if not df.empty:
            return rounded_dt

        # If no data for the rounded date, get last month's data to find the closest day
        start_date = (rounded_dt - timedelta(days=30)).strftime("%Y-%m-%d")
        df_last_month = yf.download(ticker, start=start_date, end=rounded_dt.strftime("%Y-%m-%d"))

        # If no data available for the last month, return None
        if df_last_month.empty:
            return None

        # Find the closest available trading day to the specified date
        closest_day = df_last_month[df_last_month.index <= rounded_dt].tail(1)
        print(closest_day)

        if closest_day.empty:
            return None

        # Return the datetime of the closest available trading day
        return closest_day.index[-1]

    return rounded_dt

def get_stock_data(ticker, start, end, interval="60m"):
    """Fetch stock data from Yahoo Finance."""
    #print(start, end)
    return yf.download(ticker, start=start, end=end, interval=interval)

def calculate_volume_ratio(lst):
    try:
        ticker = lst[0]
        
        dt = datetime.strptime(lst[1][:19], "%Y-%m-%d %H:%M:%S")
        """Calculate today's volume spike ratio based on historical data."""
        rounded_dt = round_to_nearest_past_hour(dt)
        if rounded_dt is None:
            return None
        
        # Define the time range
        start_of_day = rounded_dt.replace(hour=9, minute=30)
        start_date = (rounded_dt - timedelta(days=31)).date()
        end_date = rounded_dt.date()
        
        # Fetch historical data
        df = get_stock_data(ticker, start=start_date, end=end_date)
        #print(df)
        #print(df.head())
        if df.empty:
            return None
        
        # Ensure DatetimeIndex and filter market hours
        #df.index = df.index.tz_localize(None)
        #df = df.between_time("09:30", rounded_dt.strftime("%H:%M"))
        
        
        # Calculate today's cumulative volume till rounded time
        #print('Index dated : ', df.index.date)
        #print(df.index.date)
        #print('Round', rounded_dt.date())
        try:
            today_volume = yf.download(ticker, start=rounded_dt.date(), end=rounded_dt.date() + timedelta(days=1))['Volume'].iloc[0].values[0]
        except:
            return None
        #today_volume = df.loc[df.index.date == rounded_dt.date()]["Volume"].sum()
        #print(today_volume)
        
        # Calculate average volume from 9:30 AM to rounded time over last month
        #print(df)
        #print(df)
        past_volumes = []
        for date in df.index.date:
            if date < rounded_dt.date():
                past_volumes.append(df[df.index.date == date]["Volume"].sum())
            #print(past_volumes)
        
        avg_past_volume = sum(past_volumes) / len(past_volumes) if past_volumes else None
        avg_past_volume = avg_past_volume.iloc[0]
        #print(avg_past_volume)
        
        # Compute volume spike ratio
        if avg_past_volume and avg_past_volume > 0:
            #print('Today Volume : ', today_volume)
            return today_volume / avg_past_volume
        return None
    
    except:
        return None
    
#@lru_cache(maxsize=None)
def price_change(lst):
    try:
        ticker = lst[0]
        dt = datetime.strptime(lst[1][:19], "%Y-%m-%d %H:%M:%S")
        rounded_dt = round_to_nearest_past_hour(dt)
        if rounded_dt is None:
            return None
        dld = yf.download(ticker, start=rounded_dt.date(), end=rounded_dt.date() + timedelta(days=1))
        today_price_change = dld['Close'].iloc[0].values[0] - dld['Open'].iloc[0].values[0]
        return today_price_change / dld['Open'].iloc[0].values[0]
    except:
        return None



ticker = "AMZN"
dt = '2025-01-14 2:41:18'
ratio = calculate_volume_ratio([ticker, dt])
print(price_change([ticker, dt]))
print("Volume Ratio:", ratio)

#print(round_to_nearest_past_hour(datetime(2024, 3, 23, 18, 38)))
#x = round_to_nearest_past_hour(datetime(2024, 3, 23, 18, 38))


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

-0.012157539035060881
Volume Ratio: 1.4409292992235434





In [196]:
import joblib
import torch
from transformers import BertTokenizer, BertModel
from sklearn.base import BaseEstimator, TransformerMixin

# Re-define BERTEmbeddingTransformer before loading pickle

big_events = {
    "war": {"defense": 1, "supply_chain": -1},
    "drought": {"agriculture": -1},
    "chip shortage": {"tech": -1},
    "cyclone": {"insurance": -1, "infrastructure": -1},
    "terror attack": {"defense": 1},
    "high interest rates": {"finance": -1, "real_estate": -1},
    "low interest rates": {"finance": 1, "real_estate": 1}
}

# List of industries (sectors) we care about.
sectors = ["defense", "supply_chain", "agriculture", "tech", "insurance", "infrastructure", "finance", "real_estate"]

# Industry to stock mapping.
industry_to_stock = {
    "defense": {"LMT": 1, "BA": 1},             # e.g., Lockheed Martin, Boeing
    "supply_chain": {"UPS": -1, "FDX": -1},       # e.g., UPS, FedEx
    "agriculture": {"DE": -1},                    # e.g., Deere & Co.
    "tech": {"AAPL": -1, "GOOGL": -1, "MSFT": -1}, # e.g., Apple, Google, Microsoft
    "insurance": {"AIG": -1},                     # e.g., AIG
    "infrastructure": {"CAT": -1},                # e.g., Caterpillar
    "finance": {"JPM": -1},                       # e.g., JPMorgan
    "real_estate": {"AMT": 1}                     # e.g., American Tower
}


nlp_event = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp_event.vocab, attr="LOWER")
event_phrases = list(big_events.keys())
patterns = [nlp_event(text) for text in event_phrases]
matcher.add("BIG_EVENT", patterns)

def extract_events(text):
    """
    Extract all big event keywords found in the text.
    Returns a list of lowercase event strings.
    """
    doc = nlp_event(text)
    matches = matcher(doc)
    detected = set()
    for match_id, start, end in matches:
        span = doc[start:end]
        detected.add(span.text.lower())
    return list(detected)

class BERTEmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model_name='bert-base-uncased'):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None

    def load_model(self):
        if self.tokenizer is None or self.model is None:
            self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
            self.model = BertModel.from_pretrained(self.model_name)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        self.load_model()
        inputs = self.tokenizer(X.tolist(), padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state[:, 0, :].numpy()
    
class IndustryToStockTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, big_events, sectors, industry_to_stock):
        self.big_events = big_events
        self.sectors = sectors
        self.industry_to_stock = industry_to_stock
        # Get a sorted list of all stocks in the mapping.
        self.stocks = sorted({stock for effects in industry_to_stock.values() for stock in effects})
        # Build a mapping matrix (n_sectors x n_stocks)
        self.mapping_matrix = np.zeros((len(self.sectors), len(self.stocks)))
        for i, sector in enumerate(self.sectors):
            if sector in self.industry_to_stock:
                for j, stock in enumerate(self.stocks):
                    self.mapping_matrix[i, j] = self.industry_to_stock[sector].get(stock, 0)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # X is expected to be a pandas Series (text field, e.g., article title)
        event_effects = []
        for text in X:
            events = extract_events(text)
            # Initialize an effect vector for industries.
            vector = np.zeros(len(self.sectors))
            for event in events:
                if event in self.big_events:
                    effects = self.big_events[event]
                    for i, sector in enumerate(self.sectors):
                        if sector in effects:
                            vector[i] += effects[sector]
            event_effects.append(vector)
        event_effects = np.array(event_effects)  # shape: (n_samples, n_sectors)
        # Map the industry-level signal to stock-level signal:
        # (n_samples, n_sectors) dot (n_sectors, n_stocks) -> (n_samples, n_stocks)
        stock_effects = event_effects.dot(self.mapping_matrix)
        return stock_effects

# ✅ Now load the pipeline AFTER defining the class
pipeline = joblib.load('end_to_end_stock_event_model.pkl')
print("Pipeline loaded successfully!")


Pipeline loaded successfully!


In [52]:
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.language import Language
nlp = spacy.load('en_core_web_sm')
from functools import lru_cache

df = pd.read_excel('stocksGood.xlsx')
firms = list(df['Column3'])

def better_name(firm_name):
  return firm_name.replace("Inc.", "").replace('Holdings', '').replace('Corp.', '').replace('Solutions','').replace('Services','').replace("Corporation", "").replace(",", "").replace("PLC", "").replace("Ltd.", "").replace('Platforms', '').replace('News','').replace('Institution','').replace('research','').replace('Replace','').replace('news','').replace('advantage','').replace('popular','').strip().lower()

name_to_ticker = df.assign(Column4 = df['Column3'].apply(better_name)).set_index('Column4')

def n_to_t(lst):
  return_lst = []
  lst = [x.lower() for x in lst]
  def helper(x):
    try:
      a = name_to_ticker.loc[x]['Column2']
      if (isinstance(a, str)):
        if ((a!='RSSS') & (a!='BPOP') & (a!='PINC') & (a!='AWRE') & (a!='NICE') & (a!='ADV') & (a!='BKNG') & (a!='DALN') & (a!='STHO')):
          return a
        else:
          return False
      else:
        return name_to_ticker.loc[x]['Column2'][-1]
    except:
      return False

  return_lst = list(map(lambda x: helper(x), lst))
  return return_lst
  

def remove_rubbish(df):
    def remove_false(lst):
        try:
            if (False in lst):
                while False in lst:
                    lst.remove(False)
                return lst
            else:
                return lst
        except:
            return lst
    try:
        df['ticker'] = df['ticker'].apply(remove_false)
        df = df.dropna(axis='index', how='any')
        return df
    except:
        return None
    

company_names = firms
def preprocess_company_names(company_names):
    processed_names = []
    for name in company_names:
        core_name = name.replace("Inc.", "").replace('Holdings', '').replace('Corp.', '').replace('Solutions','').replace('Services','').replace("Corporation", "").replace(",", "").replace("PLC", "").replace("Ltd.", "").replace('Platforms', '').replace('News','').replace('Institution','').replace('research','').replace('Replace','').replace('news','').strip()
        processed_names.append(core_name)
    try:
        return list(set(processed_names)) 
    except:
        return None
core_company_names = preprocess_company_names(company_names)


matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

patterns = list(nlp.pipe(core_company_names))
matcher.add("COMPANY", patterns)

@Language.component("company_ner_component")
def company_ner_component(doc):
    matches = matcher(doc)
    spans = []
    for match_id, start, end in matches:
        span = Span(doc, start, end, label="COMPANY")
        spans.append(span)

    doc.ents = spacy.util.filter_spans(spans)
    return doc

nlp.add_pipe("company_ner_component", last=True)

def find_companies(text):
  doc = nlp(text)
  detected_firms = set()
  for ent in doc.ents:
    if (ent.label_ == "COMPANY"):
      detected_firms.add(ent.text)
  if (len(detected_firms) == 0):
    return None
  return list(detected_firms)

find_companies('Alphabet and Apple popular and Amazon was sued'.replace('popular','').replace('advantage', '').replace('root', ''))


['Alphabet', 'Apple', 'Amazon']