In [17]:
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import spacy

In [20]:
# NLP tools initialization
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

In [28]:
FINANCIAL_STOP_WORDS = """
    a about across after afterwards again all almost alone along
    already also although always am among amongst amount an and another any anyhow
    anyone anything anyway anywhere are as at

    be became because become becomes becoming been before beforehand behind
    being beside besides between both but by

    can cannot ca could

    did do does doing done due during

    each either else elsewhere empty enough even ever every
    everyone everything everywhere except

    few for former formerly from full
    further

    give

    had has have he hence her here hereafter hereby herein hereupon hers herself
    him himself his how however

    i if in indeed into is it its itself

    keep

    last latter latterly least less

    just

    made make many may me meanwhile might mine more moreover most mostly much
    must my myself

    name namely neither never nevertheless next no nobody none noone nor not
    nothing now nowhere

    of often on once only onto or other others otherwise our ours ourselves
    out own

    part per perhaps please put

    quite

    rather re really regarding

    same say see seem seemed seeming seems serious several she should show side
    since so some somehow someone something sometime sometimes somewhere still such

    take than that the their them themselves then thence there thereafter
    thereby therefore therein thereupon these they third this those though through
    throughout thus to together too toward towards

    under until unless upon us used using

    various very via was we well were what whatever when whence whenever where
    whereafter whereas whereby wherein whereupon wherever whether which while
    whither who whoever whole whom whose why will with within without would

    yet you your yours yourself yourselves
    """

CONTRACTIONS = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]

# Expanded list of positive words specific to financial news (preprocessed)
POSITIVE_WORDS = [
        "upturn", "bullish", "rally", "advance", "expansion", "breakthrough", 
        "record high", "lucrative", "prosperity", "fortune", "thrive", "inflow", 
        "rebound", "beat", "strategic alliance", "upbeat outlook", 
        "milestone", "partnership", "share buyback", "dividend raise", "ipo success", 
        "profit surge", "rise", "soar", "bull", "raise", "generate", "noteworthy",
        "surge", "radar", "phenomenal", "earn", "trend stock"
    ]

# Expanded list of negative words specific to financial news (preprocessed)
NEGATIVE_WORDS = [
        "decline", "fall", "bearish", "plunge", "slump", "downward", "concern", 
        "downturn", "outflow", "stagnation", "layoff", "bankruptcy", "underperform", 
        "volatility", "selloff", "sell-off", "downgrade", "recession fear", "shortfall", 
        "plummet", "bear market", "drop in value", "bankruptcy proceeding", 
        "cut forecast", "miss estimate", "downward pressure", "production cut", 
        "regulatory setback", "settlement charge", "supply chain disruption", "bear",
        "stock down", "step down", "stock falter"
    ]

POSITIVE_PERCENTAGE_WORDS = ["up", "surge", "rise", "add", "soar", "jump", "climb", "rocket", "race ahead", "yield over", "move"]

NEGATIVE_PERCENTAGE_WORDS = ["down", "fall", "decline", "decrease", "plunge", "drop", "dip", "-", "slide"]

INPUT_FILE_PATH = r"C:\Users\48531\Downloads\Neutral_news2_sample.csv"

OUTPUT_FILE_PATH = r"C:\Users\48531\Desktop\SentimentBIIB2.txt"

In [22]:
def create_stop_words():

    # Converting a string into a list of words and creating a set
    stop_words = set(FINANCIAL_STOP_WORDS.split())

    # Adding the most common forms of contractions
    stop_words.update(CONTRACTIONS)

    # Adding contractions with typographic apostrophes
    for apostrophe in ("‘", "’"):
        for stopword in CONTRACTIONS:
            stop_words.add(stopword.replace("'", apostrophe))
    
    return stop_words

# Using the function
stop_words_set = create_stop_words()

In [23]:
def preprocess_text(text):
    text = text.lower()  # Changing to lowercase letters
    text = re.sub(r"[^-a-zA-Z0-9+%\s]", "", text)  # Removing unnecessary characters
    
    doc = nlp(text)  # Tokenization
    # Lemmatization and stop-words elimination
    lemmatized_text = " ".join([token.lemma_ for token in doc if token.lemma_ not in stop_words_set])  
    
    return lemmatized_text.strip()

In [24]:
# Sentiment calculation function
def calculate_sentiment_vader_custom(raw_news, preprocessed_news):
    # Base sentiment score from VADER on raw text (without preprocessing)
    sentiment_score = sia.polarity_scores(raw_news)['compound']
    
    # Adjusting sentiment score based on presence of custom positive and negative words (preprocessed)
    sentiment_score += 0.1 * sum(word in preprocessed_news for word in POSITIVE_WORDS)
    sentiment_score -= 0.2 * sum(word in preprocessed_news for word in NEGATIVE_WORDS)
    
    # Checking the context of percentage changes
    if any(word in preprocessed_news for word in POSITIVE_PERCENTAGE_WORDS) and '%' in preprocessed_news:
        sentiment_score += 0.3
    if any(word in preprocessed_news for word in NEGATIVE_PERCENTAGE_WORDS) and '%' in preprocessed_news:
        sentiment_score -= 0.4
    
  # Limiting the result to the range [-1, 1]
    return max(min(sentiment_score, 1), -1)

In [None]:
# Data loading
df = pd.read_csv(INPUT_FILE_PATH, delimiter=',', usecols=['news_header', 'news_date'])

In [None]:
# Applying text preprocessing function
df['news_header_preprocessed'] = df['news_header'].apply(preprocess_text)

# Applying the sentiment analysis function
df['sentiment_vader'] = df.apply(lambda row: calculate_sentiment_vader_custom(row['news_header'], row['news_header_preprocessed']), axis=1)

# Removing the 'news_header_preprocessed' column after applying the sentiment function
df.drop(columns=['news_header_preprocessed'], inplace=True)

# Converting news_date to datetime
df['news_date'] = pd.to_datetime(df['news_date'])

In [36]:
df

Unnamed: 0,news_header,sentiment_vader
0,Netflix (NFLX) Rises Higher Than Market: Key F...,0.1
1,3 ETFs That Have Soared Past the S&P 500 in th...,0.1
2,Nvidia CEO Jensen Huang Describes Why Business...,0.0
3,"The Zacks Analyst Blog Highlights IBM, Amazon....",0.0
4,Spotify Technology and Lamb Weston have been h...,-0.1
...,...,...
95,Stocks Mixed After This Morning's Fed-Friendly...,0.0
96,"Dow Movers: AAPL, MCD",0.0
97,"2 Stock-Split AI Stocks Up 455% and 1,150% in ...",-0.1
98,"Stock Market News for Sep 16, 2024",0.0


In [None]:
# Saving results to CSV file
df.to_csv(OUTPUT_FILE_PATH, sep='|')