In [1]:
!pip install wbdata transformers faiss-cpu rouge-score



In [9]:
import wbdata
import kagglehub
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import faiss
from rouge_score import rouge_scorer


In [5]:
# World Bank indicator codes mapped to the column labels used in the resulting frame
indicators = {
    "NY.GDP.MKTP.KD.ZG": "GDP Growth",
    "FP.CPI.TOTL.ZG": "Inflation",
    "NE.EXP.GNFS.CD": "Exports",
    "NE.IMP.GNFS.CD": "Imports",
    "SL.UEM.TOTL.ZS": "Unemployment",
}

# Three-letter country codes, grouped by the role each economy plays in the analysis
countries = [
    *("USA", "GBR", "DEU", "JPN"),  # Developed Economies
    *("CHN", "IND", "BRA", "ZAF"),  # Emerging Markets
    *("SGP", "MEX", "KOR"),         # Trade-Dependent Economies
]

# Time range handed to the World Bank query
start_date = datetime(year=2005, month=1, day=1)
end_date = datetime(year=2012, month=12, day=31)

# Fetch the indicators and turn the index levels into ordinary columns in one step
data = wbdata.get_dataframe(
    indicators,
    country=countries,
    date=(start_date, end_date),
).reset_index()

# Display sample World Bank data (heading and preview on separate lines)
print("Sample World Bank Data:", data.head(), sep="\n")


Sample World Bank Data:
  country  date  GDP Growth  Inflation       Exports       Imports  \
0  Brazil  2012    1.921176   5.403499  2.928084e+11  3.263157e+11   
1  Brazil  2011    3.974423   6.636450  3.030166e+11  3.231448e+11   
2  Brazil  2010    7.528226   5.038727  2.400031e+11  2.629973e+11   
3  Brazil  2009   -0.125812   4.888035  1.808920e+11  1.876139e+11   
4  Brazil  2008    5.094195   5.678594  2.295170e+11  2.327318e+11   

   Unemployment  
0         7.251  
1         7.578  
2         8.424  
3         9.419  
4         8.268  


In [6]:
# Load Kaggle Financial Sentiment Analysis Dataset.
# NOTE(review): force_download=True presumably bypasses any locally cached copy, so
# this cell needs network access on every run -- confirm that is intended.
data_path = kagglehub.dataset_download("sbhatti/financial-sentiment-analysis", force_download=True)

# The CSV is expected directly inside the download directory as 'data.csv'.
data_file = os.path.join(data_path, "data.csv")
if not os.path.isfile(data_file):
    # Fail with the directory listing so a changed dataset layout is easy to diagnose.
    raise FileNotFoundError(
        f"Expected 'data.csv' in {data_path!r}, found: {sorted(os.listdir(data_path))}"
    )

# Read the CSV
df = pd.read_csv(data_file)

# The text-cleaning step of this notebook reads the 'Sentence' and 'Sentiment'
# columns; check for them here so a schema change is reported at load time.
missing_columns = {"Sentence", "Sentiment"}.difference(df.columns)
if missing_columns:
    raise KeyError(
        f"{data_file!r} is missing expected column(s): {sorted(missing_columns)}"
    )


Downloading from https://www.kaggle.com/api/v1/datasets/download/sbhatti/financial-sentiment-analysis?dataset_version_number=4...


100%|██████████| 276k/276k [00:00<00:00, 30.8MB/s]

Extracting files...





In [11]:
def preprocess_text(text):
    """Normalise a sentence for retrieval: lowercase it and trim outer whitespace.

    Args:
        text: The raw sentence. Missing values (``None`` or a float NaN, which is
            what pandas yields for an empty CSV cell) are accepted, as are
            non-string scalars.

    Returns:
        The lowercased, stripped string. Missing values become ``""`` and other
        non-string values are converted with ``str()`` first.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    return text.lower().strip()

# Add a normalised copy of every sentence next to the raw text
df["cleaned_text"] = df["Sentence"].map(preprocess_text)

# Parallel Python lists: position i in one corresponds to position i in the other
financial_sentences = df["cleaned_text"].to_list()
sentiments = df["Sentiment"].to_list()

# Fit a TF-IDF vocabulary (English stop words dropped, capped at 10,000 terms)
# on the cleaned sentences and keep the resulting document-term matrix
vectorizer = TfidfVectorizer(max_features=10_000, stop_words="english")
tfidf_matrix = vectorizer.fit_transform(financial_sentences)

def sparse_retrieval(query, top_k=5):
    """Return the sentences most similar to ``query`` under TF-IDF cosine similarity.

    Uses the module-level ``vectorizer``, ``tfidf_matrix``, ``financial_sentences``
    and ``sentiments`` built earlier in the notebook.

    Args:
        query: Free-text query string; it is passed to ``vectorizer.transform`` as-is.
        top_k: Maximum number of results. Values <= 0 yield an empty list.

    Returns:
        A list of ``(sentence, sentiment, score)`` tuples ordered from highest to
        lowest score, containing at most ``top_k`` entries.
    """
    # Guard explicitly: with the slice form `[-top_k:]`, top_k == 0 would select
    # the whole array (since -0 == 0) and return every document.
    if top_k <= 0:
        return []

    query_tfidf = vectorizer.transform([query])
    scores = cosine_similarity(query_tfidf, tfidf_matrix)[0]

    # argsort is ascending; reverse first, then take the leading top_k indices.
    top_indices = scores.argsort()[::-1][:top_k]
    return [(financial_sentences[i], sentiments[i], scores[i]) for i in top_indices]


# Sample queries used to eyeball the quality of the sparse retriever
queries = [
    "Stock prices increased significantly",
    "Company reported huge losses",
    "Economic growth forecast for next quarter",
    "High unemployment and rising inflation",
    "Positive outlook for technology sector",
]

# For each query, print a header followed by its three best matches,
# numbered from 1, with the sentiment label and similarity score
for query in queries:
    print(f"\n🔍 Query: {query}")
    results = sparse_retrieval(query, top_k=3)
    for i, hit in enumerate(results):
        sentence, sentiment, score = hit
        print(f"{i+1}. [{sentiment}] (Score: {score:.4f}) - {sentence}")


🔍 Query: Stock prices increased significantly
1. [neutral] (Score: 0.4360) - prices and delivery volumes of broadband products decreased significantly in 2005 .
2. [negative] (Score: 0.4360) - prices and delivery volumes of broadband products decreased significantly in 2005 .
3. [positive] (Score: 0.3512) - the company expects net sales to significantly increase from 2009 .

🔍 Query: Company reported huge losses
1. [neutral] (Score: 0.4946) - this would be a huge process .
2. [neutral] (Score: 0.3004) - a huge issue for us is the button placement .
3. [neutral] (Score: 0.2694) - the huge bridge girders will be delivered to the site from our plant in ylivieska , finland .

🔍 Query: Economic growth forecast for next quarter
1. [neutral] (Score: 0.4192) - the forecast for 2012 is 3.3 % .
2. [positive] (Score: 0.3627) - ruukki forecast a 15-20 % annual sales growth and a positive pretax result for 2010 .
3. [positive] (Score: 0.3463) - consumption is forecast to grow by about 2 % .

🔍 Que