In [None]:
from datasets import load_dataset

# Load full dataset
dataset = load_dataset("kdave/Indian_Financial_News", split="train")

# Save to CSV
dataset.to_csv("financial_news_full.csv")

print(f"✅ Done! Saved {len(dataset)} rows")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

training_data_26000.csv:   0%|          | 0.00/115M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26961 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

✅ Done! Saved 26961 rows


In [None]:
# Install dependencies (run this in Colab)
!pip install requests beautifulsoup4 tqdm

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import random, time, json

# Function to extract publication date
def get_publication_date(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, timeout=10, headers=headers)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, "html.parser")

        # Common meta tags for published date
        meta_props = [
            ("meta", {"property": "article:published_time"}),
            ("meta", {"name": "publish-date"}),
            ("meta", {"name": "date"}),
            ("meta", {"property": "og:published_time"}),
        ]
        for tag, attrs in meta_props:
            meta = soup.find(tag, attrs=attrs)
            if meta and meta.get("content"):
                return meta["content"].split("T")[0]

        # Look for <time datetime="">
        time_tag = soup.find("time")
        if time_tag and time_tag.get("datetime"):
            return time_tag["datetime"].split("T")[0]

        # JSON-LD structured data
        scripts = soup.find_all("script", type="application/ld+json")
        for script in scripts:
            try:
                data = json.loads(script.string)
                if isinstance(data, dict) and "datePublished" in data:
                    return data["datePublished"].split("T")[0]
                elif isinstance(data, list):
                    for item in data:
                        if isinstance(item, dict) and "datePublished" in item:
                            return item["datePublished"].split("T")[0]
            except:
                continue

        return None
    except:
        return None

# Load your dataset
df = pd.read_csv("financial_news_full.csv")

# Function to process in parallel
def fetch_dates_parallel(urls, max_workers=30):
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(get_publication_date, url): url for url in urls}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching dates"):
            url = futures[future]
            try:
                results[url] = future.result()
            except Exception:
                results[url] = None
            # Random short sleep to avoid being blocked
            time.sleep(random.uniform(0.05, 0.2))
    return results

# Run on all URLs (this will take ~30-40 minutes instead of hours)
results = fetch_dates_parallel(df["URL"].tolist(), max_workers=30)

# Add to DataFrame
df["published_date"] = df["URL"].map(results)

# Save updated dataset
df.to_csv("financial_news_with_dates.csv", index=False)
print("✅ Done! File saved as financial_news_with_dates.csv")




Fetching dates: 100%|██████████| 26961/26961 [1:25:15<00:00,  5.27it/s]


✅ Done! File saved as financial_news_with_dates.csv


In [None]:
import pandas as pd

# Load dataset
file_path = "/content/sample_data/financial_news_sorted.csv"
df = pd.read_csv(file_path)

# Convert published_date to datetime
df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')

# Keep only rows where date is between 2018 and 2020 (inclusive)
df = df[(df['published_date'].dt.year >= 2018) & (df['published_date'].dt.year <= 2020)]

# Remove duplicate rows
df_cleaned = df.drop_duplicates()

# Save cleaned dataset
output_file = "financial_news_cleaned.csv"
df_cleaned.to_csv(output_file, index=False)

print("Before cleaning:", df.shape)
print("After removing duplicates & filtering by date:", df_cleaned.shape)
print(f"Cleaned file saved as: {output_file}")


Before cleaning: (19273, 5)
After removing duplicates & filtering by date: (9434, 5)
Cleaned file saved as: financial_news_cleaned.csv


In [None]:
import pandas as pd

# Load datasets (update with your actual file names/paths)
gold = pd.read_csv("/content/sample_data/Gold Price.csv")
silver = pd.read_csv("/content/sample_data/LBMA-SILVER.csv")

# Ensure date column is in datetime format (update 'Date' if column has different name)
gold['Date'] = pd.to_datetime(gold['Date'])
silver['Date'] = pd.to_datetime(silver['Date'])

# Filter for 2018–2020
gold = gold[(gold['Date'] >= "2018-01-01") & (gold['Date'] <= "2020-12-31")]
silver = silver[(silver['Date'] >= "2018-01-01") & (silver['Date'] <= "2020-12-31")]

# Merge datasets on Date (inner join to keep only matching dates)
combined = pd.merge(gold, silver, on="Date", suffixes=('_Gold', '_Silver'))

# Save combined dataset
combined.to_csv("gold_silver_2018_2020.csv", index=False)

print("Combined dataset saved as gold_silver_2018_2020.csv")


Combined dataset saved as gold_silver_2018_2020.csv


In [None]:
import pandas as pd

# Load datasets (update file names/paths accordingly)
nifty = pd.read_csv("/content/sample_data/Nifty 50 Historical Data.csv")
sensex = pd.read_csv("/content/sample_data/BSE Sensex 30 Historical Data.csv")

# Ensure date column is in datetime format (change 'Date' if your column has another name)
nifty['Date'] = pd.to_datetime(nifty['Date'])
sensex['Date'] = pd.to_datetime(sensex['Date'])

# Filter for 2018–2020 (same as you did earlier)
nifty = nifty[(nifty['Date'] >= "2018-01-01") & (nifty['Date'] <= "2020-12-31")]
sensex = sensex[(sensex['Date'] >= "2018-01-01") & (sensex['Date'] <= "2020-12-31")]

# Merge datasets on Date
combined = pd.merge(nifty, sensex, on="Date", suffixes=('_Nifty', '_Sensex'))

# Save merged dataset
combined.to_csv("nifty_sensex_2018_2020.csv", index=False)

print("Merged dataset saved as nifty_sensex_2018_2020.csv ✅")


Merged dataset saved as nifty_sensex_2018_2020.csv ✅


  sensex['Date'] = pd.to_datetime(sensex['Date'])


In [2]:
import pandas as pd

# Load the datasets
df1 = pd.read_csv('/content/sample_data/gold_silver_2018_2020.csv')
df2 = pd.read_csv('/content/sample_data/nifty_sensex_2018_2020.csv')

# Merge them horizontally (assuming a common column 'ID')
merged_df = pd.merge(df1, df2, on='Date')

# If there's no common column and you just want to concatenate side by side:
merged_df = pd.concat([df1, df2], axis=1)

# Save the merged dataset
merged_df.to_csv('merged_dataset_of_metals_and_stockmarket.csv', index=False)


In [3]:
import pandas as pd

# Load the merged dataset
merged_df = pd.read_csv('/content/merged_dataset_of_metals_and_stockmarket.csv')

# Create the title row
n_columns = merged_df.shape[1]
title_row = ["Metals"] * 10 + ["Stocks"] * (n_columns - 10)

# Convert the title row to a DataFrame
title_df = pd.DataFrame([title_row])

# Concatenate the title row and the actual dataset
final_df = pd.concat([title_df, merged_df], ignore_index=True)

# Save the new dataset to a CSV file
final_df.to_csv('final_dataset_with_title.csv', index=False, header=False)
