In [1]:
!pip install pandas numpy datasets scikit-learn nltk

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:0

In [2]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pickle


In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
def load_data():
    dataset = load_dataset("ashraq/financial-news-articles")
    df = pd.DataFrame(dataset['train'])
    return df

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

def categorize(text):
    text = text.lower()
    if any(keyword in text for keyword in ['crypto', 'bitcoin', 'blockchain', 'ethereum', 'token']):
        return 'crypto'
    elif any(keyword in text for keyword in ['stock', 'share', 'market', 'index', 'nasdaq', 'dow']):
        return 'stocks'
    else:
        return 'other'

def preprocess_and_label(df):
    df['processed_text'] = df['text'].apply(preprocess_text)
    df['label'] = df['text'].apply(categorize)
    return df[['processed_text', 'label']]

In [5]:
# Load and preprocess the data
df = load_data()
processed_df = preprocess_and_label(df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/543 [00:00<?, ?B/s]

(…)-00000-of-00002-a3f58f0eb179f9ed.parquet:   0%|          | 0.00/238M [00:00<?, ?B/s]

(…)-00001-of-00002-50e0d6558d13575f.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/306242 [00:00<?, ? examples/s]

In [7]:
processed_df.tail()

Unnamed: 0,processed_text,label
306237,may updated minutes ago british gaming firm en...,other
306238,oslo may reuters norwegian government plans au...,other
306239,shanghai may prnewswire jmu limited company jm...,stocks
306240,adam jeffery cnbc tim cook ceo apple inc next ...,stocks
306241,may reuters wolford ag fosun industrial holdin...,stocks


In [8]:
# prompt: show me 5 from the above wih label crypto

crypto_df = processed_df[processed_df['label'] == 'crypto']
print(crypto_df.head(5))


                                        processed_text   label
59   jan reuters diaries please see us federal rese...  crypto
86   david z morris pm est coinmarketcap arguably p...  crypto
188  see tremendous potential blockchain industries...  crypto
238  january updated minutes ago live marketsclosin...  crypto
263  january updated minutes ago daily briefing bre...  crypto


In [9]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(processed_df['processed_text'], processed_df['label'], test_size=0.2, random_state=42)


In [10]:
# Create and fit the TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [11]:
# Train the model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_tfidf, y_train)


In [12]:
# Evaluate the model
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

      crypto       0.99      0.74      0.85      1195
       other       0.88      0.94      0.91     26510
      stocks       0.94      0.90      0.92     33544

    accuracy                           0.91     61249
   macro avg       0.94      0.86      0.89     61249
weighted avg       0.92      0.91      0.91     61249



In [13]:
# Save the model, vectorizer, and label encoder
def save_model(model, vectorizer, filename_prefix='financial_news'):
    model_filename = f'{filename_prefix}_model.pkl'
    vectorizer_filename = f'{filename_prefix}_vectorizer.pkl'

    pickle.dump(model, open(model_filename, 'wb'))
    pickle.dump(vectorizer, open(vectorizer_filename, 'wb'))

    print(f"Model saved as {model_filename}")
    print(f"Vectorizer saved as {vectorizer_filename}")

save_model(model, tfidf)

Model saved as financial_news_model.pkl
Vectorizer saved as financial_news_vectorizer.pkl


In [14]:
# Load the model and vectorizer
def load_model(filename_prefix='financial_news'):
    model_filename = f'{filename_prefix}_model.pkl'
    vectorizer_filename = f'{filename_prefix}_vectorizer.pkl'

    loaded_model = pickle.load(open(model_filename, 'rb'))
    loaded_vectorizer = pickle.load(open(vectorizer_filename, 'rb'))

    return loaded_model, loaded_vectorizer


In [15]:
# Function to predict category
def predict_category(text, model, vectorizer):
    processed_text = preprocess_text(text)
    text_vector = vectorizer.transform([processed_text])
    prediction = model.predict(text_vector)
    return prediction[0]

In [16]:
# Test the loaded model
loaded_model, loaded_vectorizer = load_model()

test_texts = [
    "Bitcoin price surges to new all-time high as institutional investors pile in",
    "Dow Jones Industrial Average closes above 35,000 for the first time",
    "Central banks consider issuing digital currencies in response to cryptocurrency growth",
    "Tech stocks lead market rally amid strong earnings reports",
    "NFT marketplace OpenSea raises $100 million in Series B funding"
]

for text in test_texts:
    print(f"Text: '{text}'")
    print(f"Predicted category: {predict_category(text, loaded_model, loaded_vectorizer)}\n")


Text: 'Bitcoin price surges to new all-time high as institutional investors pile in'
Predicted category: crypto

Text: 'Dow Jones Industrial Average closes above 35,000 for the first time'
Predicted category: stocks

Text: 'Central banks consider issuing digital currencies in response to cryptocurrency growth'
Predicted category: crypto

Text: 'Tech stocks lead market rally amid strong earnings reports'
Predicted category: stocks

Text: 'NFT marketplace OpenSea raises $100 million in Series B funding'
Predicted category: stocks

