In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import pickle
import os

# Step 1: Ensure the dataset is saved locally (written once, then reused)
def ensure_local_dataset(file_path, dataset):
    """Pickle ``dataset`` to ``file_path`` unless that path already exists.

    Args:
        file_path: Destination path for the pickled dataset.
        dataset: Object to serialize with ``pickle``.

    An existing path is left untouched. A status message is printed in
    both cases and nothing is returned.
    """
    if os.path.exists(file_path):
        # Something is already there: keep it rather than overwrite.
        print("Dataset already exists locally.")
        return

    print("Saving dataset locally...")
    with open(file_path, 'wb') as out_file:
        pickle.dump(dataset, out_file)
    print("Dataset saved locally.")

# Load a sample dataset (replace with your dataset if needed)
def load_sample_dataset():
    """Return a tiny hard-coded news dataset.

    Returns:
        A dict with three keys: ``'data'`` (five headline strings),
        ``'target'`` (the labels ``0`` through ``4``, one per headline) and
        ``'target_names'`` (the category name for each label).
    """
    headlines = [
        "NASA launches new satellite.",
        "Apple announces new iPhone.",
        "Stock market hits record high.",
        "Scientists discover new species in Amazon.",
        "Football team wins championship.",
    ]
    category_names = ["Science", "Technology", "Finance", "Nature", "Sports"]
    # Each headline belongs to its own category, so labels are just 0..4.
    labels = list(range(len(headlines)))
    return {
        'data': headlines,
        'target': labels,
        'target_names': category_names,
    }

# Build the in-memory sample, then write it to a fixed path unless a file
# is already present there
data = load_sample_dataset()
local_dataset_path = "news_dataset.pkl"
ensure_local_dataset(local_dataset_path, data)

# Step 2: Load the dataset
def load_local_dataset(file_path):
    """Read back the object pickled at ``file_path``.

    Args:
        file_path: Path of a pickle file to open in binary mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.
    """
    # NOTE: unpickling can execute arbitrary code, so only point this at
    # files that come from a trusted source.
    with open(file_path, 'rb') as in_file:
        return pickle.load(in_file)

# Reload the dataset from disk and separate the texts (X) from the labels (y)
data = load_local_dataset(local_dataset_path)
X = data['data']
y = data['target']

# Step 3: Split the dataset into training and testing sets
# NOTE: the bundled sample has exactly one headline per category, so the
# category of the held-out headline never appears in the training split and
# the reported accuracy is 0%. Use a dataset with several examples per class
# to get a meaningful score.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 4: Build the text processing and classification pipeline
# (token counts -> TF-IDF weighting -> multinomial naive Bayes)
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

# Step 5: Train the model
pipeline.fit(X_train, y_train)

# Step 6: Test the model
y_pred = pipeline.predict(X_test)

# Step 7: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n")
# Report only on the classes present in the test split so that the list of
# display names lines up one-to-one with the labels passed in.
unique_classes = sorted(set(y_test))
report_names = [data['target_names'][i] for i in unique_classes]
print(classification_report(
    y_test,
    y_pred,
    labels=unique_classes,
    target_names=report_names,
    # Precision is undefined for a class that was never predicted; report it
    # as 0 explicitly instead of emitting an UndefinedMetricWarning per metric.
    zero_division=0,
))

# Step 8: Make predictions on new data
def classify_new_article(article):
    """Return the category name the fitted pipeline predicts for ``article``.

    Args:
        article: Text of a single article or headline.

    Returns:
        The entry of ``data['target_names']`` at the predicted label.
    """
    # predict() takes a batch, so wrap the single article in a list and
    # take the first (only) prediction.
    predicted_label = pipeline.predict([article])[0]
    return data['target_names'][predicted_label]

# Example usage: classify one headline that is not in the sample dataset
new_article = "Scientists develop a new technology to fight climate change."
predicted_category = classify_new_article(new_article)
print("Predicted Category:", predicted_category)


Saving dataset locally...
Dataset saved locally.
Accuracy: 0.00%

Classification Report:

              precision    recall  f1-score   support

  Technology       0.00      0.00      0.00       1.0

   micro avg       0.00      0.00      0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0

Predicted Category: Nature


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
