In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GlobalMaxPooling1D, Dense
from transformers import pipeline

# spaCy pipeline, loaded through the installed English model package
import en_core_web_sm
nlp = en_core_web_sm.load()

# Corpus: only the article body ("Text") and its label ("Category") are read
data = pd.read_csv(
    filepath_or_buffer="bbc_news.csv",
    usecols=["Text", "Category"],
)

# Text preprocessing
def preprocess_text(text):
    """Normalize raw article text into a space-separated string of lemmas.

    Steps, in order: lowercase, strip URLs, replace non-word characters with
    spaces, delete digit runs, then lemmatize with the module-level spaCy
    pipeline ``nlp`` while dropping stop words and whitespace-only tokens.

    Args:
        text: Raw text. Non-string values (for example ``None`` or the float
            NaN that pandas uses for a missing cell) are treated as empty.

    Returns:
        The cleaned text, or ``''`` when nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # URLs have to go BEFORE punctuation is stripped: once ':', '/' and '.'
    # have been turned into spaces the URL is no longer one token, and only
    # the bare "http" word would be removed while host/path words remain.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', '', text)

    # Nothing but whitespace left: skip the spaCy pipeline entirely.
    if not text.strip():
        return ''

    doc = nlp(text)
    # The substitutions above leave runs of spaces, which spaCy emits as
    # whitespace tokens; they carry no meaning, so drop them with stop words.
    tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_space)
    ]
    return ' '.join(tokens)

# Keep the raw text untouched and store the normalized version beside it
cleaned_articles = data['Text'].apply(preprocess_text)
data['Cleaned_Text'] = cleaned_articles

# Horizontal bar chart of article counts, categories ordered by frequency
category_order = data["Category"].value_counts().index
ax = sns.countplot(y=data["Category"], order=category_order)
ax.set_title("Distribution of Articles")
plt.show()

# Classification - TF-IDF + Naive Bayes / SVM
# Hold out 20% of the articles for testing; the fixed seed makes the split repeatable
features = data['Cleaned_Text']
labels = data['Category']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Vocabulary and IDF weights are learned from the training text only and
# then reused unchanged to transform the test text
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

def _fit_and_report(classifier, heading):
    """Fit *classifier* on the TF-IDF training matrix and report test metrics.

    Prints *heading* followed by the classification report for the test
    split, and returns the predictions made for ``X_test_tfidf``.
    """
    classifier.fit(X_train_tfidf, y_train)
    predictions = classifier.predict(X_test_tfidf)
    print(heading)
    print(classification_report(y_test, predictions))
    return predictions


# Naive Bayes
nb_model = MultinomialNB()
y_pred_nb = _fit_and_report(nb_model, "Naive Bayes Classification Report:")

# SVM
svm_model = SVC(kernel='linear')
y_pred_svm = _fit_and_report(svm_model, "SVM Classification Report:")

# LSTM Model (Deep Learning)
max_words = 5000  # passed to the tokenizer as num_words and to the embedding as input_dim
max_len = 500     # every sequence is padded or truncated to this many token ids

# The word index is built from the training text only
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Text -> lists of integer token ids
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Lists of ids -> fixed-width integer matrices
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
X_train_seq = pad_sequences(train_sequences, maxlen=max_len)
X_test_seq = pad_sequences(test_sequences, maxlen=max_len)

# One softmax output unit per distinct value in the Category column
num_classes = len(data["Category"].unique())

# Embedding -> LSTM (full sequence output) -> max over time -> dense classifier
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_len))
model.add(LSTM(64, return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Sparse loss: targets are integer class codes rather than one-hot vectors
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Encode the string labels as integers with ONE category list shared by both
# splits. Calling .astype('category') on each split separately derives the
# codes from whatever classes that split happens to contain, so a class
# missing from one split would shift its codes and make train/test integers
# refer to different categories.
label_dtype = pd.CategoricalDtype(
    data["Category"].astype('category').cat.categories
)
y_train_codes = y_train.astype(label_dtype).cat.codes
y_test_codes = y_test.astype(label_dtype).cat.codes

# NOTE: the test split doubles as validation data here, so the per-epoch
# validation metrics and the final report describe the same articles.
model.fit(
    X_train_seq, y_train_codes,
    epochs=3, batch_size=32,
    validation_data=(X_test_seq, y_test_codes)
)

# Evaluate LSTM
y_pred_lstm = model.predict(X_test_seq)
# Highest-probability class index per article
y_pred_lstm_classes = np.argmax(y_pred_lstm, axis=1)
print("LSTM Classification Report:")
# labels/target_names map each integer code back to its category name so
# this report reads like the Naive Bayes and SVM reports.
print(classification_report(
    y_test_codes,
    y_pred_lstm_classes,
    labels=np.arange(len(label_dtype.categories)),
    target_names=[str(name) for name in label_dtype.categories],
))

# Summarization using Hugging Face Transformers
# No model name is given, so the library's default summarization checkpoint is used.
summarizer = pipeline("summarization")
example_text = data.iloc[0]['Text']

# Extractive summary (first 50 words)
# This is a simple lead baseline: the first 50 whitespace-separated words, verbatim.
extractive_summary = ' '.join(example_text.split()[:50])

# Abstractive summary
# truncation=True clips the article to the model's maximum input length.
# Without it, an article longer than that limit is fed to the model whole
# and can fail inside the model instead of producing a summary.
abstractive_summary = summarizer(
    example_text,
    max_length=50,
    min_length=10,
    do_sample=False,
    truncation=True,
)[0]['summary_text']

print(f"Original Text:\n{example_text}\n")
print(f"Extractive Summary:\n{extractive_summary}\n")
print(f"Abstractive Summary:\n{abstractive_summary}\n")