In [3]:
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download resources
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\devan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\devan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\devan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
def load_dataset(data_dir):
    texts, labels = [], []
    categories = sorted(os.listdir(data_dir))   # label = folder name
    
    for label, category in enumerate(categories):
        category_path = os.path.join(data_dir, category)
        for file_name in os.listdir(category_path):
            file_path = os.path.join(category_path, file_name)
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                texts.append(f.read())
                labels.append(label)
    
    return texts, labels, categories

data_dir = r"C:\Users\devan\AI-ML\3_NLP_Project_Document_Classification\Data"
texts, labels, categories = load_dataset(data_dir)

print("Categories:", categories)
print("Total samples:", len(texts))


Categories: ['business', 'entertainment', 'politics', 'sport', 'tech']
Total samples: 2225


In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required resources
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")   # fix your error
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()                                # lowercase
    text = re.sub(r"<.*?>", " ", text)                 # remove HTML tags
    text = re.sub(r"http\S+|www\S+", " ", text)        # remove URLs
    text = re.sub(r"[^a-z\s]", " ", text)              # keep only alphabets
    tokens = nltk.word_tokenize(text)                  # tokenize
    tokens = [t for t in tokens if t not in stop_words]# remove stopwords
    tokens = [lemmatizer.lemmatize(t) for t in tokens] # lemmatize
    return " ".join(tokens)

cleaned_texts = [clean_text(t) for t in texts]

print("Before:", texts[0][:])
print("After:", cleaned_texts[0][:])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\devan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\devan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\devan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\devan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Before: Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sig

In [6]:
from sklearn.model_selection import train_test_split

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_texts, labels, test_size=0.2, random_state=42, stratify=labels
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))



# vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
print("Vectorized train shape:", X_train_vec.shape)
print("Vectorized test shape:", X_test_vec.shape)


Train size: 1780
Test size: 445
Vectorized train shape: (1780, 5000)
Vectorized test shape: (445, 5000)


In [7]:
from tensorflow.keras.utils import to_categorical

num_classes = len(categories)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout

MAX_VOCAB = 20000
MAX_LEN = 200
NUM_CLASSES = num_classes  # set this to your number of classes

model = Sequential()

# 1) Learn word embeddings
model.add(Embedding(input_dim=MAX_VOCAB,
                    output_dim=64,
                    input_length=MAX_LEN))

# 2) Convert sequence of word vectors -> single vector (average)
model.add(GlobalAveragePooling1D())

# 3) Basic feedforward classifier
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(64, activation="relu"))

# 4) Output layer
model.add(Dense(NUM_CLASSES, activation="softmax"))

# IMPORTANT: choose loss based on label format
model.compile(
    loss="sparse_categorical_crossentropy",  # use this if labels are integers 0..NUM_CLASSES-1
    optimizer="adam",
    metrics=["accuracy"]
)

model.summary()




In [13]:
# model training code here
history = model.fit(
    X_train_vec, y_train_cat,
    validation_data=(X_test_vec, y_test_cat),
    epochs=10,         # train for 10 epochs
    batch_size=32,     # batch size
    verbose=1
)


Epoch 1/10


ValueError: Argument `output` must have rank (ndim) `target.ndim - 1`. Received: target.shape=(None, 5), output.shape=(None, 5)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

y_pred = model.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test_cat, axis=1)

print(classification_report(y_true, y_pred_classes, target_names=categories))

cm = confusion_matrix(y_true, y_pred_classes)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=categories, yticklabels=categories)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


In [None]:
import json

# Save tokenizer
tokenizer_json = tokenizer.to_json()
with open("tokenizer.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

print("✅ Tokenizer saved as tokenizer.json")


✅ Tokenizer saved as tokenizer.json


In [None]:
# Save model
model.save("news_classifier.h5")
print("✅ Model saved as news_classifier.h5")




✅ Model saved as news_classifier.h5


In [None]:
def predict_category(text):
    clean = clean_text(text)
    seq = tokenizer.texts_to_sequences([clean])
    pad = pad_sequences(seq, maxlen=MAX_LEN, padding="post")
    pred = model.predict(pad)
    return categories[pred.argmax()]

print(predict_category("The stock market is booming."))
print(predict_category("India won the cricket match."))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
sport
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
sport
