In [None]:
!pip install scikit-learn




In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the 20 Newsgroups posts for a four-topic subset (subset='all').
categories = [
    'alt.atheism',
    'comp.graphics',
    'sci.med',
    'soc.religion.christian',
]
news_data = fetch_20newsgroups(subset='all', categories=categories)

# X holds the raw post text; y holds integer indices into news_data.target_names.
X, y = news_data.data, news_data.target

print(f"Categories: {news_data.target_names}")
print(f"Number of samples: {len(X)}")

Categories: ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
Number of samples: 3759


In [None]:
# TF-IDF vectorizer: English stop words removed, vocabulary capped at 10,000 terms.
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')

# Learn the vocabulary / IDF weights from X and vectorize it in one call.
# NOTE(review): X is the whole corpus, so the posts that are later held out for
# testing also influence the vocabulary and IDF statistics. Consider fitting on
# the training split only.
X_tfidf = tfidf.fit_transform(X)

tfidf_shape = X_tfidf.shape
print(f"TF-IDF matrix shape: {tfidf_shape}")

TF-IDF matrix shape: (3759, 10000)


In [None]:
# Hold out 20% of the vectorized posts for evaluation; the fixed seed keeps
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the number of rows in each split.
for split_name, split_matrix in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} samples: {split_matrix.shape[0]}")

Training samples: 3007
Testing samples: 752


In [None]:
# Create the logistic-regression classifier and fit it on the training split.
# max_iter=10000 gives the solver a generous iteration budget.
model = LogisticRegression(max_iter=10000).fit(X_train, y_train)

print("Model training completed.")

Model training completed.


In [None]:
# Predict a class index for every held-out row of X_test; y_pred is compared
# against y_test when the metrics are computed.
y_pred = model.predict(X_test)

print("Predictions completed.")

Predictions completed.


In [None]:
# Accuracy of the predictions against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1, with rows labelled by newsgroup name.
print("Classification Report:")
report = classification_report(y_test, y_pred, target_names=news_data.target_names)
print(report)

Accuracy: 0.9587765957446809
Classification Report:
                        precision    recall  f1-score   support

           alt.atheism       0.99      0.93      0.96       175
         comp.graphics       0.93      1.00      0.97       200
               sci.med       0.97      0.96      0.96       200
soc.religion.christian       0.95      0.94      0.94       177

              accuracy                           0.96       752
             macro avg       0.96      0.96      0.96       752
          weighted avg       0.96      0.96      0.96       752



In [None]:
# Smoke-test the classifier on a hand-written sentence.
test_text = ["The advancements in medical technology are astounding."]

# Vectorize with the already-fitted TF-IDF vocabulary, then classify.
test_text_tfidf = tfidf.transform(test_text)
prediction = model.predict(test_text_tfidf)

# prediction holds one class index per input text; map the first to its name.
predicted_category = news_data.target_names[prediction[0]]
print(f"Prediction: {predicted_category}")

Prediction: sci.med


In [None]:
import pickle

# Persist both fitted artifacts: the classifier first, then the TF-IDF
# vectorizer that produces its input features.
for pickle_path, artifact in (
    ('news_classifier_model.pkl', model),
    ('tfidf_vectorizer.pkl', tfidf),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(artifact, pickle_file)

In [None]:
# Second smoke test: a sentence about 3D rendering.
test_text = ["Rendering 3D environments in real time is becoming faster with the latest GPU"]

# Reuse the fitted vectorizer so the features line up with the trained model.
test_text_tfidf = tfidf.transform(test_text)
prediction = model.predict(test_text_tfidf)

# Translate the predicted class index into its newsgroup name.
predicted_category = news_data.target_names[prediction[0]]
print(f"Prediction: {predicted_category}")

Prediction: comp.graphics


In [None]:
# Show the newsgroup names of the currently loaded dataset, in class-index order.
print("Categories included in the model:", news_data.target_names, sep="\n")

Categories included in the model:
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


In [None]:
# Full list of newsgroups available in the 20 Newsgroups corpus.
# Only these 20 names are valid values for the `categories` argument of
# fetch_20newsgroups; names outside the corpus (e.g. 'rec.travel',
# 'comp.lang.java') must not be listed or loading the dataset will fail.
categories = [
    'alt.atheism',
    'comp.graphics',
    'sci.med',
    'soc.religion.christian',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.space',
    'misc.forsale',
    'talk.politics.misc',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.religion.misc',
]

In [None]:
# Probe the classifier with a political headline. The model was trained on
# four newsgroups only, so it has to answer with one of those four.
test_text = ["Angru Teens paralyze Bangladesh Capital for the Protest"]

# Vectorize with the fitted TF-IDF vocabulary, then classify.
test_text_tfidf = tfidf.transform(test_text)
prediction = model.predict(test_text_tfidf)

# Translate the predicted class index into its newsgroup name.
predicted_category = news_data.target_names[prediction[0]]
print(f"Prediction: {predicted_category}")

Prediction: sci.med


In [None]:
# Widen the label set: the original four newsgroups plus three politics groups.
categories = (
    ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
    + ['talk.politics.misc', 'talk.politics.guns', 'talk.politics.mideast']
)

# Fetch the posts again, now restricted to the seven categories.
news_data = fetch_20newsgroups(subset='all', categories=categories)

# Refresh the raw texts and integer labels. The TF-IDF matrix and the
# classifier are not rebuilt in this cell, so they still reflect the
# previous four-category data.
X, y = news_data.data, news_data.target

print("Updated Categories:")
print(news_data.target_names)

Updated Categories:
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc']


In [None]:
# Re-run the political headline through the classifier.
# NOTE(review): news_data now holds seven categories, but `tfidf` and `model`
# are still the objects fitted on the original four. The index-to-name lookup
# below only lines up because the four original names come first in
# news_data.target_names; the model cannot predict any politics group yet.
test_text = ["Angru Teens paralyze Bangladesh Capital for the Protest"]

test_text_tfidf = tfidf.transform(test_text)
prediction = model.predict(test_text_tfidf)

# Translate the predicted class index into a newsgroup name.
predicted_category = news_data.target_names[prediction[0]]
print(f"Prediction: {predicted_category}")

Prediction: sci.med


In [None]:
from sklearn.datasets import fetch_20newsgroups

# Label set for the retrained classifier: four original groups + three politics groups.
categories = [
    'alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian',
    'talk.politics.misc', 'talk.politics.guns', 'talk.politics.mideast',
]

# Load the posts for exactly these categories.
news_data = fetch_20newsgroups(subset='all', categories=categories)

# Confirm which categories were loaded.
print("Updated Categories:", news_data.target_names, sep="\n")

Updated Categories:
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc']


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Raw post texts and integer labels of the currently loaded dataset.
X, y = news_data.data, news_data.target

# Fit a fresh TF-IDF vectorizer (English stop words removed, at most 10,000
# terms) and vectorize every post.
# NOTE(review): the vectorizer is fitted on all posts before the train/test
# split, so held-out posts influence the vocabulary and IDF weights.
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
X_tfidf = tfidf.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Re-split the re-fitted TF-IDF matrix and labels: 20% held out for testing,
# with a fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

# Replace the earlier classifier with one fitted on the new training split.
# max_iter=10000 gives the solver a generous iteration budget.
model = LogisticRegression(max_iter=10000).fit(X_train, y_train)

print("Model training completed.")

Model training completed.


In [None]:
# Try the political headline against the retrained classifier.
test_text = ["Angry Teens paralyze Bangladesh Capital"]

# Vectorize with the re-fitted TF-IDF vocabulary, then classify.
test_text_tfidf = tfidf.transform(test_text)
prediction = model.predict(test_text_tfidf)

# Translate the predicted class index into its newsgroup name.
predicted_category = news_data.target_names[prediction[0]]
print(f"Prediction: {predicted_category}")

Prediction: sci.med


In [None]:
from sklearn.metrics import classification_report

# Score the retrained classifier on the held-out split.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1, with rows labelled by newsgroup name.
print("Classification Report:")
report = classification_report(y_test, y_pred, target_names=news_data.target_names)
print(report)

Classification Report:
                        precision    recall  f1-score   support

           alt.atheism       0.99      0.91      0.95       159
         comp.graphics       0.93      0.98      0.95       205
               sci.med       0.92      0.99      0.95       182
soc.religion.christian       0.96      0.96      0.96       204
    talk.politics.guns       0.96      0.95      0.96       197
 talk.politics.mideast       0.98      0.97      0.98       197
    talk.politics.misc       0.94      0.90      0.92       133

              accuracy                           0.95      1277
             macro avg       0.96      0.95      0.95      1277
          weighted avg       0.96      0.95      0.95      1277



In [None]:
!pip install transformers

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(news_data.target_names))




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# NOTE(review): this rebinds `tfidf` to a new, *unfitted* vectorizer
# (up to 20,000 features, unigrams and bigrams). No fit()/fit_transform() call
# on it is visible in this notebook, so the previously fitted vocabulary is
# lost and tfidf.transform() cannot be used until the vectorizer is fitted
# and a classifier retrained on its features.
tfidf = TfidfVectorizer(stop_words='english', max_features=20000, ngram_range=(1, 2))

In [None]:
# torch is needed only for the no-grad inference context below.
import torch

# Test with custom data
test_text = ["Angry Teens paralyze Bangladesh Capital"]

# Tokenize with the BERT tokenizer; returns PyTorch tensors, padded and
# truncated so the batch can be fed straight to the model.
inputs = tokenizer(test_text, return_tensors="pt", padding=True, truncation=True)

# Inference only: make sure dropout is disabled and skip gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Take the arg-max over the class dimension for each input text, then read the
# label of the first (and only) text. A bare argmax() would index into the
# flattened logits, which is wrong as soon as the batch has more than one row.
# NOTE(review): the classification head was reported as newly initialized when
# the model was loaded, so without fine-tuning this prediction is arbitrary.
predicted_label_index = outputs.logits.argmax(dim=-1)[0].item()
print(f"Prediction: {news_data.target_names[predicted_label_index]}")

Prediction: soc.religion.christian
