**Exp5 NLP basics with Built in stuff on colab**

In [5]:
# Concise NLP Operations Demo
# --------------------------

# Essential imports
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# 1. Load a subset of the 20 Newsgroups dataset (just 2 categories for simplicity)
categories = ['alt.atheism', 'soc.religion.christian']
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=('headers', 'footers', 'quotes')
)

# Quick dataset overview
print(f"Dataset: {len(newsgroups.data)} documents, {len(categories)} categories")
print(f"Sample text:\n{newsgroups.data[0][:300]}...\n")

# 2. Simple text preprocessing function
def basic_preprocess(text):
    # Convert to lowercase and remove punctuation/digits using a single comprehension
    return ' '.join(word.lower() for word in text.split()
                   if word.isalpha() and len(word) > 2)

# Process a sample
sample_processed = basic_preprocess(newsgroups.data[0])
print(f"After preprocessing:\n{sample_processed[:300]}...\n")

# 3. Text vectorization - TF-IDF
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
X = tfidf.fit_transform(newsgroups.data)
y = newsgroups.target

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(f"Training data: {X_train.shape}, Test data: {X_test.shape}")

# 4. Train a classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# 5. Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=categories))

# 6. Feature importance - most distinctive words for each category
feature_names = np.array(tfidf.get_feature_names_out())

def show_top_features(classifier, feature_names, class_labels, n=10):
    for i, class_label in enumerate(class_labels):
        top_indices = np.argsort(classifier.feature_log_prob_[i])[-n:]
        top_features = feature_names[top_indices]
        print(f"\nTop {n} words for '{class_label}':")
        print(", ".join(top_features))

show_top_features(clf, feature_names, categories)

# 7. Classify a new document
new_docs = [
    "I believe in the power of faith and Jesus Christ as our savior.",
    "Religion has no scientific basis and relies on blind faith."
]

# Vectorize and predict
new_docs_tfidf = tfidf.transform(new_docs)
predictions = clf.predict(new_docs_tfidf)

print("\nDocument Classification:")
for doc, pred in zip(new_docs, predictions):
    print(f"Text: '{doc[:50]}...'")
    print(f"Predicted: {categories[pred]}\n")

print("NLP operations demo complete!")

Dataset: 1796 documents, 2 categories
Sample text:
genealogical 
old 

Well, since my wife is (in your gentle term) a "bastard", I can
probably speak with a bit of authority on this. Any "stigma"
associated with children conceived and/or born out of wedlock rests
solely upon the parents--they've committed a sexual transgression for
which they should...

After preprocessing:
genealogical old since wife your gentle can probably speak with bit authority any associated with children conceived born out wedlock rests solely upon the committed sexual transgression for which they should the child itself has priori limitations him the concept blaming the child for the sins one ...

Training data: (1257, 1000), Test data: (539, 1000)

Accuracy: 0.8182

Classification Report:
                        precision    recall  f1-score   support

           alt.atheism       0.80      0.77      0.79       235
soc.religion.christian       0.83      0.85      0.84       304

              accuracy        