In [16]:
# Import library
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.decomposition import LatentDirichletAllocation


# Dataset: load two categories of the 20 Newsgroups corpus (not movie reviews).
# Headers, footers and quoted replies are stripped so that only the body text
# of each post is kept.
from sklearn.datasets import fetch_20newsgroups

categories = ['rec.sport.baseball', 'sci.med']  # Example categories
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Display sample data: the first document and the class names
print("Sample Text:")
print(data.data[0])
print("\nTarget Classes:", data.target_names)



Sample Text:
My family doctor and the physiotherapist (PT) she sent me to agree that the
pain in my left shoulder is bursitis. I have an appointment with an orthpod
(I love that, it's short for 'orthopedic surgeon, apparently) but while I'm
waiting the PT is treating me.

She's using hot packs, ultrasound, and lasers, but there's no improvement
yet. In fact, I almost suspect it's getting worse.

My real question is about the laser treatment. I can't easily imagine what
the physical effect that could have on a deep tissue problem. Can anyone
shed some light (so to speak) on the matter?

Target Classes: ['rec.sport.baseball', 'sci.med']


In [17]:


# Hold out 30% of the documents for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.3,
    random_state=42,
)


# Section 3: Representing Text Data as Bag-of-Words
# The vocabulary is fitted on the training documents and then reused,
# unchanged, to encode the test documents.
vectorizer_bow = CountVectorizer()
X_train_bow = vectorizer_bow.fit_transform(X_train)
X_test_bow = vectorizer_bow.transform(X_test)

print(f"\nBag-of-Words Representation (Shape): {X_train_bow.shape}")


# Section 6: Stopwords
# Same bag-of-words encoding, but with scikit-learn's built-in English
# stop-word list applied.
vectorizer_bow_stopwords = CountVectorizer(stop_words='english')
X_train_bow_stop = vectorizer_bow_stopwords.fit_transform(X_train)
print(f"\nBag-of-Words without Stopwords (Shape): {X_train_bow_stop.shape}")


# Section 7: Rescaling the Data with tf-idf
# Fitted on the training documents only; the test documents are transformed
# with the already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"\nTF-IDF Representation (Shape): {X_train_tfidf.shape}")


# Section 8: Investigating Model Coefficients
# Train a multinomial Naive Bayes classifier on the tf-idf features
# (fit() hands back the fitted estimator, so it can be bound directly).
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Display top features per class
def display_top_features(vectorizer, model, class_labels, n=10):
    """Print the ``n`` highest-scoring features of every class.

    For each class the features are ranked by the corresponding row of
    ``model.feature_log_prob_`` and the ``n`` largest are printed in
    ascending order, i.e. the single highest-scoring feature comes last.

    Note that the ranking uses the per-class log-probability only, so words
    that score highly in every class show up in every class's list.

    Args:
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        model: Fitted model exposing ``feature_log_prob_``, indexed first by
            class position and then by feature position.
        class_labels: Display names of the classes, in the same order as the
            rows of ``feature_log_prob_``.
        n: Number of features to show per class. Values <= 0 show none.

    Returns:
        None. The result is written to standard output.
    """
    feature_names = np.asarray(vectorizer.get_feature_names_out())
    for class_index, class_label in enumerate(class_labels):
        ranked = np.argsort(model.feature_log_prob_[class_index])
        # A plain ``ranked[-n:]`` returns *every* feature when n == 0
        # (because -0 == 0), so non-positive n is handled explicitly.
        top_features = ranked[-n:] if n > 0 else ranked[:0]
        print(f"\nTop features for class '{class_label}':")
        print(feature_names[top_features])

# Show the highest-scoring tf-idf features of the fitted classifier for each newsgroup
display_top_features(
    vectorizer=tfidf_vectorizer,
    model=model,
    class_labels=data.target_names,
)




Bag-of-Words Representation (Shape): (1388, 19243)

Bag-of-Words without Stopwords (Shape): (1388, 18946)

TF-IDF Representation (Shape): (1388, 19243)

Top features for class 'rec.sport.baseball':
['it' 'you' 'is' 'that' 'and' 'of' 'in' 'he' 'to' 'the']

Top features for class 'sci.med':
['for' 'you' 'that' 'in' 'it' 'and' 'is' 'of' 'to' 'the']


In [18]:


# Section 9: Bag-of-Words with More Than One Word (n-Grams)
# ngram_range=(1, 2) counts single words as well as pairs of adjacent words
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_ngram = ngram_vectorizer.fit_transform(X_train)
print(f"\nBag-of-Words with n-Grams (Shape): {X_train_ngram.shape}")


# Section 11: Topic Modeling and Document Clustering (LDA)
# Fit a two-topic LDA model on the raw bag-of-words counts of the training set.
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(X_train_bow)

# Fetch the vocabulary once instead of calling get_feature_names_out()
# again for every single word that gets printed.
bow_feature_names = vectorizer_bow.get_feature_names_out()

print("\nTop words per topic:")
for idx, topic in enumerate(lda.components_):
    # argsort is ascending, so the last 10 indices are the 10 heaviest words
    # of the topic (the heaviest one is printed last).
    top_words = [bow_feature_names[i] for i in topic.argsort()[-10:]]
    print(f"Topic {idx}: {' '.join(top_words)}")

# Section 12: Evaluating the Model
# Predict the held-out documents from their tf-idf features and report
# per-class precision / recall / F1.
y_pred = model.predict(X_test_tfidf)
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=data.target_names),
    sep="\n",
)


# Section 13: Summary and Outlook
# One print call; sep="\n" puts every item on its own line.
print(
    "\nSummary:",
    "1. Bag-of-Words and TF-IDF are fundamental for text data representation.",
    "2. Stopwords and n-Grams enhance feature extraction.",
    "3. LDA helps in unsupervised topic discovery.",
    sep="\n",
)




Bag-of-Words with n-Grams (Shape): (1388, 140994)

Top words per topic:
Topic 0: you for it that is in and of to the
Topic 1: cancer 92 10 edu to for in and the of

Classification Report:
                    precision    recall  f1-score   support

rec.sport.baseball       0.97      0.97      0.97       294
           sci.med       0.97      0.97      0.97       302

          accuracy                           0.97       596
         macro avg       0.97      0.97      0.97       596
      weighted avg       0.97      0.97      0.97       596


Summary:
1. Bag-of-Words and TF-IDF are fundamental for text data representation.
2. Stopwords and n-Grams enhance feature extraction.
3. LDA helps in unsupervised topic discovery.
