In [1]:
import os
import glob
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
import nltk

In [2]:
# Download NLTK stopwords
# Fetches the 'stopwords' corpus into the local nltk_data directory; when the
# package is already present nltk only reports that it is up to date. Because the
# call is the last expression in the cell, its return value is what gets displayed.
# NOTE(review): no visible cell reads nltk's stopwords (the TF-IDF step passes
# stop_words='english' to scikit-learn instead) -- confirm this download is needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Function to read all files from a directory and store them in a DataFrame
def load_newsgroups_data(base_path, encoding='latin-1'):
    """Read every file found one level below ``base_path`` into a DataFrame.

    Each immediate sub-directory of ``base_path`` is treated as one newsgroup:
    its name becomes the label of every regular file it directly contains.
    Entries of ``base_path`` that are not directories, and entries of a
    newsgroup directory that are not regular files, are skipped.

    Parameters
    ----------
    base_path : str
        Directory whose sub-directories hold the documents.
    encoding : str, default 'latin-1'
        Text encoding used to decode each file. It is passed explicitly so the
        result does not depend on the platform's default encoding; latin-1 maps
        every byte to a character, so no bytes are silently discarded.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'text'`` (file contents) and ``'label'`` (sub-directory
        name), one row per file that could be read. Rows are ordered by
        sub-directory name and then by file path, so repeated runs produce the
        same row order.

    Notes
    -----
    Files that raise ``PermissionError`` are reported on stdout and skipped.
    """
    texts = []
    labels = []
    # os.listdir and glob return entries in arbitrary order; sort both levels so
    # the row order (and everything derived from it) is reproducible.
    for newsgroup_dir in sorted(os.listdir(base_path)):
        newsgroup_path = os.path.join(base_path, newsgroup_dir)
        if not os.path.isdir(newsgroup_path):
            continue
        # Escape the directory part so characters such as '[' in a directory name
        # are matched literally instead of being read as a glob pattern.
        pattern = os.path.join(glob.escape(newsgroup_path), '*')
        for file_path in sorted(glob.glob(pattern)):
            if not os.path.isfile(file_path):  # Ensure it's a file
                continue
            try:
                with open(file_path, 'r', encoding=encoding, errors='ignore') as file:
                    text = file.read()
            except PermissionError:
                print(f"Permission denied for file: {file_path}")
                continue
            texts.append(text)
            labels.append(newsgroup_dir)
    return pd.DataFrame({'text': texts, 'label': labels})

In [4]:
# Read the corpus: the directory name below is resolved relative to the
# notebook's working directory, and every file under it becomes one row.
newsgroups_path = '20_newsgroups'
newsgroups_data = load_newsgroups_data(base_path=newsgroups_path)

In [5]:
# Sanity check on the loaded frame: list its columns, then preview the first rows.
for caption, shown in (
    ("DataFrame columns:", newsgroups_data.columns),
    ("Sample data:", newsgroups_data.head()),
):
    print(caption, shown)

DataFrame columns: Index(['text', 'label'], dtype='object')
Sample data:                                                 text        label
0  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...  alt.atheism
1  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism
2  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism
3  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism


In [6]:
# Check for empty documents and remove them
# A document counts as empty when it contains nothing but whitespace.
if 'text' in newsgroups_data.columns:
    empty_docs = newsgroups_data['text'].str.strip().eq('')
    print(f"Number of empty documents: {empty_docs.sum()}")
    # Keep an independent copy rather than a filtered slice of the original frame,
    # so adding columns to it later does not trigger SettingWithCopyWarning, and
    # renumber the rows so the index matches each row's position in the matrices
    # that are built from this frame.
    newsgroups_data = newsgroups_data.loc[~empty_docs].copy().reset_index(drop=True)
else:
    print("'text' column is missing in the DataFrame.")

Number of empty documents: 0


In [7]:
# Define a function to preprocess the text data
def preprocess_text(texts, max_features=1000, stop_words='english'):
    """Vectorise raw documents into a TF-IDF matrix.

    Parameters
    ----------
    texts : iterable of str
        The documents to vectorise, one string per document.
    max_features : int or None, default 1000
        Forwarded to ``TfidfVectorizer(max_features=...)`` to cap the vocabulary
        size. The default keeps the value this notebook has always used.
    stop_words : default 'english'
        Forwarded to ``TfidfVectorizer(stop_words=...)``. The default keeps the
        value this notebook has always used.

    Returns
    -------
    tuple
        ``(tfidf_matrix, feature_names)``: the result of ``fit_transform`` on
        ``texts`` and the vectoriser's ``get_feature_names_out()`` output.
        Column ``j`` of the matrix corresponds to ``feature_names[j]``.
    """
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(texts)
    return tfidf_matrix, vectorizer.get_feature_names_out()

In [8]:
# Build the TF-IDF matrix and its vocabulary from the 'text' column, or report
# why that is not possible.
if 'text' not in newsgroups_data.columns:
    print("Cannot preprocess text as 'text' column is missing.")
else:
    tfidf_matrix, feature_names = preprocess_text(newsgroups_data['text'])

In [9]:
# Map each newsgroup name in the 'label' column to an integer class id.
if 'label' not in newsgroups_data.columns:
    print("'label' column is missing in the DataFrame.")
else:
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(newsgroups_data['label'])

In [10]:
# K-means clustering
# Partition the TF-IDF document vectors into 20 clusters and keep one cluster id
# per document. random_state fixes the seeding; n_init is spelled out because the
# library default for it differs between scikit-learn releases, which would
# otherwise make the cluster assignment depend on the installed version.
if 'text' in newsgroups_data.columns:
    kmeans = KMeans(n_clusters=20, n_init=10, random_state=42)
    kmeans_labels = kmeans.fit_predict(tfidf_matrix)
else:
    print("Cannot perform K-means clustering as 'text' column is missing.")

In [11]:
# Latent Dirichlet Allocation (LDA)
# Fit a 20-component model on the document matrix, then project the same
# documents onto the fitted components to get one row of topic weights each.
# NOTE(review): the input here is the TF-IDF matrix; LDA is usually fitted on raw
# term counts -- confirm this choice is intentional.
if 'text' not in newsgroups_data.columns:
    print("Cannot perform LDA as 'text' column is missing.")
else:
    lda = LatentDirichletAllocation(n_components=20, random_state=42)
    lda_topics = lda.fit(tfidf_matrix).transform(tfidf_matrix)

In [12]:
# Function to print top words in each topic
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names for each row of ``model.components_``.

    For every topic a header line ``Topic #<index>:`` is printed, followed by a
    line with the ``n_top_words`` feature names that carry the largest weights,
    strongest first and separated by single spaces. One blank line is printed
    after the last topic.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as an iterable of weight arrays that
        support ``argsort()``.
    feature_names : sequence of str
        ``feature_names[i]`` is the word belonging to weight column ``i``.
    n_top_words : int
        How many words to list per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        strongest_first = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[position] for position in strongest_first]
        print(f"Topic #{topic_idx}:")
        print(" ".join(top_words))
    print()

In [13]:
# List the ten highest-weighted vocabulary words of every LDA topic.
if 'text' not in newsgroups_data.columns:
    print("Cannot print LDA topics as 'text' column is missing.")
else:
    n_top_words = 10
    print("Topics in LDA model:")
    print_top_words(lda, feature_names, n_top_words)

Topics in LDA model:
Topic #0:
edu uiuc crypt clipper cso com org key security netcom
Topic #1:
edu sci digex space access com med cmu astro net
Topic #2:
nasa caltech gov jpl gap edu keith elroy space uci
Topic #3:
edu ca sport hockey baseball game rec team com news
Topic #4:
stratus ti space edu isc sci com food transfer cmu
Topic #5:
rutgers christian edu athos geneva soc igor aramis hedrick religion
Topic #6:
edu hp mit mil com navy portal buffalo du convex
Topic #7:
edu windows comp graphics os ms com cmu cs ac
Topic #8:
edu talk atheism com religion sandvik alt abortion apple sni
Topic #9:
att cb com uwm nj na 00 50 vs distribution
Topic #10:
cwru pitt freenet cleveland ins edu po sun western usenet
Topic #11:
edu com rec autos cmu ca uk car motorcycles news
Topic #12:
edu columbia cc jhu ctr cmu news mot uchicago sol
Topic #13:
edu sys ibm hardware mac comp pc com cmu drive
Topic #14:
god rutgers edu christian jesus religion people bible athos church
Topic #15:
edu politics talk

In [15]:
# Attach the clustering results to the documents: the K-means cluster id, and
# for LDA the index of the largest value in each document's row of topic weights.
if 'text' not in newsgroups_data.columns:
    print("Cannot add clustering labels as 'text' column is missing.")
else:
    newsgroups_data['kmeans_label'] = kmeans_labels
    # argmax along axis 1 picks, per row, the column holding the maximum.
    newsgroups_data['lda_topic'] = lda_topics.argmax(axis=1)
    print(newsgroups_data.head())

                                                text        label  \
0  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...  alt.atheism   
1  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism   
2  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism   
3  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism   
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism   

   kmeans_label  lda_topic  
0             7         14  
1             7         14  
2             7         14  
3             2         14  
4             9         13  
