In [2]:
import os
import tarfile
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
import re

In [13]:
# Download NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MANORANJAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MANORANJAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MANORANJAN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
data_path = r"C:\Users\MANORANJAN\Downloads\twenty+newsgroups\20_newsgroups.tar.gz"
extracted_path = r"C:\Users\MANORANJAN\Downloads\twenty+newsgroups\20_newsgroups"

In [5]:
if not os.path.exists(extracted_path):
    with tarfile.open(data_path, 'r:gz') as tar:
        tar.extractall(path=os.path.dirname(data_path))

In [6]:
# Load the data from the specific category
category_path = os.path.join(extracted_path, 'misc.forsale')
documents = []

for file_name in os.listdir(category_path):
    file_path = os.path.join(category_path, file_name)
    with open(file_path, 'r', encoding='latin1') as file:
        documents.append(file.read())


In [7]:
# Preprocess the text data
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [8]:
def preprocess(text):
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()
    text = text.strip()
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

documents = [preprocess(doc) for doc in documents]

In [9]:
# Convert the text data into numerical form using TF-IDF
vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, stop_words='english')
X = vectorizer.fit_transform(documents)

In [10]:
# Apply LDA for topic modeling
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(X)

In [11]:
# Display the topics found by LDA
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)

Topic 0:
georgia institute technology pom gtcprismgatechedu gtaprismgatechedu hydragatechedu atlanta contains ralf
Topic 1:
rupindangdartmouthedu hanover newsdartvaxdartmouthedu dartmouth nh rupin dang uk amp manager
Topic 2:
steve keldsen lh sm dan research djkccwfccutexasedu steveaprroostertitantsdarlututexasedu texas glicker
Topic 3:
kou douglas hiram movie vhs custom live college barrel leepolarsunrnarockefelleredu
Topic 4:
sale miscforsale gmt university sender drive offer email nntppostinghost new
Topic 5:
bitzmcolumbiadsuedu eric hotel clemson voucher dakota typewriter sc bitz package
Topic 6:
brook writes stony bulletin commercial neal dennis york jack pchangicsunysbedu
Topic 7:
mile sega toyota brian oplingerracrdgecom car interior amfm genesis research
Topic 8:
nikon room writes junk camera andy aprlelandstanfordedu jumper mailing lens
Topic 9:
00 10 50 comic cover 15 copy 1st 20 25


In [12]:
# Apply K-means for document clustering
num_clusters = 10
km = KMeans(n_clusters=num_clusters, random_state=42)
km.fit(X)

In [14]:
# Attach the cluster labels to the documents
df = pd.DataFrame({'Document': documents, 'Cluster': km.labels_})

In [15]:
# Display the clustering results
print(df.head())

                                            Document  Cluster
0  path cantaloupesrvcscmuedurochesterudelgatechh...        9
1  path cantaloupesrvcscmueducrabapplesrvcscmuedu...        9
2  path cantaloupesrvcscmuedudasnewsharvardeduogi...        3
3  newsgroups miscforsale path cantaloupesrvcscmu...        0
4  path cantaloupesrvcscmueducrabapplesrvcscmuedu...        8


In [16]:
# Save the clustering results to a CSV file
df.to_csv('document_clusters.csv', index=False)