In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import contractions
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score


In [68]:
data_load = load_files('mini_newsgroups',encoding='latin1',decode_error='ignore')

In [69]:
data = pd.DataFrame({
    'text': data_load.data,
    'target': data_load.target,
    'category': [data_load.target_names[i] for i in data_load.target]
})

In [70]:
data

Unnamed: 0,text,target,category
0,Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....,4,comp.sys.mac.hardware
1,Newsgroups: sci.crypt\nPath: cantaloupe.srv.cs...,11,sci.crypt
2,"Newsgroups: sci.crypt,alt.privacy.clipper\nPat...",11,sci.crypt
3,Newsgroups: rec.autos\nPath: cantaloupe.srv.cs...,7,rec.autos
4,Xref: cantaloupe.srv.cs.cmu.edu soc.culture.ar...,17,talk.politics.mideast
...,...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu rec.motorcycle...,8,rec.motorcycles
1996,Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....,12,sci.electronics
1997,Newsgroups: talk.politics.guns\nPath: cantalou...,16,talk.politics.guns
1998,Path: cantaloupe.srv.cs.cmu.edu!rochester!udel...,5,comp.windows.x


In [71]:
def text_preprocessing(text):
    headers = [
        'Message-ID', 'Date', 'From', 'Subject', 'Path', 'Newsgroups', 'Xref',
        'Lines', 'Organization', 'References', 'Sender', 'Article-I.D.',
        'NNTP-Posting-Host', 'Summary', 'Distribution', 'Originator', 'X-Newsreader', 'Reply-To',
        'Keywords', 'Supersedes', 'Expires'
    ]
    pattern = r'^(?:' + '|'.join(headers) + r'):.*$'
    text = re.sub(pattern, '', text, flags=re.MULTILINE)

    # Remove lines with Host, Keywords, Supersedes, Expires anywhere
    text = re.sub(r'^.*(Host|Keywords|Supersedes|Expires).*$','', text, flags=re.MULTILINE)

    # Remove lines starting with 'In article' or similar reply references
    text = re.sub(r'^\s*In article.*$', '', text, flags=re.MULTILINE)

    # Remove quoted lines starting with '>'
    text = re.sub(r'^\s*>.*$', '', text, flags=re.MULTILINE)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove anything inside angled brackets <>
    text = re.sub(r'<[^>]+>', '', text)

    # Remove multiple newlines and replace with a single space
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


In [72]:
data['text'] = data['text'].apply(lambda x: text_preprocessing(x))

In [73]:
data

Unnamed: 0,text,target,category
0,Excuse me but... have not all Macs got a CPU!!...,4,comp.sys.mac.hardware
1,Deeply grateful for citations to any papers on...,11,sci.crypt
2,"The system, or 'family', key would appear to b...",11,sci.crypt
3,In (Mark Wayne Blunier) writes: Sheesh! I don'...,7,rec.autos
4,"Hate to be simple minded about this Tim, but I...",17,talk.politics.mideast
...,...,...,...
1995,Anyone interesting in a mailing list for Harle...,8,rec.motorcycles
1996,Does anyone know what causes the ever-growing ...,12,sci.electronics
1997,Two notes of interest from Texas: The Tarrant ...,16,talk.politics.guns
1998,Is there any FAQ list for Programming in X win...,5,comp.windows.x


In [74]:
data.head()

Unnamed: 0,text,target,category
0,Excuse me but... have not all Macs got a CPU!!...,4,comp.sys.mac.hardware
1,Deeply grateful for citations to any papers on...,11,sci.crypt
2,"The system, or 'family', key would appear to b...",11,sci.crypt
3,In (Mark Wayne Blunier) writes: Sheesh! I don'...,7,rec.autos
4,"Hate to be simple minded about this Tim, but I...",17,talk.politics.mideast


In [75]:
data['text'] = data['text'].apply(lambda x: word_tokenize(x))

In [76]:
data

Unnamed: 0,text,target,category
0,"[Excuse, me, but, ..., have, not, all, Macs, g...",4,comp.sys.mac.hardware
1,"[Deeply, grateful, for, citations, to, any, pa...",11,sci.crypt
2,"[The, system, ,, or, 'family, ', ,, key, would...",11,sci.crypt
3,"[In, (, Mark, Wayne, Blunier, ), writes, :, Sh...",7,rec.autos
4,"[Hate, to, be, simple, minded, about, this, Ti...",17,talk.politics.mideast
...,...,...,...
1995,"[Anyone, interesting, in, a, mailing, list, fo...",8,rec.motorcycles
1996,"[Does, anyone, know, what, causes, the, ever-g...",12,sci.electronics
1997,"[Two, notes, of, interest, from, Texas, :, The...",16,talk.politics.guns
1998,"[Is, there, any, FAQ, list, for, Programming, ...",5,comp.windows.x


In [77]:
stop_words = set(stopwords.words('english'))

In [78]:
def remove_stopwords(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

In [79]:
data['text'] = data['text'].apply(lambda x: remove_stopwords(x))

In [80]:
data

Unnamed: 0,text,target,category
0,"[Excuse, ..., Macs, got, CPU, !, !, !, Alain]",4,comp.sys.mac.hardware
1,"[Deeply, grateful, citations, papers, electron...",11,sci.crypt
2,"[system, ,, 'family, ', ,, key, would, appear,...",11,sci.crypt
3,"[(, Mark, Wayne, Blunier, ), writes, :, Sheesh...",7,rec.autos
4,"[Hate, simple, minded, Tim, ,, think, really, ...",17,talk.politics.mideast
...,...,...,...
1995,"[Anyone, interesting, mailing, list, Harley-Da...",8,rec.motorcycles
1996,"[anyone, know, causes, ever-growing, black, bo...",12,sci.electronics
1997,"[Two, notes, interest, Texas, :, Tarrant, Couo...",16,talk.politics.guns
1998,"[FAQ, list, Programming, X, windows, ?, Thankx...",5,comp.windows.x


In [81]:
lemmatizer = WordNetLemmatizer()

In [82]:
def get_wordnet_pos(treebank_tag):
    
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [83]:
def lemmatize_tokens(tokens):
    pos_tags = pos_tag(tokens)
    return [lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tags]

In [84]:
data['text'] = data['text'].apply(lambda x: lemmatize_tokens(x))

In [85]:
data

Unnamed: 0,text,target,category
0,"[Excuse, ..., Macs, get, CPU, !, !, !, Alain]",4,comp.sys.mac.hardware
1,"[Deeply, grateful, citation, paper, electronic...",11,sci.crypt
2,"[system, ,, 'family, ', ,, key, would, appear,...",11,sci.crypt
3,"[(, Mark, Wayne, Blunier, ), write, :, Sheesh,...",7,rec.autos
4,"[Hate, simple, mind, Tim, ,, think, really, si...",17,talk.politics.mideast
...,...,...,...
1995,"[Anyone, interest, mail, list, Harley-Davidson...",8,rec.motorcycles
1996,"[anyone, know, cause, ever-growing, black, bor...",12,sci.electronics
1997,"[Two, note, interest, Texas, :, Tarrant, Couon...",16,talk.politics.guns
1998,"[FAQ, list, Programming, X, window, ?, Thankx,...",5,comp.windows.x
