### Importing Libraries

In [1]:
import numpy as np

### Importing Dataset

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
text_data = fetch_20newsgroups()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
type(text_data)

sklearn.utils.Bunch

In [5]:
raw_text = text_data.data[:4]                                                    #first four paragraphs
raw_text

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

### Data Preprocessing

#### Stage 1 : Convert to lower text

In [6]:
clean_text_1 = []
def to_lower_case(data):
    for words in raw_text:
      clean_text_1.append(str.lower(words))

In [7]:
to_lower_case(raw_text)

In [8]:
clean_text_1

["from: lerxst@wam.umd.edu (where's my thing)\nsubject: what car is this!?\nnntp-posting-host: rac3.wam.umd.edu\norganization: university of maryland, college park\nlines: 15\n\n i was wondering if anyone out there could enlighten me on this car i saw\nthe other day. it was a 2-door sports car, looked to be from the late 60s/\nearly 70s. it was called a bricklin. the doors were really small. in addition,\nthe front bumper was separate from the rest of the body. this is \nall i know. if anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nthanks,\n- il\n   ---- brought to you by your neighborhood lerxst ----\n\n\n\n\n",
 "from: guykuo@carson.u.washington.edu (guy kuo)\nsubject: si clock poll - final call\nsummary: final call for si clock reports\nkeywords: si,acceleration,clock,upgrade\narticle-i.d.: shelley.1qvfo9innc3s\norganization: university of washington\nlines: 

####Stage 2 : Tokenization

In [9]:
clean_text_2=[]
from nltk.tokenize import sent_tokenize , word_tokenize                         #for sentence and word tokenization
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
#word tokenize

clean_text_2 = [word_tokenize(i) for  i in clean_text_1]

In [11]:
clean_text_2

[['from',
  ':',
  'lerxst',
  '@',
  'wam.umd.edu',
  '(',
  'where',
  "'s",
  'my',
  'thing',
  ')',
  'subject',
  ':',
  'what',
  'car',
  'is',
  'this',
  '!',
  '?',
  'nntp-posting-host',
  ':',
  'rac3.wam.umd.edu',
  'organization',
  ':',
  'university',
  'of',
  'maryland',
  ',',
  'college',
  'park',
  'lines',
  ':',
  '15',
  'i',
  'was',
  'wondering',
  'if',
  'anyone',
  'out',
  'there',
  'could',
  'enlighten',
  'me',
  'on',
  'this',
  'car',
  'i',
  'saw',
  'the',
  'other',
  'day',
  '.',
  'it',
  'was',
  'a',
  '2-door',
  'sports',
  'car',
  ',',
  'looked',
  'to',
  'be',
  'from',
  'the',
  'late',
  '60s/',
  'early',
  '70s',
  '.',
  'it',
  'was',
  'called',
  'a',
  'bricklin',
  '.',
  'the',
  'doors',
  'were',
  'really',
  'small',
  '.',
  'in',
  'addition',
  ',',
  'the',
  'front',
  'bumper',
  'was',
  'separate',
  'from',
  'the',
  'rest',
  'of',
  'the',
  'body',
  '.',
  'this',
  'is',
  'all',
  'i',
  'know',
  '

####Stage 3 : Converting to regular expressions

In [12]:
import re                             #regular expression , to remove punctuations , special characters etc.

clean_text_3 = []

for words in clean_text_2:
  clean = []
  for w in words :
    res = re.sub(r'[^\w\s]', "" , w)
    if res != "":
      clean.append(res)
    clean_text_3.append(clean)

In [13]:
clean_text_3

Output hidden; open in https://colab.research.google.com to view.

#### Stage 4 : Removing stopwords 

In [14]:
#removing repeated words which dont have any meaning in the sentense like "the" 
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
from nltk.corpus import stopwords

In [16]:
clean_text_4 = []

for words in clean_text_3 :
  w = []
  for word in words :
    if not word in stopwords.words('english'):
      w.append(word)
    clean_text_4.append(w)  

In [17]:
clean_text_4

[['lerxst',
  'wamumdedu',
  'thing',
  'subject',
  'car',
  'nntppostinghost',
  'rac3wamumdedu',
  'organization',
  'university',
  'maryland',
  'college',
  'park',
  'lines',
  '15',
  'wondering',
  'anyone',
  'could',
  'enlighten',
  'car',
  'saw',
  'day',
  '2door',
  'sports',
  'car',
  'looked',
  'late',
  '60s',
  'early',
  '70s',
  'called',
  'bricklin',
  'doors',
  'really',
  'small',
  'addition',
  'front',
  'bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'production',
  'car',
  'made',
  'history',
  'whatever',
  'info',
  'funky',
  'looking',
  'car',
  'please',
  'email',
  'thanks',
  'il',
  'brought',
  'neighborhood',
  'lerxst'],
 ['lerxst',
  'wamumdedu',
  'thing',
  'subject',
  'car',
  'nntppostinghost',
  'rac3wamumdedu',
  'organization',
  'university',
  'maryland',
  'college',
  'park',
  'lines',
  '15',
  'wondering',
  'anyone',
  'could',
  'enli

####Stage 5 : Stemming

In [18]:
#Converting words into their root form , removing prefix , postfix

from nltk.stem.porter import PorterStemmer
port = PorterStemmer()

In [19]:
a = [port.stem(i) for i in ["reading" , "washing" , "wash" ,"Driving"]]
a

['read', 'wash', 'wash', 'drive']

In [20]:
clean_text_5 = []

for words in clean_text_4 :
  w = []
  for word in words:
    w.append(word)
  clean_text_5.append(w)

In [21]:
clean_text_5 

[['lerxst',
  'wamumdedu',
  'thing',
  'subject',
  'car',
  'nntppostinghost',
  'rac3wamumdedu',
  'organization',
  'university',
  'maryland',
  'college',
  'park',
  'lines',
  '15',
  'wondering',
  'anyone',
  'could',
  'enlighten',
  'car',
  'saw',
  'day',
  '2door',
  'sports',
  'car',
  'looked',
  'late',
  '60s',
  'early',
  '70s',
  'called',
  'bricklin',
  'doors',
  'really',
  'small',
  'addition',
  'front',
  'bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'production',
  'car',
  'made',
  'history',
  'whatever',
  'info',
  'funky',
  'looking',
  'car',
  'please',
  'email',
  'thanks',
  'il',
  'brought',
  'neighborhood',
  'lerxst'],
 ['lerxst',
  'wamumdedu',
  'thing',
  'subject',
  'car',
  'nntppostinghost',
  'rac3wamumdedu',
  'organization',
  'university',
  'maryland',
  'college',
  'park',
  'lines',
  '15',
  'wondering',
  'anyone',
  'could',
  'enli

####Stage 6 : Lemmatization

In [22]:
from nltk.stem.wordnet import WordNetLemmatizer
wnet = WordNetLemmatizer()

In [23]:
import nltk
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [24]:
lem = []

for words in clean_text_4:
  w = []
  for word in words:
    w.append(wnet.lemmatize(word))
  lem.append(w)  

In [25]:
lem

[['lerxst',
  'wamumdedu',
  'thing',
  'subject',
  'car',
  'nntppostinghost',
  'rac3wamumdedu',
  'organization',
  'university',
  'maryland',
  'college',
  'park',
  'line',
  '15',
  'wondering',
  'anyone',
  'could',
  'enlighten',
  'car',
  'saw',
  'day',
  '2door',
  'sport',
  'car',
  'looked',
  'late',
  '60',
  'early',
  '70',
  'called',
  'bricklin',
  'door',
  'really',
  'small',
  'addition',
  'front',
  'bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'spec',
  'year',
  'production',
  'car',
  'made',
  'history',
  'whatever',
  'info',
  'funky',
  'looking',
  'car',
  'please',
  'email',
  'thanks',
  'il',
  'brought',
  'neighborhood',
  'lerxst'],
 ['lerxst',
  'wamumdedu',
  'thing',
  'subject',
  'car',
  'nntppostinghost',
  'rac3wamumdedu',
  'organization',
  'university',
  'maryland',
  'college',
  'park',
  'line',
  '15',
  'wondering',
  'anyone',
  'could',
  'enlighten',


####Difference between original raw text and Cleaned data

In [26]:
print(raw_text)

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n", "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 1

In [27]:
print(clean_text_5[:1])

[['lerxst', 'wamumdedu', 'thing', 'subject', 'car', 'nntppostinghost', 'rac3wamumdedu', 'organization', 'university', 'maryland', 'college', 'park', 'lines', '15', 'wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', '2door', 'sports', 'car', 'looked', 'late', '60s', 'early', '70s', 'called', 'bricklin', 'doors', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please', 'email', 'thanks', 'il', 'brought', 'neighborhood', 'lerxst']]
