In [49]:
import re
import nltk

In [50]:
text = "Microsoft Corporation is an American multinational corporation and technology conglomerate headquartered in Redmond, Washington.[2] Founded in 1975, the company became influential in the rise of personal computers through software like Windows, and the company has since expanded to Internet services, cloud computing, video gaming and other fields. Microsoft is the largest software maker, one of the most valuable public U.S. companies,[a] and one of the most valuable brands globally."

1. Lower Case

In [51]:
text = text.lower()
text

'microsoft corporation is an american multinational corporation and technology conglomerate headquartered in redmond, washington.[2] founded in 1975, the company became influential in the rise of personal computers through software like windows, and the company has since expanded to internet services, cloud computing, video gaming and other fields. microsoft is the largest software maker, one of the most valuable public u.s. companies,[a] and one of the most valuable brands globally.'


2. Remove Symbols and Numbers

In [52]:
text = re.sub(r'[^a-zA-Z\s]','', text)
text

'microsoft corporation is an american multinational corporation and technology conglomerate headquartered in redmond washington founded in  the company became influential in the rise of personal computers through software like windows and the company has since expanded to internet services cloud computing video gaming and other fields microsoft is the largest software maker one of the most valuable public us companiesa and one of the most valuable brands globally'

3. Tokenize Word

In [53]:
tokens = nltk.word_tokenize(text)
tokens

['microsoft',
 'corporation',
 'is',
 'an',
 'american',
 'multinational',
 'corporation',
 'and',
 'technology',
 'conglomerate',
 'headquartered',
 'in',
 'redmond',
 'washington',
 'founded',
 'in',
 'the',
 'company',
 'became',
 'influential',
 'in',
 'the',
 'rise',
 'of',
 'personal',
 'computers',
 'through',
 'software',
 'like',
 'windows',
 'and',
 'the',
 'company',
 'has',
 'since',
 'expanded',
 'to',
 'internet',
 'services',
 'cloud',
 'computing',
 'video',
 'gaming',
 'and',
 'other',
 'fields',
 'microsoft',
 'is',
 'the',
 'largest',
 'software',
 'maker',
 'one',
 'of',
 'the',
 'most',
 'valuable',
 'public',
 'us',
 'companiesa',
 'and',
 'one',
 'of',
 'the',
 'most',
 'valuable',
 'brands',
 'globally']

4. Stopwords removal/filtering

In [54]:
from nltk.corpus import stopwords

In [55]:
stopW = stopwords.words('English')
stopW

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [56]:
filtered = []
for word in tokens :
    if word not in stopW:
        filtered.append(word)
filtered        


['microsoft',
 'corporation',
 'american',
 'multinational',
 'corporation',
 'technology',
 'conglomerate',
 'headquartered',
 'redmond',
 'washington',
 'founded',
 'company',
 'became',
 'influential',
 'rise',
 'personal',
 'computers',
 'software',
 'like',
 'windows',
 'company',
 'since',
 'expanded',
 'internet',
 'services',
 'cloud',
 'computing',
 'video',
 'gaming',
 'fields',
 'microsoft',
 'largest',
 'software',
 'maker',
 'one',
 'valuable',
 'public',
 'us',
 'companiesa',
 'one',
 'valuable',
 'brands',
 'globally']

5. Stemming/Lemmetization

In [57]:
from nltk.stem import PorterStemmer

In [58]:
stemmer = PorterStemmer()

In [59]:
stmmedWords =[stemmer.stem(words) for words in filtered]
stmmedWords

['microsoft',
 'corpor',
 'american',
 'multin',
 'corpor',
 'technolog',
 'conglomer',
 'headquart',
 'redmond',
 'washington',
 'found',
 'compani',
 'becam',
 'influenti',
 'rise',
 'person',
 'comput',
 'softwar',
 'like',
 'window',
 'compani',
 'sinc',
 'expand',
 'internet',
 'servic',
 'cloud',
 'comput',
 'video',
 'game',
 'field',
 'microsoft',
 'largest',
 'softwar',
 'maker',
 'one',
 'valuabl',
 'public',
 'us',
 'companiesa',
 'one',
 'valuabl',
 'brand',
 'global']

6. Back to Sentance

In [60]:
preprocessed = ' '.join(stmmedWords)
preprocessed

'microsoft corpor american multin corpor technolog conglomer headquart redmond washington found compani becam influenti rise person comput softwar like window compani sinc expand internet servic cloud comput video game field microsoft largest softwar maker one valuabl public us companiesa one valuabl brand global'

In [61]:
from nltk.stem import WordNetLemmatizer

In [62]:
lemmatization = WordNetLemmatizer()

In [63]:
lemmetize = [lemmatization.lemmatize(words) for words in filtered]
lemmetize

['microsoft',
 'corporation',
 'american',
 'multinational',
 'corporation',
 'technology',
 'conglomerate',
 'headquartered',
 'redmond',
 'washington',
 'founded',
 'company',
 'became',
 'influential',
 'rise',
 'personal',
 'computer',
 'software',
 'like',
 'window',
 'company',
 'since',
 'expanded',
 'internet',
 'service',
 'cloud',
 'computing',
 'video',
 'gaming',
 'field',
 'microsoft',
 'largest',
 'software',
 'maker',
 'one',
 'valuable',
 'public',
 'u',
 'companiesa',
 'one',
 'valuable',
 'brand',
 'globally']

In [64]:
lem_preprocess = ' '.join(lemmetize)
lem_preprocess

'microsoft corporation american multinational corporation technology conglomerate headquartered redmond washington founded company became influential rise personal computer software like window company since expanded internet service cloud computing video gaming field microsoft largest software maker one valuable public u companiesa one valuable brand globally'