In [1]:
import re
import unicodedata
from pprint import pprint

import nltk
import pandas as pd

import acquire
import prepare

def clean(text: str) -> list:
    """Turn raw text into a list of lemmatized, stopword-free tokens.

    The text is folded to ASCII, lower-cased, stripped of punctuation and
    split on whitespace; English stopwords are dropped and the remaining
    words are lemmatized with WordNet.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list
        Lemmatized tokens in their original order (empty if nothing is left).
    """
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # NFKD decomposes accented letters into base letter + combining mark, so
    # the ASCII encode below drops only the mark ("café" -> "cafe") instead of
    # deleting the whole character ("café" -> "caf").
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # Remove everything that is neither a word character nor whitespace, then
    # split on whitespace to tokenize.
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

In [2]:
# Toy corpus of three short documents; the vectorizers in this notebook are
# fit on this list.
data = [
    'Python is pretty cool',
    'Python is a nice programming language with nice syntax',
    'I think SQL is cool too',
]

In [3]:
# Pretty-print the corpus so each document appears on its own line.
pprint(data)


['Python is pretty cool',
 'Python is a nice programming language with nice syntax',
 'I think SQL is cool too']


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Same two-step pattern as any sklearn transformer:
# 1. create the vectorizer with its default settings
cv = CountVectorizer()
# 2. fit it on the corpus and transform the documents in one call; the result
#    is a sparse matrix with one row per document and one column per term
bag_of_words = cv.fit_transform(data)

In [5]:
# Bare expression so the notebook shows the sparse matrix summary
# (shape, dtype, number of stored elements).
bag_of_words

<3x12 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the same corpus. Note that this rebinds
# `bag_of_words`, replacing the raw-count matrix from the CountVectorizer cell.
tfidf = TfidfVectorizer()
bag_of_words = tfidf.fit_transform(data)
pprint(data)
# Show the weights as a dense table with one column per vocabulary term.
# - get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#   get_feature_names_out() is the supported replacement.
# - toarray() returns a plain ndarray, whereas todense() returns np.matrix,
#   which NumPy discourages; the resulting DataFrame is the same.
pd.DataFrame(bag_of_words.toarray(),
             columns=tfidf.get_feature_names_out())

['Python is pretty cool',
 'Python is a nice programming language with nice syntax',
 'I think SQL is cool too']




Unnamed: 0,cool,is,language,nice,pretty,programming,python,sql,syntax,think,too,with
0,0.480458,0.373119,0.0,0.0,0.631745,0.0,0.480458,0.0,0.0,0.0,0.0,0.0
1,0.0,0.197673,0.334689,0.669378,0.0,0.334689,0.25454,0.0,0.334689,0.0,0.0,0.334689
2,0.38377,0.298032,0.0,0.0,0.0,0.0,0.0,0.504611,0.0,0.504611,0.504611,0.0


In [7]:
# Load the news articles through the project's acquire module. The displayed
# frame has title, content and category columns.
# NOTE(review): whether this hits the network or a local cache is not visible
# here — check acquire.get_news_articles.
news = acquire.get_news_articles()
news

Unnamed: 0,title,content,category
0,"Tata Power hit by cyberattack, says 'all criti...",Tata Power has confirmed that it was hit by a ...,technology
1,What are the rates at which employees are leav...,Infosys has reported an attrition rate of 27.1...,technology
2,Infosys let go of employees working for two co...,Infosys CEO Salil Parekh has revealed that the...,technology
3,Electric car goes from zero to 100 kmph in 1.4...,An electric car in Germany has set the Guinnes...,technology
4,SpaceX says it cannot fund Starlink in Ukraine...,"SpaceX, in a letter to the Pentagon, said that...",technology
...,...,...,...
95,Twitter user calls Big B 'weakest link' of 'Ch...,Veteran actor Dharmendra responded to a fan wh...,entertainment
96,"John is a gentleman, was a pleasure to work wi...","Actress Manushi Chhillar, speaking about her u...",entertainment
97,It's an Indian spy-thriller without bikinis & ...,"Actor Karthi, while speaking about his upcomin...",entertainment
98,"Was obsessed with Sridevi, I used to imitate h...",Actress Shefali Shah said in an interview that...,entertainment


In [9]:
# First cleaning pass: derive a new `cleaned` column from the raw article text.
# Reads from `content`, so re-running this cell restarts the pipeline cleanly.
news['cleaned'] = news['content'].apply(prepare.basic_clean)

In [10]:
# Inspect the frame: `cleaned` now shows the text lower-cased with punctuation
# removed (e.g. "27.1" has become "271").
news

Unnamed: 0,title,content,category,cleaned
0,"Tata Power hit by cyberattack, says 'all criti...",Tata Power has confirmed that it was hit by a ...,technology,tata power has confirmed that it was hit by a ...
1,What are the rates at which employees are leav...,Infosys has reported an attrition rate of 27.1...,technology,infosys has reported an attrition rate of 271 ...
2,Infosys let go of employees working for two co...,Infosys CEO Salil Parekh has revealed that the...,technology,infosys ceo salil parekh has revealed that the...
3,Electric car goes from zero to 100 kmph in 1.4...,An electric car in Germany has set the Guinnes...,technology,an electric car in germany has set the guinnes...
4,SpaceX says it cannot fund Starlink in Ukraine...,"SpaceX, in a letter to the Pentagon, said that...",technology,spacex in a letter to the pentagon said that i...
...,...,...,...,...
95,Twitter user calls Big B 'weakest link' of 'Ch...,Veteran actor Dharmendra responded to a fan wh...,entertainment,veteran actor dharmendra responded to a fan wh...
96,"John is a gentleman, was a pleasure to work wi...","Actress Manushi Chhillar, speaking about her u...",entertainment,actress manushi chhillar speaking about her up...
97,It's an Indian spy-thriller without bikinis & ...,"Actor Karthi, while speaking about his upcomin...",entertainment,actor karthi while speaking about his upcoming...
98,"Was obsessed with Sridevi, I used to imitate h...",Actress Shefali Shah said in an interview that...,entertainment,actress shefali shah said in an interview that...


In [12]:
# Second pass: strip stopwords. This overwrites `cleaned` in place, so the
# previous (basic-cleaned) text is no longer available after this cell runs.
news['cleaned'] = news['cleaned'].apply(prepare.remove_stopwords)

In [13]:
# Inspect the frame: words such as "has", "that", "it" and "was" are gone from
# `cleaned`.
news

Unnamed: 0,title,content,category,cleaned
0,"Tata Power hit by cyberattack, says 'all criti...",Tata Power has confirmed that it was hit by a ...,technology,tata power confirmed hit cyberattack impacted ...
1,What are the rates at which employees are leav...,Infosys has reported an attrition rate of 27.1...,technology,infosys reported attrition rate 271 july septe...
2,Infosys let go of employees working for two co...,Infosys CEO Salil Parekh has revealed that the...,technology,infosys ceo salil parekh revealed company let ...
3,Electric car goes from zero to 100 kmph in 1.4...,An electric car in Germany has set the Guinnes...,technology,electric car germany set guinness world record...
4,SpaceX says it cannot fund Starlink in Ukraine...,"SpaceX, in a letter to the Pentagon, said that...",technology,spacex letter pentagon said longer donate fund...
...,...,...,...,...
95,Twitter user calls Big B 'weakest link' of 'Ch...,Veteran actor Dharmendra responded to a fan wh...,entertainment,veteran actor dharmendra responded fan tweeted...
96,"John is a gentleman, was a pleasure to work wi...","Actress Manushi Chhillar, speaking about her u...",entertainment,actress manushi chhillar speaking upcoming fil...
97,It's an Indian spy-thriller without bikinis & ...,"Actor Karthi, while speaking about his upcomin...",entertainment,actor karthi speaking upcoming film sardar sai...
98,"Was obsessed with Sridevi, I used to imitate h...",Actress Shefali Shah said in an interview that...,entertainment,actress shefali shah said interview earlier ob...


In [15]:
# Third pass: lemmatize the remaining words. Like the stopword step, this
# overwrites `cleaned` in place.
news['cleaned'] = news['cleaned'].apply(prepare.lemmatize)

In [16]:
# Inspect the frame after lemmatization. The truncated preview looks the same
# as before this step; any differences would be further into each text.
news

Unnamed: 0,title,content,category,cleaned
0,"Tata Power hit by cyberattack, says 'all criti...",Tata Power has confirmed that it was hit by a ...,technology,tata power confirmed hit cyberattack impacted ...
1,What are the rates at which employees are leav...,Infosys has reported an attrition rate of 27.1...,technology,infosys reported attrition rate 271 july septe...
2,Infosys let go of employees working for two co...,Infosys CEO Salil Parekh has revealed that the...,technology,infosys ceo salil parekh revealed company let ...
3,Electric car goes from zero to 100 kmph in 1.4...,An electric car in Germany has set the Guinnes...,technology,electric car germany set guinness world record...
4,SpaceX says it cannot fund Starlink in Ukraine...,"SpaceX, in a letter to the Pentagon, said that...",technology,spacex letter pentagon said longer donate fund...
...,...,...,...,...
95,Twitter user calls Big B 'weakest link' of 'Ch...,Veteran actor Dharmendra responded to a fan wh...,entertainment,veteran actor dharmendra responded fan tweeted...
96,"John is a gentleman, was a pleasure to work wi...","Actress Manushi Chhillar, speaking about her u...",entertainment,actress manushi chhillar speaking upcoming fil...
97,It's an Indian spy-thriller without bikinis & ...,"Actor Karthi, while speaking about his upcomin...",entertainment,actor karthi speaking upcoming film sardar sai...
98,"Was obsessed with Sridevi, I used to imitate h...",Actress Shefali Shah said in an interview that...,entertainment,actress shefali shah said interview earlier ob...
