[Reference](https://texthero.org)

# Import packages

In [1]:
!pip install texthero

Collecting texthero
  Downloading https://files.pythonhosted.org/packages/1f/5a/a9d33b799fe53011de79d140ad6d86c440a2da1ae8a7b24e851ee2f8bde8/texthero-1.0.9-py3-none-any.whl
Collecting unidecode>=1.1.1
  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
     |████████████████████████████████| 245kB 4.6MB/s 
Collecting nltk>=3.3
  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
     |████████████████████████████████| 1.4MB 14.5MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... done
  Created wheel for nltk: filename=nltk-3.5-cp36-none-any.whl size=1434674 sha256=f43913ebb029fb1f3b7e2f1101f44aaf611326b87fb3b7bb0f05ac81442c2abb
  Stored in directory: /root/.cache/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f86023639a

In [2]:
import texthero as hero
import pandas as pd

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Load dataset

In [3]:
# Read the BBC Sport CSV hosted in the texthero GitHub repository and
# preview its first two rows.
BBC_SPORT_CSV = "https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv"
df = pd.read_csv(BBC_SPORT_CSV)
df.head(n=2)

Unnamed: 0,text,topic
0,Claxton hunting first major medal\n\nBritish h...,athletics
1,O'Sullivan could run in Worlds\n\nSonia O'Sull...,athletics


# Preprocess

In [4]:
# Overwrite the text column with the output of texthero's clean pipeline;
# every later cell reads the cleaned text from this same column.
df["text"] = df["text"].pipe(hero.clean)

In [5]:
# Compute TF-IDF vectors for the text column (max_features=100), store them
# in a new column, and preview that column next to the topic label.
tfidf_vectors = hero.tfidf(df["text"], max_features=100)
df["tfidf"] = tfidf_vectors
df.loc[:, ["tfidf", "topic"]].head(2)

Unnamed: 0,tfidf,topic
0,"[0.0, 0.13194458247285848, 0.0, 0.0, 0.0, 0.0,...",athletics
1,"[0.0, 0.13056235989725676, 0.0, 0.205187581391...",athletics


# Reduce dimension and visualize the vector space

In [6]:
# Reduce the TF-IDF vectors with PCA, then draw a scatter plot of the
# reduced vectors coloured by topic.
df["pca"] = df["tfidf"].pipe(hero.pca)
scatter_options = {
    "col": "pca",
    "color": "topic",
    "title": "PCA BBC Sport news",
}
hero.scatterplot(df, **scatter_options)

# Named entities

In [10]:
# Preview the first five rows of the frame with every column built so far.
df.head(n=5)

Unnamed: 0,text,topic,tfidf,pca
0,claxton hunting first major medal british hurd...,athletics,"[0.0, 0.13194458247285848, 0.0, 0.0, 0.0, 0.0,...","[0.02186515313417704, -0.2552300506397517]"
1,sullivan could run worlds sonia sullivan indic...,athletics,"[0.0, 0.13056235989725676, 0.0, 0.205187581391...","[-0.10789848949819328, 0.013164367341919366]"
2,greene sets sights world title maurice greene ...,athletics,"[0.0, 0.2125838971766965, 0.0, 0.0, 0.0, 0.0, ...","[0.04754377364301302, -0.24457204171653765]"
3,iaaf launches fight drugs iaaf athletics world...,athletics,"[0.0, 0.2736433345684428, 0.0, 0.0, 0.26851134...","[0.025504009477301, -0.22789503587953086]"
4,dibaba breaks 000m world record ethiopia tirun...,athletics,"[0.0, 0.0, 0.0, 0.0, 0.152923593668543, 0.0, 0...","[-0.018304896895309862, -0.2151696306981936]"


In [13]:
# Extract named entities from the text column into a new column.
# NOTE(review): df["text"] was overwritten with the cleaned text in the
# Preprocess section, so entities are extracted from lowercased text. This
# presumably hurts recognition quality; consider running it on the raw text.
df["named_entities"] = df["text"].pipe(hero.named_entities)

In [14]:
# Show the extracted entities alongside the topic label for the first two rows.
preview_columns = ["named_entities", "topic"]
df[preview_columns].head(2)

Unnamed: 0,named_entities,topic
0,"[(claxton, ORG, 0, 7), (first, ORDINAL, 16, 21...",athletics
1,"[(sullivan, ORG, 0, 8), (sonia sullivan, PERSO...",athletics


# Show top words

In [9]:
# Number of entries to keep from the word-count series.
NUM_TOP_WORDS = 5

# Count words across the text column and show the first NUM_TOP_WORDS entries.
word_counts = hero.top_words(df["text"])
word_counts.iloc[:NUM_TOP_WORDS]

said       1338
first       790
england     749
game        681
one         671
Name: text, dtype: int64