# Term Frequency and Inverse Document Frequency

In [1]:
!pip install nltk



## Scraping Wikipedia

In [2]:
!pip install requests beautifulsoup4 lxml pandas



In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import lxml


In [5]:
url = "https://en.wikipedia.org/wiki/Football"

headers = {
    "User-Agent": "Mozilla/5.0"
}

response = requests.get(url, headers=headers)

soup = BeautifulSoup(response.text, "lxml")





In [6]:
content = soup.find_all('p')
with open('robot.txt', "w") as f:
  for i in content:
    f.write(i.text)

In [7]:
file = open("robot.txt", "r")
print(file.read())


Football is a family of team sports in which the object is to get the ball over a goal line, into a goal, or between goalposts using merely the body (by carrying, throwing, or kicking).[1][2][3]
Unqualified, the word football generally means the form of football that is the most popular where the word is used. Sports commonly called football include association football (known as soccer in Australia, Canada, South Africa, the United States, and sometimes in Ireland and New Zealand); Australian rules football; Gaelic football; gridiron football (specifically American football, arena football, or Canadian football); International rules football; rugby league football; and rugby union football.[4] These various forms of football share, to varying degrees, common origins and are known as "football codes".
There are a number of references to traditional, ancient, or prehistoric ball games played in many different parts of the world.[5][6][7] Contemporary codes of football can be traced bac

## Applying Tokenization on text using NLTK

In [9]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [10]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
file = open("robot.txt", "r")
content = file.read()
file.close()

In [12]:
sentence = nltk.sent_tokenize(content)

In [13]:
type(sentence)

list

In [14]:
sentence[0]

'\nFootball is a family of team sports in which the object is to get the ball over a goal line, into a goal, or between goalposts using merely the body (by carrying, throwing, or kicking).'

## Grammer

In [15]:
stemmer = PorterStemmer()


In [17]:
!pip install nltk



In [21]:
from nltk.stem import WordNetLemmatizer

In [22]:
lammatizer = WordNetLemmatizer()

In [23]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [24]:
len(sentence)

345

In [25]:
import re
corpus = []
for i in range(len(sentence)):
  review = re.sub("[^a-zA-Z]", " ", sentence[i])
  review = review.lower()
  corpus.append(review)

In [26]:
corpus

[' football is a family of team sports in which the object is to get the ball over a goal line  into a goal  or between goalposts using merely the body  by carrying  throwing  or kicking  ',
 '          unqualified  the word football generally means the form of football that is the most popular where the word is used ',
 'sports commonly called football include association football  known as soccer in australia  canada  south africa  the united states  and sometimes in ireland and new zealand   australian rules football  gaelic football  gridiron football  specifically american football  arena football  or canadian football   international rules football  rugby league football  and rugby union football ',
 '    these various forms of football share  to varying degrees  common origins and are known as  football codes  ',
 'there are a number of references to traditional  ancient  or prehistoric ball games played in many different parts of the world ',
 '          contemporary codes of f

In [27]:
for i in corpus:
  words = nltk.word_tokenize(i)
  for word in words:
    if word not in set(stopwords.words("english")):
      print(stemmer.stem(word))

footbal
famili
team
sport
object
get
ball
goal
line
goal
goalpost
use
mere
bodi
carri
throw
kick
unqualifi
word
footbal
gener
mean
form
footbal
popular
word
use
sport
commonli
call
footbal
includ
associ
footbal
known
soccer
australia
canada
south
africa
unit
state
sometim
ireland
new
zealand
australian
rule
footbal
gaelic
footbal
gridiron
footbal
specif
american
footbal
arena
footbal
canadian
footbal
intern
rule
footbal
rugbi
leagu
footbal
rugbi
union
footbal
variou
form
footbal
share
vari
degre
common
origin
known
footbal
code
number
refer
tradit
ancient
prehistor
ball
game
play
mani
differ
part
world
contemporari
code
footbal
trace
back
codif
game
english
public
school
th
centuri
outgrowth
mediev
footbal
expans
cultur
power
british
empir
allow
rule
footbal
spread
area
british
influenc
outsid
directli
control
empir
end
th
centuri
distinct
region
code
alreadi
develop
gaelic
footbal
exampl
deliber
incorpor
rule
local
tradit
footbal
game
maintain
heritag
footbal
leagu
found
england
becom

## Feature Extraction

In [28]:
processed_corpus = []
for sentence_text in corpus:
    words = nltk.word_tokenize(sentence_text)
    filtered_words = []
    for word in words:
        if word not in stopwords.words('english'):
            filtered_words.append(stemmer.stem(word))
    processed_corpus.append(" ".join(filtered_words))

corpus = processed_corpus

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True, ngram_range=(2,3))

x = cv.fit_transform(corpus)

In [38]:
cv.vocabulary_

{'footbal famili': 2636,
 'famili team': 2327,
 'team sport': 6837,
 'sport object': 6593,
 'object get': 4730,
 'get ball': 3195,
 'ball goal': 543,
 'goal line': 3238,
 'line goal': 4114,
 'goal goalpost': 3236,
 'goalpost use': 3266,
 'use mere': 7319,
 'mere bodi': 4435,
 'bodi carri': 801,
 'carri throw': 1005,
 'throw kick': 6953,
 'footbal famili team': 2637,
 'famili team sport': 2328,
 'team sport object': 6838,
 'sport object get': 6594,
 'object get ball': 4731,
 'get ball goal': 3196,
 'ball goal line': 546,
 'goal line goal': 3239,
 'line goal goalpost': 4115,
 'goal goalpost use': 3237,
 'goalpost use mere': 3267,
 'use mere bodi': 7320,
 'mere bodi carri': 4436,
 'bodi carri throw': 802,
 'carri throw kick': 1006,
 'unqualifi word': 7275,
 'word footbal': 7601,
 'footbal gener': 2659,
 'gener mean': 3183,
 'mean form': 4354,
 'form footbal': 2865,
 'footbal popular': 2754,
 'popular word': 5376,
 'word use': 7612,
 'unqualifi word footbal': 7276,
 'word footbal gener': 7

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True, ngram_range=(3,3))

x = cv.fit_transform(corpus)

In [40]:
cv.vocabulary_

{'footbal famili team': 1343,
 'famili team sport': 1176,
 'team sport object': 3536,
 'sport object get': 3416,
 'object get ball': 2434,
 'get ball goal': 1664,
 'ball goal line': 284,
 'goal line goal': 1686,
 'line goal goalpost': 2123,
 'goal goalpost use': 1685,
 'goalpost use mere': 1701,
 'use mere bodi': 3788,
 'mere bodi carri': 2282,
 'bodi carri throw': 412,
 'carri throw kick': 515,
 'unqualifi word footbal': 3766,
 'word footbal gener': 3928,
 'footbal gener mean': 1357,
 'gener mean form': 1658,
 'mean form footbal': 2243,
 'form footbal popular': 1485,
 'footbal popular word': 1418,
 'popular word use': 2769,
 'sport commonli call': 3404,
 'commonli call footbal': 740,
 'call footbal includ': 471,
 'footbal includ associ': 1369,
 'includ associ footbal': 1862,
 'associ footbal known': 195,
 'footbal known soccer': 1377,
 'known soccer australia': 2026,
 'soccer australia canada': 3361,
 'australia canada south': 226,
 'canada south africa': 494,
 'south africa unit': 33

# TF-IDF

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(ngram_range=(3,3), max_features=3)

x = cv.fit_transform(corpus)

In [54]:
corpus[0]

'footbal famili team sport object get ball goal line goal goalpost use mere bodi carri throw kick'

In [55]:
x[0].toarray()

array([[0., 0., 0.]])