In [11]:
# TF = term frequency, IDF = inverse document frequency
import re
import nltk

In [12]:
# Sample text for the TF-IDF demo. It is split into sentences with
# nltk.sent_tokenize, and each sentence is then cleaned and used as one document.
paragraph = """The Department of Atomic Energy and DRDO had this tremendous partnership in the recent nuclear tests, on May 11 and 13. This was the third bliss. The joy of participating with my team in these nuclear tests and proving to the world that India can make it, that we are no longer a developing nation but one of them. It made me feel very proud as an Indian. The fact that we have now developed for Agni a re-entry structure, for which we have developed this new material. A very light material called carbon-carbon.

One day an orthopedic surgeon from Nizam Institute of Medical Sciences visited my laboratory. He lifted the material and found it so light that he took me to his hospital and showed me his patients. There were these little girls and boys with heavy metallic calipers weighing over three kilograms each, dragging their feet around.

He said to me: Please remove the pain of my patients. In three weeks, we made these floor reaction orthosis 300-gram calipers and took them to the orthopedic center. The children didn’t believe their eyes. From dragging around a three kg load on their legs, they could now move around! Their parents had tears in their eyes"""

In [13]:
# cleaning the texts
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [14]:
# Break the paragraph into sentences; every sentence is later cleaned and
# treated as a separate document.
sentences = nltk.sent_tokenize(paragraph)

# Word normalizers. The cleaning loop uses the lemmatizer (`wordnet`);
# the Porter stemmer (`ps`) is created here but not used by the visible cells.
ps, wordnet = PorterStemmer(), WordNetLemmatizer()

In [15]:
len(sentences)

14

In [22]:
# Build one cleaned "document" per sentence: keep letters only, lower-case,
# drop English stop words, and lemmatize the remaining tokens.

# Build the stop-word set once, outside the loop: calling stopwords.words()
# and set() for every token is loop-invariant work.
stop_words = set(stopwords.words('english'))

corpus = []
for sentence in sentences:
    # Every character that is not an ASCII letter becomes a space, so digits
    # and punctuation act as token separators before splitting on whitespace.
    tokens = re.sub("[^a-zA-Z]", " ", sentence).lower().split()
    # Filter stop words first, then lemmatize what is left and re-join the
    # tokens into a single space-separated string.
    review = " ".join(
        wordnet.lemmatize(word) for word in tokens if word not in stop_words
    )
    corpus.append(review)

In [23]:
corpus

['department atomic energy drdo tremendous partnership recent nuclear test may',
 'third bliss',
 'joy participating team nuclear test proving world india make longer developing nation one',
 'made feel proud indian',
 'fact developed agni entry structure developed new material',
 'light material called carbon carbon',
 'one day orthopedic surgeon nizam institute medical science visited laboratory',
 'lifted material found light took hospital showed patient',
 'little girl boy heavy metallic caliper weighing three kilogram dragging foot around',
 'said please remove pain patient',
 'three week made floor reaction orthosis gram caliper took orthopedic center',
 'child believe eye',
 'dragging around three kg load leg could move around',
 'parent tear eye']

In [24]:
# Build the TF-IDF representation of the cleaned corpus
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()

# Fit the vectorizer on the corpus and transform it in one call, then expand
# the result into a dense NumPy array (one row per entry of `corpus`).
tfidf_sparse = cv.fit_transform(corpus)
X = tfidf_sparse.toarray()

In [25]:
X

array([[0.        , 0.        , 0.32447314, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.28574533],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.56645127, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [28]:
import pandas as pd
import numpy as np
# Wrap the TF-IDF array in a DataFrame for tabular display. No column names
# are passed, so the columns carry default integer labels.
df = pd.DataFrame(X)
# Show every column when a DataFrame is displayed (None removes the limit).
# Call the documented top-level pd.set_option directly instead of reaching it
# through the undocumented `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [29]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83
0,0.0,0.0,0.324473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324473,0.0,0.0,0.0,0.324473,0.324473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324473,0.0,0.0,0.0,0.0,0.0,0.0,0.280836,0.0,0.0,0.0,0.0,0.0,0.0,0.324473,0.0,0.0,0.0,0.0,0.0,0.324473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.280836,0.0,0.0,0.0,0.324473,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285745,0.0,0.0,0.285745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285745,0.0,0.285745,0.0,0.0,0.0,0.0,0.0,0.285745,0.0,0.0,0.247316,0.247316,0.0,0.0,0.0,0.0,0.285745,0.0,0.0,0.0,0.0,0.285745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285745,0.0,0.247316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285745
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.516459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.516459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.516459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.322866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.645731,0.0,0.0,0.0,0.0,0.322866,0.0,0.322866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.248637,0.0,0.0,0.0,0.0,0.0,0.322866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.322866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.397083,0.794166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.343681,0.0,0.0,0.0,0.0,0.0,0.305791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324473,0.0,0.0,0.0,0.324473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324473,0.0,0.0,0.0,0.0,0.324473,0.0,0.280836,0.280836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324473,0.0,0.0,0.324473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324473,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382349,0.0,0.0,0.0,0.382349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382349,0.330928,0.0,0.0,0.0,0.0,0.0,0.294444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.330928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.330928,0.0,0.0,0.0,0.0,0.0
8,0.0,0.262876,0.0,0.0,0.0,0.303723,0.262876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.262876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.303723,0.0,0.303723,0.0,0.303723,0.0,0.0,0.0,0.0,0.0,0.0,0.303723,0.0,0.0,0.0,0.0,0.303723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.303723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233895,0.0,0.0,0.0,0.0,0.303723,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.458874,0.0,0.0,0.0,0.397162,0.458874,0.0,0.0,0.0,0.0,0.458874,0.458874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
