In [75]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from string import punctuation
from nltk.corpus import wordnet

`nltk.corpus` provides the stop-word list and the `wordnet` POS constants used together with the `nltk.pos_tag` function <br>
`nltk.stem` provides `PorterStemmer` and `WordNetLemmatizer`

```python
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
```

In [3]:
# Sample paragraph used as the input for every preprocessing step in this notebook.
# NOTE: "manipulator.Of" has no space after the period, so the tokenizers below keep
# it as one token and do not split the sentence at that point.
text = '''Harshad Shantilal Mehta (29 July 1954 — 31 December 2001) was an Indian stockbroker. Mehta's involvement in the 1992 Indian securities scam made him infamous as a market manipulator.Of the 27 criminal charges brought against Mehta, he was only convicted of four, before his death (by sudden heart attack) at age 47 in 2001. It was alleged that Mehta engaged in a massive stock manipulation scheme financed by worthless bank receipts, which his firm brokered for "ready forward" transactions between banks. Mehta was convicted by the Bombay High Court and the Supreme Court of India for his part in a financial scandal valued at ₹100 billion (US$1.3 billion) which took place on the Bombay Stock Exchange (BSE). The scandal exposed the loopholes in the Indian banking system and the Bombay Stock Exchange (BSE) transaction system, and consequently the SEBI introduced new rules to cover those loopholes. He was on trial for 9 years, until he died at the end of 2001 from heart attack.'''

## Tokenizing

In [4]:
# Split the paragraph into word-level tokens; punctuation marks become separate tokens.
tokens = nltk.tokenize.word_tokenize(text)

In [5]:
tokens[:25]

['Harshad',
 'Shantilal',
 'Mehta',
 '(',
 '29',
 'July',
 '1954',
 '—',
 '31',
 'December',
 '2001',
 ')',
 'was',
 'an',
 'Indian',
 'stockbroker',
 '.',
 'Mehta',
 "'s",
 'involvement',
 'in',
 'the',
 '1992',
 'Indian',
 'securities']

In [6]:
# Split the paragraph into a list of sentences.
sent_tokens = nltk.tokenize.sent_tokenize(text)

In [7]:
sent_tokens[:7]

['Harshad Shantilal Mehta (29 July 1954 — 31 December 2001) was an Indian stockbroker.',
 "Mehta's involvement in the 1992 Indian securities scam made him infamous as a market manipulator.Of the 27 criminal charges brought against Mehta, he was only convicted of four, before his death (by sudden heart attack) at age 47 in 2001.",
 'It was alleged that Mehta engaged in a massive stock manipulation scheme financed by worthless bank receipts, which his firm brokered for "ready forward" transactions between banks.',
 'Mehta was convicted by the Bombay High Court and the Supreme Court of India for his part in a financial scandal valued at ₹100 billion (US$1.3 billion) which took place on the Bombay Stock Exchange (BSE).',
 'The scandal exposed the loopholes in the Indian banking system and the Bombay Stock Exchange (BSE) transaction system, and consequently the SEBI introduced new rules to cover those loopholes.',
 'He was on trial for 9 years, until he died at the end of 2001 from heart at

In [8]:
# Normalize case so that e.g. 'Indian' and 'indian' count as the same token.
lowercase_tokens = list(map(str.lower, tokens))

In [9]:
lowercase_tokens[:25]

['harshad',
 'shantilal',
 'mehta',
 '(',
 '29',
 'july',
 '1954',
 '—',
 '31',
 'december',
 '2001',
 ')',
 'was',
 'an',
 'indian',
 'stockbroker',
 '.',
 'mehta',
 "'s",
 'involvement',
 'in',
 'the',
 '1992',
 'indian',
 'securities']

In [10]:
# Word bigrams over the whitespace-split text (punctuation stays attached to words).
# nltk.ngrams returns a one-shot iterator; materialize it as a list so `bigrams`
# can be displayed or reused more than once without silently becoming empty.
bigrams = list(nltk.ngrams(text.split(), 2))

In [11]:
list(bigrams)

[('Harshad', 'Shantilal'),
 ('Shantilal', 'Mehta'),
 ('Mehta', '(29'),
 ('(29', 'July'),
 ('July', '1954'),
 ('1954', '—'),
 ('—', '31'),
 ('31', 'December'),
 ('December', '2001)'),
 ('2001)', 'was'),
 ('was', 'an'),
 ('an', 'Indian'),
 ('Indian', 'stockbroker.'),
 ('stockbroker.', "Mehta's"),
 ("Mehta's", 'involvement'),
 ('involvement', 'in'),
 ('in', 'the'),
 ('the', '1992'),
 ('1992', 'Indian'),
 ('Indian', 'securities'),
 ('securities', 'scam'),
 ('scam', 'made'),
 ('made', 'him'),
 ('him', 'infamous'),
 ('infamous', 'as'),
 ('as', 'a'),
 ('a', 'market'),
 ('market', 'manipulator.Of'),
 ('manipulator.Of', 'the'),
 ('the', '27'),
 ('27', 'criminal'),
 ('criminal', 'charges'),
 ('charges', 'brought'),
 ('brought', 'against'),
 ('against', 'Mehta,'),
 ('Mehta,', 'he'),
 ('he', 'was'),
 ('was', 'only'),
 ('only', 'convicted'),
 ('convicted', 'of'),
 ('of', 'four,'),
 ('four,', 'before'),
 ('before', 'his'),
 ('his', 'death'),
 ('death', '(by'),
 ('(by', 'sudden'),
 ('sudden', 'heart'

## Removing stop words

In [12]:
# English stop-word list from NLTK's 'stopwords' corpus (see nltk.download('stopwords') above).
stop_words = nltk.corpus.stopwords.words('english')

In [13]:
# Individual ASCII punctuation characters taken from string.punctuation.
punct = [*punctuation]

In [14]:
# Drop stop words and tokens that are a single ASCII punctuation character.
# Both exclusion lists are merged into one set so each membership test is O(1)
# instead of scanning two lists for every token; the result is unchanged.
# NOTE: tokens not present in string.punctuation (e.g. '—', '``', "''") are kept.
excluded = set(stop_words) | set(punct)
cleaned_tokens = [tok for tok in lowercase_tokens if tok not in excluded]

In [15]:
cleaned_tokens[:25]

['harshad',
 'shantilal',
 'mehta',
 '29',
 'july',
 '1954',
 '—',
 '31',
 'december',
 '2001',
 'indian',
 'stockbroker',
 'mehta',
 "'s",
 'involvement',
 '1992',
 'indian',
 'securities',
 'scam',
 'made',
 'infamous',
 'market',
 'manipulator.of',
 '27',
 'criminal']

## Stemming

In [16]:
# Porter stemmer instance reused by the stemming examples below.
ps = nltk.stem.PorterStemmer()

In [17]:
ps.stem('jumping')

'jump'

In [18]:
ps.stem('lately')

'late'

In [19]:
ps.stem('assess')

'assess'

In [20]:
ps.stem('ran')

'ran'

In [21]:
# Stem every cleaned token with the Porter stemmer.
stems = list(map(ps.stem, cleaned_tokens))

In [22]:
stems[:25]

['harshad',
 'shantil',
 'mehta',
 '29',
 'juli',
 '1954',
 '—',
 '31',
 'decemb',
 '2001',
 'indian',
 'stockbrok',
 'mehta',
 "'s",
 'involv',
 '1992',
 'indian',
 'secur',
 'scam',
 'made',
 'infam',
 'market',
 'manipulator.of',
 '27',
 'crimin']

## Lemmatization

In [23]:
# WordNet-based lemmatizer instance, accessed through nltk.stem like the stemmer above.
lmt = nltk.stem.WordNetLemmatizer()

In [24]:
lmt.lemmatize('starting','v')

'start'

In [25]:
lmt.lemmatize('better','a')

'good'

In [26]:
lmt.lemmatize('ran','v')

'run'

In [27]:
# Lemmatize every cleaned token without a POS hint (the lemmatizer's default POS is used).
lemma = list(map(lmt.lemmatize, cleaned_tokens))

In [28]:
lemma[:25]

['harshad',
 'shantilal',
 'mehta',
 '29',
 'july',
 '1954',
 '—',
 '31',
 'december',
 '2001',
 'indian',
 'stockbroker',
 'mehta',
 "'s",
 'involvement',
 '1992',
 'indian',
 'security',
 'scam',
 'made',
 'infamous',
 'market',
 'manipulator.of',
 '27',
 'criminal']

### POS tagging

In [30]:
# Tag each cleaned token with its part of speech; yields (token, tag) pairs.
# NOTE(review): the tagger runs on lowercased, stop-word-filtered tokens, so it sees
# neither capitalization nor the original sentence context (e.g. 'mehta' is tagged
# as a verb below) — consider tagging the original `tokens` and filtering afterwards.
tagged = nltk.pos_tag(cleaned_tokens)

In [37]:
def pos_tag_simplified(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    The mapping is decided by the tag's leading letter:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.

    Args:
        tag: POS tag string, such as the second element of an `nltk.pos_tag` pair.

    Returns:
        The corresponding `wordnet` constant, or None when the tag starts with
        none of the four letters above.
    """
    # Prefixes are checked in the same order as the categories listed in the docstring.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

In [41]:
# Replace each tagger tag with its WordNet POS constant (or None when there is no match).
simple_tagged = [(word, pos_tag_simplified(tag)) for word, tag in tagged]

In [44]:
simple_tagged[:5]

[('harshad', 'n'),
 ('shantilal', 'n'),
 ('mehta', 'v'),
 ('29', None),
 ('july', 'n')]

In [48]:
# POS-aware lemmatization: pass the WordNet POS when one was found; tokens whose tag
# mapped to None are lemmatized with the lemmatizer's default POS instead.
# `is not None` is the idiomatic identity check (rather than `!= None`).
lema = [
    lmt.lemmatize(word, pos) if pos is not None else lmt.lemmatize(word)
    for word, pos in simple_tagged
]

In [49]:
lema

['harshad',
 'shantilal',
 'mehta',
 '29',
 'july',
 '1954',
 '—',
 '31',
 'december',
 '2001',
 'indian',
 'stockbroker',
 'mehta',
 "'s",
 'involvement',
 '1992',
 'indian',
 'security',
 'scam',
 'make',
 'infamous',
 'market',
 'manipulator.of',
 '27',
 'criminal',
 'charge',
 'bring',
 'mehta',
 'convict',
 'four',
 'death',
 'sudden',
 'heart',
 'attack',
 'age',
 '47',
 '2001',
 'allege',
 'mehta',
 'engage',
 'massive',
 'stock',
 'manipulation',
 'scheme',
 'finance',
 'worthless',
 'bank',
 'receipt',
 'firm',
 'broker',
 '``',
 'ready',
 'forward',
 "''",
 'transaction',
 'bank',
 'mehta',
 'convict',
 'bombay',
 'high',
 'court',
 'supreme',
 'court',
 'india',
 'part',
 'financial',
 'scandal',
 'value',
 '₹100',
 'billion',
 'u',
 '1.3',
 'billion',
 'take',
 'place',
 'bombay',
 'stock',
 'exchange',
 'bse',
 'scandal',
 'expose',
 'loophole',
 'indian',
 'banking',
 'system',
 'bombay',
 'stock',
 'exchange',
 'bse',
 'transaction',
 'system',
 'consequently',
 'sebi',


# Bag of Words

In [51]:
# Two toy documents used as the corpus for the vectorizer examples.
sentence1 = 'This is a good job. I will not miss it for anything'
sentence2 = 'This is not good at all'

In [52]:
# Variant: unigram counts with the built-in English stop-word list removed.
# Use ngram_range=(2, 2) for bigrams only, or (1, 2) for unigrams plus bigrams.
c_vec = CountVectorizer(stop_words='english', ngram_range=(1, 1))

In [67]:
# Variant: unigram counts, stop words kept.
# Use ngram_range=(2, 2) for bigrams only, or (1, 2) for unigrams plus bigrams.
c_vec = CountVectorizer(ngram_range=(1, 1))

In [63]:
# Variant: bigram counts only (pairs of adjacent words), stop words kept.
c_vec = CountVectorizer(ngram_range=(2, 2))

In [71]:
# Variant: unigrams and bigrams together, stop words kept; the vocabulary then
# contains both single words and adjacent word pairs.
c_vec = CountVectorizer(ngram_range=(1, 2))

## Transform

In [72]:
# Learn the vocabulary from both sentences and encode them as a document-term count matrix.
c_data = c_vec.fit_transform([sentence1,sentence2])

## Creating data frame

In [73]:
# One row per sentence, one column per vocabulary term, values are term counts.
cv_df = pd.DataFrame(
    data=c_data.toarray(),
    columns=c_vec.get_feature_names_out(),
)

In [74]:
cv_df

Unnamed: 0,all,anything,at,at all,for,for anything,good,good at,good job,is,...,job will,miss,miss it,not,not good,not miss,this,this is,will,will not
0,0,1,0,0,1,1,1,0,1,1,...,1,1,1,1,0,1,1,1,1,1
1,1,0,1,1,0,0,1,1,0,1,...,0,0,0,1,1,0,1,1,0,0


# TF-IDF

In [76]:
# Same two toy sentences as in the Bag of Words section, repeated here as the TF-IDF corpus.
sentence1 = 'This is a good job. I will not miss it for anything'
sentence2 = 'This is not good at all'

In [77]:
# TF-IDF weighting over unigrams; use ngram_range=(2, 2) to use only bigrams.
tf_idf_vec = TfidfVectorizer(ngram_range=(1, 1), use_idf=True)

In [78]:
# Fit the TF-IDF vocabulary on both sentences and compute their weighted term matrix.
tf_idf_data = tf_idf_vec.fit_transform([sentence1,sentence2])

In [79]:
# One row per sentence, one column per vocabulary term, values are TF-IDF weights.
tf_idf_df = pd.DataFrame(
    data=tf_idf_data.toarray(),
    columns=tf_idf_vec.get_feature_names_out(),
)

In [80]:
tf_idf_df

Unnamed: 0,all,anything,at,for,good,is,it,job,miss,not,this,will
0,0.0,0.353003,0.0,0.353003,0.251164,0.251164,0.353003,0.353003,0.353003,0.251164,0.251164,0.353003
1,0.498446,0.0,0.498446,0.0,0.354649,0.354649,0.0,0.0,0.0,0.354649,0.354649,0.0
