In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WhitespaceTokenizer
import re
from nltk.probability import FreqDist

In [None]:
# Fetch the NLTK data packages; 'stopwords' backs the stopwords.words() lookup
# used during preprocessing.
# NOTE(review): tokenization in the visible cells uses WhitespaceTokenizer, so
# it is unclear whether 'punkt' is actually required here — confirm.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Sample text analysed throughout the notebook: a biographical paragraph about
# Major Dhyan Chand. It contains bracketed numeric citation markers such as
# [4][5][6], which the preprocessing functions strip out.
para = """ Major Dhyan Chand (29 August 1905 – 3 December 1979) was an Indian field hockey player widely regarded as one of the greatest field hockey player in history.
[4][5][6] He was known for his extraordinary ball control and goal-scoring feats, in addition to earning three Olympic gold medals, in 1928, 1932 and 1936, during an era where India dominated field hockey.
His influence extended beyond these victories, as India won the field hockey event in seven out of eight Olympics from 1928 to 1964.
Known as The Wizard[8][9] or The Magician[10][11] of hockey for his superb ball control, Chand played internationally from 1926 to 1949 where he scored 570 goals in 185 matches according to his autobiography, Goal[12][13] and over 1000 goals in his entire domestic and international career.
[14] BBC called him the "hockey's equivalent of Muhammad Ali".[14] The Government of India awarded Chand India's third highest civilian honour of Padma Bhushan in 1956.
[15] His birthday, 29 August, is celebrated as National Sports Day in India every year. India's highest sporting honour Major Dhyan Chand Khel Ratna Award is named after him"""

In [None]:
# Shared NLP helpers. `tokenizer` splits on whitespace only, so punctuation
# stays attached to the neighbouring word (e.g. 'history.', '(29').
# NOTE(review): `stem` is not referenced in any visible cell — confirm it is
# still needed.
stem = PorterStemmer()
tokenizer = WhitespaceTokenizer()

In [None]:
def preprocess(x, language='english'):
  """Lowercase ``x``, strip citations and the subject's name, and tokenize.

  Steps, in order: lowercase the text; delete bracketed numeric citation
  markers such as ``[12]``; delete every occurrence of the substrings
  ``chand``, ``dhyan`` and ``major``; split on whitespace with the
  module-level ``tokenizer``; drop tokens that are stopwords.

  Args:
    x: Raw text to clean.
    language: Name of the NLTK stopword list to filter with. Defaults to
      ``'english'``.

  Returns:
    List of the remaining tokens in their original order. Tokens are split
    on whitespace only, so punctuation attached to a word is kept.
  """
  x = x.lower()
  # NOTE: the former `re.sub('^[a-z0-9]', '', x)` step is deliberately absent.
  # Being anchored at the start of the string, it could only ever delete the
  # first character of the text.
  # Raw string so `\[` and `\d` reach the regex engine instead of being
  # treated as (invalid) string escape sequences.
  x = re.sub(r'\[\d+\]', '', x)
  for name_part in ('chand', 'dhyan', 'major'):
    x = x.replace(name_part, '')
  tokens = tokenizer.tokenize(x)
  # Load the stopword list once, as a set, instead of re-reading it for every
  # token. Restricting it to one language avoids dropping English content
  # words that are stopwords elsewhere (e.g. 'era').
  stop_set = set(stopwords.words(language))
  return [word for word in tokens if word not in stop_set]

In [None]:
# Cleaned, stopword-filtered tokens of the sample paragraph.
words = preprocess(para)

In [None]:
# Inspect the cleaned tokens.
words

['(29',
 'august',
 '1905',
 '–',
 '3',
 'december',
 '1979)',
 'indian',
 'field',
 'hockey',
 'player',
 'widely',
 'regarded',
 'greatest',
 'field',
 'hockey',
 'player',
 'history.',
 'known',
 'extraordinary',
 'ball',
 'control',
 'goal-scoring',
 'feats,',
 'addition',
 'earning',
 'three',
 'olympic',
 'gold',
 'medals,',
 '1928,',
 '1932',
 '1936,',
 'india',
 'dominated',
 'field',
 'hockey.',
 'influence',
 'extended',
 'beyond',
 'victories,',
 'india',
 'field',
 'hockey',
 'event',
 'seven',
 'eight',
 'olympics',
 '1928',
 '1964.',
 'known',
 'wizard',
 'magician',
 'hockey',
 'superb',
 'ball',
 'control,',
 'played',
 'internationally',
 '1926',
 '1949',
 'scored',
 '570',
 'goals',
 '185',
 'matches',
 'according',
 'autobiography,',
 'goal',
 '1000',
 'goals',
 'entire',
 'domestic',
 'international',
 'career.',
 'bbc',
 'called',
 '"hockey\'s',
 'equivalent',
 'muhammad',
 'ali".',
 'government',
 'india',
 'awarded',
 "india's",
 'third',
 'highest',
 'civilian',

In [None]:
# Count token occurrences and show the ten most frequent.
# Tokens keep their attached punctuation, so e.g. 'hockey' and 'hockey.' are
# counted as different entries.
fdist = FreqDist(words)
fdist.most_common(10)

[('field', 4),
 ('hockey', 4),
 ('india', 4),
 ('player', 2),
 ('known', 2),
 ('ball', 2),
 ('goals', 2),
 ("india's", 2),
 ('highest', 2),
 ('honour', 2)]

In [None]:
# Whitespace-tokenize the raw paragraph (no lowercasing or cleaning) as input
# for the n-gram cells.
n_gram_tokens = tokenizer.tokenize(para)
n_gram_tokens

['Major',
 'Dhyan',
 'Chand',
 '(29',
 'August',
 '1905',
 '–',
 '3',
 'December',
 '1979)',
 'was',
 'an',
 'Indian',
 'field',
 'hockey',
 'player',
 'widely',
 'regarded',
 'as',
 'one',
 'of',
 'the',
 'greatest',
 'field',
 'hockey',
 'player',
 'in',
 'history.',
 '[4][5][6]',
 'He',
 'was',
 'known',
 'for',
 'his',
 'extraordinary',
 'ball',
 'control',
 'and',
 'goal-scoring',
 'feats,',
 'in',
 'addition',
 'to',
 'earning',
 'three',
 'Olympic',
 'gold',
 'medals,',
 'in',
 '1928,',
 '1932',
 'and',
 '1936,',
 'during',
 'an',
 'era',
 'where',
 'India',
 'dominated',
 'field',
 'hockey.',
 'His',
 'influence',
 'extended',
 'beyond',
 'these',
 'victories,',
 'as',
 'India',
 'won',
 'the',
 'field',
 'hockey',
 'event',
 'in',
 'seven',
 'out',
 'of',
 'eight',
 'Olympics',
 'from',
 '1928',
 'to',
 '1964.',
 'Known',
 'as',
 'The',
 'Wizard[8][9]',
 'or',
 'The',
 'Magician[10][11]',
 'of',
 'hockey',
 'for',
 'his',
 'superb',
 'ball',
 'control,',
 'Chand',
 'played',
 

In [None]:
def trigram(tokens):
  """Return every run of three consecutive items in ``tokens``.

  Args:
    tokens: Sequence supporting ``len`` and slicing, e.g. a list of words.

  Returns:
    List of length-3 slices of ``tokens`` in order of appearance. The list is
    empty when ``tokens`` has fewer than three items.
  """
  window = 3
  # Returning the trigrams (rather than printing them) lets callers store and
  # reuse the result; a notebook cell can still display it as its last value.
  return [tokens[i:i + window] for i in range(len(tokens) - window + 1)]

In [None]:
# Run the trigram helper over the raw whitespace tokens of the paragraph.
n_gram = trigram(n_gram_tokens)
n_gram

['Major', 'Dhyan', 'Chand']
['Dhyan', 'Chand', '(29']
['Chand', '(29', 'August']
['(29', 'August', '1905']
['August', '1905', '–']
['1905', '–', '3']
['–', '3', 'December']
['3', 'December', '1979)']
['December', '1979)', 'was']
['1979)', 'was', 'an']
['was', 'an', 'Indian']
['an', 'Indian', 'field']
['Indian', 'field', 'hockey']
['field', 'hockey', 'player']
['hockey', 'player', 'widely']
['player', 'widely', 'regarded']
['widely', 'regarded', 'as']
['regarded', 'as', 'one']
['as', 'one', 'of']
['one', 'of', 'the']
['of', 'the', 'greatest']
['the', 'greatest', 'field']
['greatest', 'field', 'hockey']
['field', 'hockey', 'player']
['hockey', 'player', 'in']
['player', 'in', 'history.']
['in', 'history.', '[4][5][6]']
['history.', '[4][5][6]', 'He']
['[4][5][6]', 'He', 'was']
['He', 'was', 'known']
['was', 'known', 'for']
['known', 'for', 'his']
['for', 'his', 'extraordinary']
['his', 'extraordinary', 'ball']
['extraordinary', 'ball', 'control']
['ball', 'control', 'and']
['control', 'a

In [None]:
def sentiment_preprocess(x):
  """Clean ``x`` with ``preprocess`` and re-join the tokens into one string.

  Delegates to ``preprocess`` so the two cleaning pipelines cannot drift
  apart, then joins the surviving tokens with single spaces.

  Args:
    x: Raw text to clean.

  Returns:
    The cleaned tokens as a single space-separated string (the form passed
    to ``TextBlob`` later in the notebook).
  """
  return ' '.join(preprocess(x))

In [None]:
from textblob import TextBlob

In [None]:
# Cleaned paragraph as one space-separated string, used as the TextBlob input.
sentiment_para = sentiment_preprocess(para)
sentiment_para

'(29 august 1905 – 3 december 1979) indian field hockey player widely regarded greatest field hockey player history. known extraordinary ball control goal-scoring feats, addition earning three olympic gold medals, 1928, 1932 1936, india dominated field hockey. influence extended beyond victories, india field hockey event seven eight olympics 1928 1964. known wizard magician hockey superb ball control, played internationally 1926 1949 scored 570 goals 185 matches according autobiography, goal 1000 goals entire domestic international career. bbc called "hockey\'s equivalent muhammad ali". government india awarded india\'s third highest civilian honour padma bhushan 1956. birthday, 29 august, celebrated national sports day india every year. india\'s highest sporting honour khel ratna award named'

In [None]:
# Wrap the cleaned text in a TextBlob so its sentiment can be queried.
blob = TextBlob(sentiment_para)

In [None]:
# Polarity and subjectivity scores for the cleaned paragraph.
blob.sentiment

Sentiment(polarity=0.25833333333333336, subjectivity=0.4875)