In [1]:
!python --version   # Python version

# About python:  https://www.python.org/
#                Python is powerful... and fast; plays well with others; runs everywhere; is friendly & easy to learn;
#                is Open –> https://www.python.org/about/.
#     Python docs: https://docs.python.org/3/ (all documentation);
#                  https://docs.python.org/3.7/ (Recommended version – 3.7).
# The Python Tutorial (python3.7): https://docs.python.org/3.7/tutorial/index.html

Python 3.10.12


# [Sentiment Analysis](https://www.nltk.org/howto/sentiment.html) on [NLTK (Natural Language Toolkit)](https://www.nltk.org/)'s [Twitter Samples](https://www.nltk.org/howto/twitter.html#Extracting-Parts-of-a-Tweet)

### train model

#### load modules

In [2]:
# load modules
import re,string                                                                # load re — Regular expression operations and string for string manipulation
import numpy as np                                                              # load numerical python
#from matplotlib import pyplot as plt                                            # load plotting module

#### download nltk data

See for more - [installing NLTK Data](https://www.nltk.org/data.html)

In [3]:
# download nltk data
nltk_root=__import__('nltk')                                                    # load nltk
nltk_root.download('twitter_samples')                                           # download twitter samples
nltk_root.download('punkt')                                                     # download punctuations for tokenization
nltk_root.download('stopwords')                                                 # download all stopwords
nltk_root.download('wordnet')                                                   # download wordnet data for wordnet WordNetLemmatizer
nltk_root.download('averaged_perceptron_tagger')                                # download data for nltk POS tagger
nltk_root.download('tagsets')                                                   # download tagset info (nltk.help.upenn_tagset)
del nltk_root

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


#### load data -  "[Twitter Samples](https://www.nltk.org/howto/twitter.html#Extracting-Parts-of-a-Tweet)"

load data as dataframe -> https://www.nltk.org/howto/twitter.html#Using-Dataframes

In [4]:
# load data - twitter samples
from nltk.corpus import twitter_samples

In [5]:
# load twitter sample (unsupervised)
unsupervised:list=twitter_samples.strings(fileids='tweets.20150430-223406.json')
# see total samples
print('Total samples:',len(unsupervised))
# get random 5 samples index
unsupervised_indexs=np.random.randint(low=0,high=len(unsupervised),size=5)
# see some (file) random samples
for index in unsupervised_indexs:
  print(f'Sample {index}: {unsupervised[index]}')                               # make print sample

Total samples: 20000
Sample 17919: RT @b12snp: What a stupid man he has just handed initiative 
to the SNP.Now he will have to call on us with
(Cap in hand) https://t.co/Mi8W…
Sample 3503: RT @ScotNational: A bit late tonight .. Miliband prefers Cameron as PM to working with SNP. Tommorrow's front page from @scotnational http:…
Sample 10900: Left-wingers in the #Farage audience can't bring themselves to applaud the call to look after homeless veterans!
#VoteUKIP
Sample 8666: RT @ShinyAlex: Lest we forget that Nigel Farage is an anagram of Anal Fire Egg #ukip #AskNigelFarage
Sample 4564: Ask Farage on BBC1 now. Should be a regular thing.


In [6]:
# load positive twitter samples
positive:list=twitter_samples.strings(fileids='positive_tweets.json')
# see total samples
print('Total samples:',len(positive))
# get random 5 samples index
positive_indexs=np.random.randint(low=0,high=len(positive),size=5)
# see some (file) random samples
for index in positive_indexs:
  print(f'Sample {index}: {positive[index]}')                                   # make print sample

Total samples: 5000
Sample 3447: Google has made Narendra Modi really very sad about Imran Khan not becoming Prime Minister. :p
Sample 1911: @LiamMitch99 Really glad to hear it. :) If you have anymore problems or need us we're only a tweet away. ^LB
Sample 253: #FollowFriday @CCIFCcanada @AdamEvnmnt @boxcalf1 for being top engaged members in my community this week :)
Sample 612: Thank you @LaurieTatum2 as always! At last I’m back on track :) Hope you have a fantastic weekend! :) http://t.co/25fUKonLoM
Sample 2087: @HyundaiIndia #CretaPerfect4Me #CRETAperfectSUV Creta Has Won Loads Of Hearts In Some Days, Many More Hearts To Be Won :)


In [7]:
# load negative twitter samples
negative:list=twitter_samples.strings(fileids='negative_tweets.json')
# see total samples
print('Total samples:',len(negative))
# get random 5 samples index
negative_indexs=np.random.randint(low=0,high=len(negative),size=5)
# see some (file) random samples
for index in negative_indexs:
  print(f'Sample {index}: {negative[index]}')                                   # make print sample

Total samples: 5000
Sample 3042: Nice whilst it lasted :( #AFLBluesHawks
Sample 944: Justin where are you ? :( @justinbieber
Sample 1920: @AllicioGloria She's gone :(, Hello, we offer you free perfume samples and Chanel/Burberry/Prada giveaway on our site! Please check our bio.
Sample 1640: So this Canadian family had the best dinner of their holiday last night @biabistrot. Well done! Enjoyed ours too but we're not on holiday :(
Sample 2763: Soft defence by the best defensive team there :( #NRLTigersRoosters


#### look / search for special characters

In [8]:
# make a empty string to store all special characters in positive string
positive_special_characters=''
# get special charaters in positive tweets
for sample in positive:
  # get all special characters from sample
  positive_special_characters+=re.sub('[\w \t\n]','',sample)
  # or #positive_special_characters=positive_special_characters+re.sub('[\w ]','',sample)

# make a empty string to store all special characters in positive string
negative_special_characters=''
# get special charaters in positive tweets
for sample in negative:
  # get all special characters from sample
  negative_special_characters+=re.sub('[\w \t\n]','',sample)
  # or #negative_special_characters=negative_special_characters+re.sub('[\w ]','',sample)

# see all special characters
print('In positive tweets:',positive_special_characters)
print('In negative tweets:',negative_special_characters)

In positive tweets: #@@@:)@!:/:)!@:).?!@:)!!!:)@@:)#://./'!!!-:)://./@,’:)..,:,-.…:.#@@:)'........:)-://./://./@@&;://./:)@,':)!:)://./-:-)'...:)###.@@:'@:))??—:)://./#@@@:)@'.,#':)@!:)):)&;&;@@💅🏽💋-:))))'@@@@!:-)@......:)()-:)-():...://./@!@.':)@.:-#######:)://./@:)))))#,!:)[]##!@@?:)&;&;:-)@@&;://./:)#@@:):-)@@@@@#:-)@:@@,!!:-)@.:@.:)@://./:)-/#:)!@:)@@@,:!@;),.:).@@,:)#!:)://./#@!:@:)#@@@:)':)@@,.'.:-),,.:)@?:)@@@@@,🍭:).&;:)#://./@:):)@:"@:!:)"@@&;://./:)@@,!!!:@:)#://./@!':),?:))@:).:)://./.@'...:)#@@@:)#@:),,,…://./:):@@:—,:,...://./:)://./!://./.:)@'..#:-)#@@@:):):)@:!@@?:)&;&;!#&;@&;@:://./@:)&;:)))#:@!@://./:)://./:)&;&;@@@!!:)@@:))://./@://./:)-/#@@@:)@@&;://./:)':).@,':)://./:):-)@!:))))))))).:)://./.!:@@'-!!:@:)@:)://./:).:)://./.#@@@:)@@:@!!.:)@:)@:-),:)!@..:)@':):://./#@@@:)@@@@@:).!:-)@💪,:)",:)"'.!@@?:)&;&;@??!:)!!@@&;://./:)@!@?'!:),...,!!..;..-:))://./,,,:)://./@@@@@!:)@@':-):)://./#@@@:):)@@@@@@@@@@:)@:))@%:)@:@:)@:)#@@@:)@,:).:-)@():)@,...,!:):)🚂(@()-@,)://./@:)@'::):)

In [9]:
# make dictionary to store all special characters counts
positive_special_counts:dict=dict()
# get special charaters in positive tweets
for sample in positive:
  # get all special characters from sample
  specials:str=re.sub('[\w]','',sample)
  # get each special character in string of special characters
  for special in re.split('[\n\t ]+',specials):
    # check special character is present or not
    if special in positive_special_counts:
      positive_special_counts[special]+=1
    else: # if special character is not present
      positive_special_counts[special]=1

# make dictionary to store all special characters counts
negative_special_counts:dict=dict()
# get special charaters in negative tweets
for sample in negative:
  # get all special characters from sample
  specials:str=re.sub('[\w]','',sample)
  # get each special character in string of special characters
  for special in re.split('[\n\t ]+',specials):
    # check special character is present or not
    if special in negative_special_counts:
      negative_special_counts[special]+=1                                       # increase count by one
    else:                                                                       # if special character is not present
      negative_special_counts[special]=1                                        # create key value pair where key -> special character and value -> 1

# sort dictionary by counts
positive_special_counts=dict(sorted(positive_special_counts.items(),key=lambda kv: kv[1],reverse=True))
negative_special_counts=dict(sorted(negative_special_counts.items(),key=lambda kv: kv[1],reverse=True))

# see counts
print('Positive: ',positive_special_counts)
print('Negative: ',negative_special_counts)

Positive:  {'@': 4991, ':)': 3275, '': 2036, "'": 1241, '#': 1205, '!': 1191, '.': 1190, '://./': 1103, ':': 947, ',': 894, ':-)': 633, '?': 414, '&;': 311, '-': 230, '...': 177, '!!': 129, ':))': 119, '..': 104, '"': 95, ')': 81, '://./.': 66, '/': 47, '!!!': 45, ':)))': 38, '(': 37, '"@:': 32, '....': 32, '&;&;': 29, '*': 29, '—': 27, '""': 25, '’': 21, '??': 21, ';)': 21, ':))))': 20, '#!': 20, '!:)': 20, '^': 20, ':).': 19, ':)"': 17, '#…': 17, ':-))': 17, '…': 16, '()': 14, '@.': 14, ':)))))': 14, '~': 14, '–': 14, '.....': 14, '@,': 12, '+': 12, '?:)': 12, '!!!!': 12, '#.': 11, "''": 11, '@!': 11, '^^': 11, '♥': 10, '://./"': 10, '???': 9, ':*': 9, '..:)': 9, '#?': 9, ':-)))': 9, '←': 9, '%': 8, ':),': 8, ';': 8, '✧': 8, '?!': 7, '.:)': 7, '[]': 7, '=': 7, '**': 7, '.@': 7, ':):):)': 7, ':.': 6, '......': 6, ':))))))': 6, '|': 6, '👉': 6, '❤️': 6, ':/': 5, ':))))))))': 5, '$': 5, '."': 5, "...'": 5, '--': 5, ':-': 5, '...:)': 5, ':-*': 5, '@:': 5, '😂': 5, ':-).': 5, '🍰': 5, '#,': 

#### clean data function

```python
# see PoS tag for first five positive sentences
for index,sample in enumerate(positive[:5]):
  # make clean sample
  clean_sample=cleaner(positive[0],mode='words',apply_stemmer=False)
  # make PoS tag each word and get word tag only and add to set
  print(index,':',pos_tag(clean_sample))
```
```shell
0 : [('followfriday', 'RB'), ('engaged', 'VBD'), ('members', 'NNS'), ('community', 'NN'), ('week', 'NN'), ('positive', 'JJ')]
1 : [('james', 'NNS'), ('please', 'VBP'), ('call', 'VB'), ('contact', 'NN'), ('centre', 'NN'), ('able', 'JJ'), ('assist', 'JJ'), ('positive', 'JJ'), ('many', 'JJ'), ('thanks', 'NNS')]
2 : [('listen', 'VBN'), ('last', 'JJ'), ('night', 'NN'), ('positive', 'JJ'), ('bleed', 'NN'), ('amazing', 'VBG'), ('track', 'NN'), ('scotland', 'NN')]
3 : [('congrats', 'NNS'), ('positive', 'JJ')]
4 : [('yeaaaah', 'NN'), ('yippppy', 'JJ'), ('accnt', 'NN'), ('verified', 'VBD'), ('rqst', 'JJ'), ('succeed', 'VB'), ('blue', 'JJ'), ('tick', 'JJ'), ('mark', 'NN'), ('profile', 'NN'), ('positive', 'JJ'), ('days', 'NNS')]
```

``` python
# make set to collect all PoS tags from samples
tags_set=set()

# collect all unique tags in positive sentences (one by one)
for sample in positive:
  # make clean sample
  clean_sample:list=cleaner(sample,mode='words',apply_stemmer=False)
  # make PoS tag each word and get word tag only and add to set
  tags_set=tags_set.union(tagged_word_tuple[1] for tagged_word_tuple in pos_tag(clean_sample))

# collect all unique tags in negative sentences (one by one)
for sample in negative:
  # make clean sample
  clean_sample:list=cleaner(sample,mode='words',apply_stemmer=False)
  # make PoS tag each word and get word tag only and add to set
  tags_set=tags_set.union(tagged_word_tuple[1] for tagged_word_tuple in pos_tag(clean_sample))

# counts total taggs
print('Total unique tags:',len(tags_set))
print('See all tags (sorted):',sorted(tags_set))
```
```shell
Total unique tags: 29
See all tags (sorted): ['CC', 'CD', 'DT', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNS', 'PDT', 'PRP', 'RB', 'RBR', 'RBS', 'RP', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
```
```python
# get help on tag (use to define POS tag)
#from nltk.help import upenn_tagset
# as ->
#print(upenn_tagset('VBZ'))
```

```python
# load word tokenizer and sentence tokenizer
from nltk.tokenize import word_tokenize,sent_tokenize

# take sample corpus
corpus='''Python is a high-level, general-purpose programming language. Its design philosophy
          emphasizes code readability with the use of significant indentation via the off-side rule.

          Python is dynamically typed and garbage-collected. It supports multiple programming paradigms,
           including structured (particularly procedural), object-oriented and functional programming.
          It is often described as a "batteries included" language due to its comprehensive standard
          library. '''

# break paragraph (string) to sentences
sentences:list=sent_tokenize(corpus)                                            # return list of sentences
# see (print all senetences)
for index,sentence in enumerate(sentences):
  print(f'Line {index}:',sentence.replace("\n",""))
```
```shell
Line 0: Python is a high-level, general-purpose programming language.
Line 1: Its design philosophy           emphasizes code readability with the use of significant indentation via the off-side rule.
Line 2: Python is dynamically typed and garbage-collected.
Line 3: It supports multiple programming paradigms,           including structured (particularly procedural), object-oriented and functional programming.
Line 4: It is often described as a "batteries included" language due to its comprehensive standard           library.
```
```python
# break paragraph (string) to words  
words:list=word_tokenize(corpus)                                                # return list of words
print('Total words:',len(words))                                                # total words
print('See first 15 words:',words[:15])
```
```shell
Total words: 69
See first 15 words: ['Python', 'is', 'a', 'high-level', ',', 'general-purpose', 'programming', 'language', '.', 'Its', 'design', 'philosophy', 'emphasizes', 'code', 'readability']
```

In [10]:
# load word tokenizer
from nltk.tokenize import word_tokenize
# load stopwords module
from nltk.corpus import stopwords
# load porter stemmer class
from nltk.stem import PorterStemmer
# load wordnet lemmatizer
from nltk.stem import WordNetLemmatizer
# load POS (Part of Speach) tagger
from nltk import pos_tag

# make cleaner function
def cleaner(inputs:str,specials:dict={':)':' positive ',':-)':' positive ',':(':' negative ',':-(':' negative '},
            tags2remove:str=r'http[\S]+|@[\S]+|&amp[\S]+',punctuation2remove:str=r'[\d]|[^\w ]',
            minimum_length_of_word:int=3,apply_stemmer:bool=True,apply_lemmatizer:bool=True,mode:str='string',):#'words'

  ''' inputs (str) : string or list of string to clean
      specials (dict) : default `{':)':' positive ',':-)':' positive ',':(':' negative ',':-(':' negative '}` .
        dictionay withe key of special strings (charaters) to be replaced with value (word).
      tags2remove (str) : default `r'http[\S]+|@[\S]+|&amp[\S]+'` . String of special sequences to be removed
        (like links, hash-tags, etc.).
      punctuation2remove (str) : default `r'[\d]|[^\w ]'` . String used by `re.sub` to clean strings.
      minimum_length_of_word (int) : default 3. Minimum length of words, above which are only allowed.
      apply_stemmer (bool) : default True. To apply stemmer or not (False).
      apply_lemmatizer (bool) : default True. To apply lemmatizer or not (False).
      mode (str) : default `'string'`. To return clean string as string or list of words (`mode='words'`)
  '''

  # if inputs is list or tuple of string (make function recursion)
  if isinstance(inputs,(tuple,list)):
    # get a string from list / tuple of strings
    return list(cleaner(input_,specials,tags2remove,punctuation2remove,
                        minimum_length_of_word,apply_stemmer,apply_lemmatizer,
                        mode) for input_ in inputs)                             # make return clean data

  # make porter stemmer object
  stemmer=PorterStemmer()
  # make wordnet lemmatizer object
  lemmatizer=WordNetLemmatizer()

  # make replace specials (special charaters having meanings)
  for key,value in specials.items():
    if key in inputs:                                                           # if particular special character is present
      inputs:str=inputs.replace(key,value)                                      # make replacement

  # convert to lowercase (forcefully)
  inputs:str=inputs.casefold()
  # remove unwanted tags (like links and @tags)
  inputs:str=re.sub(tags2remove,'',inputs)
  # remove all punctuations and digits
  inputs:str=re.sub(punctuation2remove,' ',inputs)
  # tokenize sentence to words (word_tokenize)
  inputs:list=word_tokenize(inputs)                                             # return list of words
  # remove all stopwords
  inputs:list=[word for word in inputs if word not in stopwords.words(fileids='english')]
  # remove words have very samll lengths
  inputs:list=[word for word in inputs if len(word)>minimum_length_of_word]
  # make stem words (clean words) (if true)
  if apply_stemmer: inputs:list=[stemmer.stem(word) for word in inputs]
  # lemmatizer list of words (if true)
  if apply_lemmatizer:
    # The Part Of Speech tag. Valid options are "n" for nouns, "v" for verbs, "a" for adjectives, "r" for adverbs and "s"
    # for satellite adjectives (NOT in nltk.pos_tag).
    lemmatized_inputs:list=list()                                               # make empty list to collect all lemmatized words
    for tagged_word_tuple in pos_tag(inputs):
      if 'v'.casefold() in tagged_word_tuple[1][0].casefold():                  # tags found (for verbs) -> 'VB', 'VBD', 'VBG', 'VBN', 'VBP', and 'VBZ'
        lemmatized_inputs.append(lemmatizer.lemmatize(tagged_word_tuple[0],pos='v'))# "v" for verbs
      elif 'j'.casefold() in tagged_word_tuple[1][0].casefold():                # tags found (for adjectives) -> 'JJ', 'JJR' and 'JJS'
        lemmatized_inputs.append(lemmatizer.lemmatize(tagged_word_tuple[0],pos='a'))# "a" for adjectives
      elif 'r'.casefold() in tagged_word_tuple[1][0].casefold():                # tags found (for adverbs) -> 'RB', 'RBR', 'RBS' and 'RP'
        lemmatized_inputs.append(lemmatizer.lemmatize(tagged_word_tuple[0],pos='r'))# "r" for adverbs
      else:                                                                     # remining all tags are considered as noun
        lemmatized_inputs.append(lemmatizer.lemmatize(tagged_word_tuple[0],pos='n'))# "n" for nouns
      inputs:list=lemmatized_inputs                                             # make return cleaned lemmatized inputs
  # join all words to form string and return (cleaned value)
  # check function mode - if is in string mode (join cleaned words to form string)
  if mode=='words'.casefold(): return inputs                                    # return list of words (cleaned)
  else: return ' '.join(inputs)                                                 # return string (join cleaned words )

In [11]:
# make sample clean random 5 (positive) samples index - see some (file) random samples
for index in positive_indexs:
  print(f'Sample {index}:\n\tOrginal: {positive[index]} \n\tCleaned: {cleaner(positive[index])}')                                   # make print sample

Sample 3447:
	Orginal: Google has made Narendra Modi really very sad about Imran Khan not becoming Prime Minister. :p 
	Cleaned: googl make narendra modi realli imran khan becom prime minist
Sample 1911:
	Orginal: @LiamMitch99 Really glad to hear it. :) If you have anymore problems or need us we're only a tweet away. ^LB 
	Cleaned: realli glad hear posit anymor problem need tweet away
Sample 253:
	Orginal: #FollowFriday @CCIFCcanada @AdamEvnmnt @boxcalf1 for being top engaged members in my community this week :) 
	Cleaned: followfriday engag member commun week posit
Sample 612:
	Orginal: Thank you @LaurieTatum2 as always! At last I’m back on track :) Hope you have a fantastic weekend! :) http://t.co/25fUKonLoM 
	Cleaned: thank alway last back track posit hope fantast weekend posit
Sample 2087:
	Orginal: @HyundaiIndia #CretaPerfect4Me #CRETAperfectSUV Creta Has Won Loads Of Hearts In Some Days, Many More Hearts To Be Won :) 
	Cleaned: cretaperfect cretaperfectsuv creta load heart day ma

In [12]:
# make sample clean random 5 (negative) samples index - see some (file) random samples
for index in negative_indexs:
  print(f'Sample {index}:\n\tOrginal: {negative[index]} \n\tCleaned: {cleaner(negative[index])}')                                   # make print sample

Sample 3042:
	Orginal: Nice whilst it lasted :( #AFLBluesHawks 
	Cleaned: nice whilst last neg aflblueshawk
Sample 944:
	Orginal: Justin where are you ? :( @justinbieber 
	Cleaned: justin neg
Sample 1920:
	Orginal: @AllicioGloria She's gone :(, Hello, we offer you free perfume samples and Chanel/Burberry/Prada giveaway on our site! Please check our bio. 
	Cleaned: go neg hello offer free perfum sampl chanel burberri prada giveaway site plea check
Sample 1640:
	Orginal: So this Canadian family had the best dinner of their holiday last night @biabistrot. Well done! Enjoyed ours too but we're not on holiday :( 
	Cleaned: canadian famili best dinner holiday last night well do enjoy holiday neg
Sample 2763:
	Orginal: Soft defence by the best defensive team there :( #NRLTigersRoosters 
	Cleaned: soft defenc best defens team neg nrltigersroost


#### make preprocessing functions / generators

```python
# sample generator (function)
def generator(samples:list):

  ''' return each item / element of list '''

  # get a sample from lsit of samples
  for sample in samples:
    # return sample  
    yield sample

# make generator
gen=generator(['make','preprocessing','functions','generators'])
# call geneartor (also done by using loop, call one more tha length of list!)
print('Call 1st time:',next(gen))
print('Call 2nd time:',next(gen))
print('Call 3rd time:',next(gen))
print('Call 4th time:',next(gen))
#print('Call 5th time:',next(gen))# ERROR -> StopIteration (as all the element in list exhausted / ended)
# NOTE: to use generator again make it again (new)
gen=generator(['make','preprocessing','functions','generators'])
# make use generator with loop
for index,element in enumerate(gen):
  print(index,':',element)
# Thus loop automatically stops with got "StopIteration" error !!!!!!!!!
```
```shell
Call 1st time: make
Call 2nd time: preprocessing
Call 3rd time: functions
Call 4th time: generators
0 : make
1 : preprocessing
2 : functions
3 : generators

```

In [13]:
# load word tokenizer
#from nltk.tokenize import word_tokenize

# make generator to return token (word) from smaple
def token_generator(samples:list):

  ''' smaples -> list of sentences '''

  # get sample (sentence) form samples (list of senetences)
  for sample in samples:
    # if sample is string (sentence) than break it into list of words
    if isinstance(sample,str):sample:list=word_tokenize(sample)
    # get word by word from list of words
    for word in sample:
      # yield word by word
      yield word


# make generator to convert list of senetences to list of dictionary where key is token (word) and value is True
def processCleanedDataGen(samples:list):

  ''' smaples -> list of sentences '''

  # get sample (sentence) form samples (list of senetences)
  for sample in samples:
    # if sample is string (sentence) than break it into list of words
    if isinstance(sample,str):sample:list=word_tokenize(sample)
    # return whole senetence as dictionay where key -> token (word) and value -> True
    yield {word:True for word in sample}

# make tryout of token_generator -> list of sentences to word generator
#token_gen=token_generator(cleaner([['followfriday engag member commun week posit']]))# or
#token_gen=token_generator(cleaner(positive[:1]))# or
token_gen=token_generator(cleaner(positive[0:1]))
# get all words yielded by generator
for word in token_gen:
  print(word,end=', ')                                                          # make print all in one line
print('\n')                                                                     # get to next line

#  make tryout of token_generator -> list of sentences to sentences dictionary
processed_data_gen=processCleanedDataGen(cleaner(positive[0:2]))                # get first two senetences
# get all senetences yielded by generator
for index,sentence in enumerate(processed_data_gen):
  print(index,':',sentence)

followfriday, engag, member, commun, week, posit, 

0 : {'followfriday': True, 'engag': True, 'member': True, 'commun': True, 'week': True, 'posit': True}
1 : {'jame': True, 'plea': True, 'call': True, 'contact': True, 'centr': True, 'abl': True, 'assist': True, 'posit': True, 'mani': True, 'thank': True}


#### make fequency dictionary

to count number of occurance of each word

In [14]:
# RERUN THIS SHELL IF YOUR GENRATORS ARE DISTORTED / exhausted -> REMAKE generators
# make word generator for all postive words
poistiveGenerator=token_generator(cleaner(positive,apply_stemmer=False))
print(f'type of postive_word_gen_ -> {poistiveGenerator}:',type(poistiveGenerator))

# make frequency dict
# A frequency distribution for the outcomes of an experiment. A frequency distribution records the number of times each outcome of
# an experiment has occurred.  For example, a frequency distribution could be used to record the frequency of each word type in a
# document.  Formally, a frequency distribution can be defined as a function mapping from each sample to the number of times that
# sample occurred as an outcome.
#from nltk import FreqDist #or
from nltk.probability import FreqDist

# make postive words frequency dictionary
positive_words_frequency=FreqDist(poistiveGenerator)
# see -> frequency dict
print(positive_words_frequency)
print('Frequency of top 10 most common elements [positive]:\n',positive_words_frequency.most_common(10),sep='')
type(positive_words_frequency) # to see resutl -> positive_freq_dict_ -> return dict

type of postive_word_gen_ -> <generator object token_generator at 0x7b1ab67d5770>: <class 'generator'>
<FreqDist with 5717 samples and 26292 outcomes>
Frequency of top 10 most common elements [positive]:
[('positive', 4408), ('thanks', 394), ('follow', 369), ('love', 340), ('thank', 250), ('good', 245), ('like', 234), ('happy', 199), ('great', 176), ('back', 163)]


nltk.probability.FreqDist

In [15]:
# RERUN THIS SHELL IF YOUR GENRATORS ARE DISTORTED / exhausted -> REMAKE generators
# make word generator for all negative words
poistiveGenerator=token_generator(cleaner(negative,apply_stemmer=False))
print(f'type of postive_word_gen_ -> {poistiveGenerator}:',type(poistiveGenerator))

# make frequency dict
# A frequency distribution for the outcomes of an experiment. A frequency distribution records the number of times each outcome of
# an experiment has occurred.  For example, a frequency distribution could be used to record the frequency of each word type in a
# document.  Formally, a frequency distribution can be defined as a function mapping from each sample to the number of times that
# sample occurred as an outcome.
#from nltk import FreqDist #or
from nltk.probability import FreqDist

# make postive words frequency dictionary
negative_words_frequency=FreqDist(poistiveGenerator)
# see -> frequency dict
print(negative_words_frequency)
print('Frequency of top 10 most common elements [negative]:\n',negative_words_frequency.most_common(10),sep='')
type(negative_words_frequency) # to see resutl -> negative_freq_dict_ -> return dict

type of postive_word_gen_ -> <generator object token_generator at 0x7b1ab67d5b60>: <class 'generator'>
<FreqDist with 5515 samples and 25786 outcomes>
Frequency of top 10 most common elements [negative]:
[('negative', 5144), ('miss', 299), ('please', 274), ('follow', 260), ('want', 241), ('like', 229), ('time', 163), ('sorry', 149), ('go', 147), ('love', 142)]


nltk.probability.FreqDist

#### make preprocess data

In [16]:
# make dataset for model (modeled data)
poistive_words4Data=processCleanedDataGen(cleaner(positive,apply_stemmer=False))
negative_words4Data=processCleanedDataGen(cleaner(negative,apply_stemmer=False))
# see (first sample)
print('Sample: ',next(processCleanedDataGen(cleaner(positive[:1],apply_stemmer=False))))

# make dataset -
positive_dataset=[(tweet_dict,'Positive') for tweet_dict in poistive_words4Data] # list of tuples where t[0]-> dict, t[1]-> 'string'
negative_dataset=[(tweet_dict,'Negative') for tweet_dict in negative_words4Data]
# merge dataset
dataset=positive_dataset+negative_dataset
print('Size of dataset:',len(dataset)) # LEN->10_000
# split data set -> 90:10
from sklearn.model_selection import train_test_split
# split data
train_set,test_set=train_test_split(dataset,test_size=0.10)
# see size
len(train_set),len(test_set) # NOT THIS ALSO

Sample:  {'followfriday': True, 'engage': True, 'member': True, 'community': True, 'week': True, 'positive': True}
Size of dataset: 10000


(9000, 1000)

In [17]:
# see first sample (final data)
[(tweet_dict,'Positive') for tweet_dict in processCleanedDataGen(cleaner(positive[:1],apply_stemmer=False))][0]

({'followfriday': True,
  'engage': True,
  'member': True,
  'community': True,
  'week': True,
  'positive': True},
 'Positive')

#### training and testing model - [NaiveBayesClassifier](https://scikit-learn.org/stable/modules/naive_bayes.html)

to-read: https://ocw.mit.edu/courses/sloan-school-of-management/15-097-prediction-machine-learning-and-statistics-spring-2012/lecture-notes/MIT15_097S12_lec07.pdf

to-read: https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-034-artificial-intelligence-fall-2010/tutorials/MIT6_034F10_tutor06.pdf

In [18]:
# import module
from nltk import classify,NaiveBayesClassifier
# train maodel
nbclassifier=NaiveBayesClassifier.train(train_set)
# test model
print('Accuracy score:',round(classify.accuracy(nbclassifier,test_set)*100,3))

Accuracy score: 97.7


In [19]:
# classifier 10 most informative features
nbclassifier.show_most_informative_features(10)

Most Informative Features
                negative = True           Negati : Positi =   2980.9 : 1.0
                positive = True           Positi : Negati =    837.4 : 1.0
                negative = None           Positi : Negati =    116.6 : 1.0
                follower = True           Positi : Negati =     30.1 : 1.0
               community = True           Positi : Negati =     21.6 : 1.0
                    glad = True           Positi : Negati =     16.1 : 1.0
               goodnight = True           Positi : Negati =     14.9 : 1.0
             opportunity = True           Positi : Negati =     14.3 : 1.0
                    tire = True           Negati : Positi =     13.9 : 1.0
                  arrive = True           Positi : Negati =     13.8 : 1.0


### evaluate (test) on 2023 most famous tweets

In [20]:
# source: https://twitter.com/BarackObama/status/896523232098078720
tweet=''' No one is born hating another person because of the
color of his skin or his background or his religion...
'''#BarackObama

# make classiftoken_generatoriy sentence
nbclassifier.classify(dict([(word,True) for word in cleaner(tweet,mode='words')]))

'Positive'

**install googletrans** - Googletrans is a free and unlimited python library that implemented Google Translate API. This uses the [Google Translate Ajax API](https://translate.google.com/) to make calls to such methods as detect and translate. Compatible with Python 3.6+. For details refer to the [API Documentation](https://py-googletrans.readthedocs.io/en/latest). **[NOT WORKING]**



In [21]:
# make install - https://github.com/terryyin/translate-python
!python -m pip install -qq translate

In [22]:
# load module translator
from translate import Translator
# make translator : Languages : https://py-googletrans.readthedocs.io/en/latest/#googletrans-languages
translator=Translator(to_lang='en',from_lang='hi')
# make translation
translation=translator.translate("यह एक पायथन भाषा है।")
# make print results
print(f'"यह एक पायथन भाषा है।" -> {translation}')

"यह एक पायथन भाषा है।" -> این یک زبان پایتون است.


In [23]:
tweet='''संकट कितना भी बड़ा हो, विपत्ति कितनी भी बड़ी हो, स्वामी जी के लिए मानवीय संवेदनाएं हमेशा सर्वोच्च रहीं।
अक्षरधाम पर आतंकी हमले के बाद जब मैंने स्वामी जी को फोन किया तो उनकी बात सुनकर आश्चर्य में पड़ गया…'''#narendramodi

# make translator : Languages : https://py-googletrans.readthedocs.io/en/latest/#googletrans-languages
translator=Translator(to_lang='en',from_lang='hi')

# make classifiy sentence
nbclassifier.classify(dict([(word,True) for word in cleaner(translator.translate(tweet),mode='words')]))

'Negative'

### save model and functions to local disk

In [24]:
# load pickle
import pickle

# make save model
with open('nltk.nb.model',mode='wb') as model_file:
  # make save model
  pickle.dump(nbclassifier,model_file)

### deploy model

make files (".py" and model file)

In [25]:
%%file main
import re                                                                       # load re — Regular expression operations
from nltk.tokenize import word_tokenize                                         # load word tokenizer
from nltk.corpus import stopwords                                               # load stopwords module
from nltk.stem import PorterStemmer                                             # load porter stemmer class
from nltk.stem import WordNetLemmatizer                                         # load wordnet lemmatizer
from nltk import pos_tag                                                        # load POS (Part of Speach) tagger

# make cleaner function
def cleaner(inputs:str,specials:dict={':)':' positive ',':-)':' positive ',':(':' negative ',':-(':' negative '},
            tags2remove:str=r'http[\S]+|@[\S]+|&amp[\S]+',punctuation2remove:str=r'[\d]|[^\w ]',
            minimum_length_of_word:int=3,apply_stemmer:bool=True,apply_lemmatizer:bool=True,mode:str='string',):#'words'

  ''' inputs (str) : string or list of string to clean
      specials (dict) : default `{':)':' positive ',':-)':' positive ',':(':' negative ',':-(':' negative '}` .
        dictionay withe key of special strings (charaters) to be replaced with value (word).
      tags2remove (str) : default `r'http[\S]+|@[\S]+|&amp[\S]+'` . String of special sequences to be removed
        (like links, hash-tags, etc.).
      punctuation2remove (str) : default `r'[\d]|[^\w ]'` . String used by `re.sub` to clean strings.
      minimum_length_of_word (int) : default 3. Minimum length of words, above which are only allowed.
      apply_stemmer (bool) : default True. To apply stemmer or not (False).
      apply_lemmatizer (bool) : default True. To apply lemmatizer or not (False).
      mode (str) : default `'string'`. To return clean string as string or list of words (`mode='words'`)
  '''


  if isinstance(inputs,(tuple,list)):                                           # if inputs is list or tuple of string (make function recursion)
    # get a string from list / tuple of strings
    return list(cleaner(input_,specials,tags2remove,punctuation2remove,minimum_length_of_word,apply_stemmer,apply_lemmatizer,
                        mode) for input_ in inputs)                             # make return clean data

  stemmer=PorterStemmer()                                                       # make porter stemmer object
  lemmatizer=WordNetLemmatizer()                                                # make wordnet lemmatizer object

  # make replace specials (special charaters having meanings)
  for key,value in specials.items():
    if key in inputs:                                                           # if particular special character is present
      inputs:str=inputs.replace(key,value)                                      # make replacement

  inputs:str=inputs.casefold()                                                  # convert to lowercase (forcefully)
  inputs:str=re.sub(tags2remove,'',inputs)                                      # remove unwanted tags (like links and @tags)
  inputs:str=re.sub(punctuation2remove,' ',inputs)                              # remove all punctuations and digits
  inputs:list=word_tokenize(inputs)# tokenize sentence to words (word_tokenize) -> return list of words
  inputs:list=[word for word in inputs if word not in stopwords.words(fileids='english')]# remove all stopwords
  inputs:list=[word for word in inputs if len(word)>minimum_length_of_word]     # remove words have very samll lengths
  if apply_stemmer: inputs:list=[stemmer.stem(word) for word in inputs]         # make stem words (clean words) (if true)
  if apply_lemmatizer:                                                          # lemmatizer list of words (if true)
    # The Part Of Speech tag. Valid options are "n" for nouns, "v" for verbs, "a" for adjectives, "r" for adverbs and "s"
    # for satellite adjectives (NOT in nltk.pos_tag).
    lemmatized_inputs:list=list()                                               # make empty list to collect all lemmatized words
    for tagged_word_tuple in pos_tag(inputs):
      if 'v'.casefold() in tagged_word_tuple[1][0].casefold():                  # tags found (for verbs) -> 'VB', 'VBD', 'VBG', 'VBN', 'VBP', and 'VBZ'
        lemmatized_inputs.append(lemmatizer.lemmatize(tagged_word_tuple[0],pos='v'))# "v" for verbs
      elif 'j'.casefold() in tagged_word_tuple[1][0].casefold():                # tags found (for adjectives) -> 'JJ', 'JJR' and 'JJS'
        lemmatized_inputs.append(lemmatizer.lemmatize(tagged_word_tuple[0],pos='a'))# "a" for adjectives
      elif 'r'.casefold() in tagged_word_tuple[1][0].casefold():                # tags found (for adverbs) -> 'RB', 'RBR', 'RBS' and 'RP'
        lemmatized_inputs.append(lemmatizer.lemmatize(tagged_word_tuple[0],pos='r'))# "r" for adverbs
      else:                                                                     # remining all tags are considered as noun
        lemmatized_inputs.append(lemmatizer.lemmatize(tagged_word_tuple[0],pos='n'))# "n" for nouns
      inputs:list=lemmatized_inputs                                             # make return cleaned lemmatized inputs
  # join all words to form string and return (cleaned value)
  # check function mode - if is in string mode (join cleaned words to form string)
  if mode=='words'.casefold(): return inputs                                    # return list of words (cleaned)
  else: return ' '.join(inputs)                                                 # return string (join cleaned words )

if __name__=='__main__':                                                        # call program under main scope
  import sys                                                                    # load sys module
  import os                                                                     # load os module
  import pickle                                                                 # load pickle

  model_path='nltk.nb.model'                                                    # define model path
  if os.path.exists(model_path):                                                # if file exists
    with open(model_path,mode='rb') as nbclassifier_file_object:                # read model file
      nbclassifier=pickle.load(nbclassifier_file_object)                        # make load model
    # make pass input string (sys.argv[1]) to model (after preprocessing) and get output
    output=nbclassifier.classify(dict([(word,True) for word in cleaner(sys.argv[1],mode='words')]))
    print(f'The given string -> "{sys.argv[1]}" is "{output}"')                 # make print result
  else: print(f'Failed to load model! No model found at path -> "{model_path}"')# if model not at given path

Writing main


In [26]:
# how to run (required -> 'main' (file) and 'nltk.nb.model' (model file))
!python main "This is a wonderful state, blessed with exceptional natural beauty and hardworking people."
# by PM Narendra Modi

The given string -> "This is a wonderful state, blessed with exceptional natural beauty and hardworking people." is "Positive"


# References / Further reading

* [Official python docs](https://docs.python.org/3/)
* [Official python tutorials](https://docs.python.org/3/tutorial/index.html)
* [NLTK :: Natural Language Toolkit](https://www.nltk.org/)
* [Natural Language Processing with Python – Analyzing Text with the Natural Language Toolkit](https://www.nltk.org/book/), a book by Steven Bird, Ewan Klein, and Edward Loper.
* [Accessing Text Corpora and Lexical Resources](https://www.nltk.org/book/ch02.html)
* [spaCy · Industrial-strength Natural Language Processing](https://spacy.io/), Docs - https://spacy.io/api/doc , Example - https://spacy.io/api/example
* [spaCy 101: Everything you need to know](https://spacy.io/usage/spacy-101)
* NLTK's [NaiveBayesClassifier](https://scikit-learn.org/stable/modules/naive_bayes.html)
* [Example usage of NLTK modules](https://www.nltk.org/howto.html)
* [What is WordNet?](https://wordnet.princeton.edu/) and synsets - [Sample usage for wordnet](https://www.nltk.org/howto/wordnet.html)
