<a href="https://colab.research.google.com/github/FaizaAli-Dev/Natural-language-Processing/blob/all-about-AI-and-Data-Science/NLP_Text_Preprocessing_using_NLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **NLP **

In [None]:
import os
import nltk
import random
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') ##contains the pre-trained English [Part-of-Speech (POS]](https://en.wikipedia.org/wiki/Part_of_speech) tagger in NLTK
nltk.download('maxent_ne_chunker')
nltk.download('words')
random.seed(92)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
corpus = str(["""Natural Language Processing helps us do just that!

Natural language processing (NLP) can be thought of as an intersection of Linguistics, Computer Science and Artificial Intelligence that helps computers understand, interpret and manipulate human language."""])

In [None]:
token_list = []
sentences = nltk.sent_tokenize(str(corpus))
for sentence in sentences:
  words = nltk.word_tokenize(sentence)
  token_list.extend(words)  #append() adds a single element to the end of the list while, extend() can add multiple individual elements to the end of the list
  tagged_words = nltk.pos_tag(words)
  named_entities = nltk.ne_chunk(tagged_words)

print(token_list)
print ('\nLength of word_list:', len(token_list))

['[', "'Natural", 'Language', 'Processing', 'helps', 'us', 'do', 'just', 'that', '!', '\\n\\nNatural', 'language', 'processing', '(', 'NLP', ')', 'can', 'be', 'thought', 'of', 'as', 'an', 'intersection', 'of', 'Linguistics', ',', 'Computer', 'Science', 'and', 'Artificial', 'Intelligence', 'that', 'helps', 'computers', 'understand', ',', 'interpret', 'and', 'manipulate', 'human', 'language', '.', "'", ']']

Length of word_list: 44


In [None]:
word_list = corpus.split()
print(word_list)
print ('\nLength of word_list:', len(word_list))

["['Natural", 'Language', 'Processing', 'helps', 'us', 'do', 'just', 'that!\\n\\nNatural', 'language', 'processing', '(NLP)', 'can', 'be', 'thought', 'of', 'as', 'an', 'intersection', 'of', 'Linguistics,', 'Computer', 'Science', 'and', 'Artificial', 'Intelligence', 'that', 'helps', 'computers', 'understand,', 'interpret', 'and', 'manipulate', 'human', "language.']"]

Length of word_list: 34


In [None]:
alphabets_only = [word for word in word_list if word.isalpha()]
print(alphabets_only)
print ('\nLength of word_list:', len(alphabets_only))

['Language', 'Processing', 'helps', 'us', 'do', 'just', 'language', 'processing', 'can', 'be', 'thought', 'of', 'as', 'an', 'intersection', 'of', 'Computer', 'Science', 'and', 'Artificial', 'Intelligence', 'that', 'helps', 'computers', 'interpret', 'and', 'manipulate', 'human']

Length of word_list: 28


Text Normalization

In [None]:
lower_case = [word.lower() for word in alphabets_only]
print(lower_case)
print ('\nLength of word_list:', len(lower_case))

['language', 'processing', 'helps', 'us', 'do', 'just', 'language', 'processing', 'can', 'be', 'thought', 'of', 'as', 'an', 'intersection', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', 'that', 'helps', 'computers', 'interpret', 'and', 'manipulate', 'human']

Length of word_list: 28


In [None]:
from nltk.corpus import stopwords

stopwords_nltk = set(stopwords.words('english'))
print(stopwords_nltk)
print ('\nLength of word_list:', len(stopwords_nltk), '\n')

cleaned_words = [word for word in lower_case if word not in stopwords_nltk]
print(cleaned_words)
print ('\nLength of word_list:', len(cleaned_words))

{'if', 'hadn', "wasn't", 'm', 'of', 'very', 'do', "hadn't", 'don', 'more', 'he', 'to', 'each', 'only', 'not', 'are', 'but', "hasn't", 'been', 'ours', 'as', 'won', 'nor', 'own', 'once', 'be', 'this', 'other', 'hers', "didn't", 'hasn', 'about', 'again', 'y', 'no', 've', 're', 'doing', 'ourselves', 'while', 'above', "don't", 'did', 'yourself', 'during', "you'd", "you'll", 'him', 'didn', 'yourselves', 'a', 'so', 'our', "shan't", 'what', 'by', 'they', 'into', 'off', 't', "mightn't", 'up', 's', "shouldn't", 'until', 'd', 'will', 'himself', 'couldn', 'isn', 'it', 'between', 'an', "isn't", "she's", 'shan', 'ain', 'same', 'over', 'why', 'which', "it's", 'or', 'who', 'them', 'before', 'when', 'your', 'she', 'all', 'the', 'through', 'weren', 'on', 'where', "weren't", 'has', 'after', 'such', 'both', 'its', 'than', 'in', 'wouldn', 'any', 'you', 'these', 'being', "couldn't", 'does', 'her', 'those', 'because', 'aren', 'itself', 'against', "haven't", 'now', 'ma', 'have', 'most', 'whom', 'under', 'am',

In [None]:
stop_words = stopwords.words('english')  # Get the list of English stopwords
print(stop_words)
available_languages = stopwords.fileids()
print(available_languages)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '