In [2]:
# Problem Statement:- Extract a sample document and apply the following document preprocessing methods: Tokenization, POS
# Tagging, stop-word removal, Stemming and Lemmatization.
# Create a representation of the documents by calculating Term Frequency and Inverse Document Frequency.

In [3]:
# Notes:-

# Tokenization is the process of breaking down a text into smaller units called tokens.
# These tokens can be words, sentences, or even subwords, depending on the level of granularity you need
# for your analysis.

In [4]:
# 1) Word tokenization:- Imagine you have a chunk of text, such as a paragraph;
# word tokenization breaks that text into individual words (and punctuation marks).

# 2) Sentence Tokenization: Similar to word tokenization, but instead of breaking
# the text into individual words, you're breaking it into sentences.

# 3) POS Tagging:- POS Tagging (Part-of-Speech Tagging): In natural language processing,
# each word in a sentence can be categorized into a particular part of speech, such as noun, verb, adjective, etc.
# For example, in the sentence "The cat is sleeping", POS tagging would label "The" as a determiner, "cat" as a noun, "is" as a verb, and "sleeping" as a verb.

# 4) Stemming:- Stemming is the process of reducing words to their root or base form, which allows different
# variations of the same word to be treated as the same word. For example, "running" and "runs" are both
# reduced to "run"; an irregular form such as "ran" is left unchanged by a rule-based stemmer and needs lemmatization.

In [5]:
# In summary, word tokenization breaks text into individual words, sentence tokenization breaks text into individual sentences, 
# POS tagging assigns grammatical labels to words in a sentence, and stemming reduces words to their base form. 
# These are all important preprocessing steps in natural language processing tasks.
# NLTK library is used for the tokenization.

In [8]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
# word_tokenize / sent_tokenize rely on the Punkt models. Fetch them here so the
# cell also runs on a fresh environment; "punkt_tab" is the resource name that
# newer NLTK releases look up, "punkt" the one older releases use.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

block = "Dr. D. Y. Patil Institute of Engineering, Management, and Research."
# Use the functions imported above instead of reaching through the nltk namespace.
print("This is word-wise tokenization-:\n", word_tokenize(block), '\n')
print("----------------------------------------------------------\n")
print("This is sentence-wise tokenization-:\n", sent_tokenize(block))




This is word-wise tokenization-:
 ['Dr.', 'D.', 'Y.', 'Patil', 'Institute', 'of', 'Engineering', ',', 'Management', ',', 'and', 'Research', '.'] 

----------------------------------------------------------

This is sentence-wise tokenization-:
 ['Dr. D. Y. Patil Institute of Engineering, Management, and Research.']


In [7]:
from nltk.corpus import stopwords

nltk.download("stopwords")
# Stop words are very common words ('and', 'the', 'is', ...) that carry little meaning on
# their own, so they are usually filtered out before text analysis or text mining to keep
# the focus on the more informative tokens.
stop_words = stopwords.words("english")
print(stop_words)
token = nltk.word_tokenize(block)
# The NLTK stop-word list is all lowercase, so compare case-insensitively; otherwise a
# capitalised "The" or "And" would survive the filter. A set makes each lookup O(1).
stop_word_lookup = set(stop_words)
cleaned_token = [word for word in token if word.lower() not in stop_word_lookup]
print("This is the unclean version-:", "\n", token, "\n")
print("----------------------------------------------------------------------\n")
print("This is the cleaned version-:", "\n", cleaned_token)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from nltk.stem import PorterStemmer
# Stem each inflected form with the Porter algorithm and show the results.
stemmer = PorterStemmer()
words = ['rain', 'rained', 'raining', 'rains']
stemmed = list(map(stemmer.stem, words))
print(stemmed)


In [None]:
from nltk.stem import WordNetLemmatizer
# Data dependencies of the WordNet lemmatizer.
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)
lemmatizer = WordNetLemmatizer()
# Lemmatize the stop-word-filtered tokens one by one (default part of speech).
lemmatized = list(map(lemmatizer.lemmatize, cleaned_token))
print(lemmatized)


In [None]:
from nltk import pos_tag
# Newer NLTK releases look for the tagger model under the "_eng" resource name, older
# ones under the plain name; requesting both keeps the cell working across versions.
for resource in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)
# NOTE(review): the tagger uses neighbouring words as context, so tagging the
# stop-word-filtered tokens may give less accurate tags than tagging the raw token list.
tagged = pos_tag(cleaned_token)
print(tagged)

In [None]:
import pandas as pd
import sklearn as sk
import math


In [None]:
import re

block_1 = "Our aim is to develop a good work culture among students, a culture where students from various technical backgrounds come together to teach,guide and collaborate with each other on various projects and grow together."
block_2 = "Keeping in mind the interest of the IT professionals and computer enthusiasts, CSI works towards making the profession an area of choice amongst all sections of the society. The promotion of Information Technology as a profession is the top priority of CSI today. To fulfill this objective,the CSI regularly organizes conferences, conventions, lectures, projects,and awards. And at the same time, it also ensures that regular training and skill updating are organized for the future IT professionals."


def _to_terms(text):
    """Lower-case `text` and return its alphanumeric word tokens, punctuation dropped."""
    return re.findall(r"[a-z0-9]+", text.lower())


# Splitting on a single space leaves punctuation glued to the words ("students," vs
# "students", "teach,guide"), which fragments the term counts, so extract word tokens.
# Caveat: lower-casing merges the acronym "IT" with the pronoun "it".
first_block = _to_terms(block_1)
second_block = _to_terms(block_2)
# The union of both documents' terms is the shared vocabulary (duplicates collapse).
total = set(first_block).union(second_block)
# Sorted so the printed vocabulary is stable between runs (set order is not).
print(sorted(total))

In [None]:
# Start every vocabulary term at zero for each document, then tally the tokens
# of each document into its own dictionary.
wordDictA = {term: 0 for term in total}
wordDictB = {term: 0 for term in total}
for tokens, counts in ((first_block, wordDictA), (second_block, wordDictB)):
    for term in tokens:
        counts[term] += 1


In [None]:
# Bag-of-words table: one row per document, one column per vocabulary term, cells are raw counts.
pd.DataFrame([wordDictA, wordDictB])

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# wordDictA is keyed by the combined vocabulary of both documents, so this yields the
# vocabulary minus stop words. The NLTK list is lowercase, hence the case-insensitive
# comparison: capitalised forms such as "The" or "And" are filtered too.
filtered_sentence = [w for w in wordDictA if w.lower() not in stop_words]
print(filtered_sentence)


In [None]:
def computeTF(wordDict, doc):
    """Return the term frequency of every word in `wordDict` for one document.

    Args:
        wordDict: mapping of word -> number of occurrences in the document.
        doc: the document's token list; only its length is used, as the normaliser.

    Returns:
        dict mapping each word to ``count / len(doc)``. An empty `doc` yields 0.0
        for every word instead of raising ZeroDivisionError.
    """
    corpusCount = len(doc)
    if corpusCount == 0:
        # No tokens at all: every term frequency is zero by definition.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(corpusCount) for word, count in wordDict.items()}
# Compute the term frequencies of each document against the shared vocabulary,
# then tabulate them with one row per document and one column per term.
tfFirst = computeTF(wordDictA, first_block)
tfSecond = computeTF(wordDictB, second_block)
tf = pd.DataFrame([tfFirst, tfSecond])
print(tf)

In [None]:
def computeIDF(docList):
    """Compute the inverse document frequency of every term in a corpus.

    Args:
        docList: list of word -> count dictionaries, one per document. A term
            counts as present in a document when its count is greater than zero.

    Returns:
        dict mapping each term to ``log10(N / (df + 1))``, where N is the number
        of documents and df the number of documents containing the term. The +1
        avoids division by zero for terms that occur in no document; as a
        consequence a term present in every document gets a negative weight.
        An empty `docList` returns an empty dict.
    """
    N = len(docList)
    if N == 0:
        return {}

    # Document frequency: in how many documents does each term actually occur?
    # Terms are collected from every document, not only the first one.
    docFreq = {}
    for wordDict in docList:
        for word, count in wordDict.items():
            docFreq.setdefault(word, 0)
            if count > 0:
                docFreq[word] += 1

    return {word: math.log10(N / (float(df) + 1)) for word, df in docFreq.items()}

idfs = computeIDF([wordDictA, wordDictB])
# Tabulate the IDF weights themselves (a single row, one column per term)
# rather than the raw count dictionaries.
idfs1 = pd.DataFrame([idfs])
print(idfs1)

In [None]:
def computeTFIDF(tfBow, idfs):
    """Weight one document's term frequencies by the corpus IDF values.

    Args:
        tfBow: mapping of term -> term frequency for a single document.
        idfs: mapping of term -> inverse document frequency; must contain every
            term of `tfBow` (a missing term raises KeyError).

    Returns:
        dict mapping each term of `tfBow` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}
# Combine each document's term frequencies with the corpus IDF weights.
# NOTE: despite their names, idfFirst / idfSecond / idf hold TF-IDF scores, not IDF values.
idfFirst = computeTFIDF(tfFirst, idfs)
idfSecond = computeTFIDF(tfSecond, idfs)
# Tabulate the TF-IDF scores: one row per document, one column per term.
idf= pd.DataFrame([idfFirst, idfSecond])
print(idf)


In [None]:
# WordNet is a large lexical database of English nouns, adjectives, adverbs and verbs. NLTK ships an interface to it, which the WordNetLemmatizer above relies on.
#

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Two sample documents to vectorise.
version_1 = "Developing a competitive culture where the students polish technical and professional attributes, gain experience and learn new skills while upgrading the already present skillset. For those fledglings who have a zeal to build a strong profile and are hunting for their Ikigai, CSI provides ample opportunities for those individuals too."
version_2 = "Personalized career guidance, Regular Logic and aptitude building activities, Industrial level project collaboration, Building a network with active collaborations across the globe, Periodic member exclusive conferences and seminars, Created a community for sharing skills and knowledge"
# Create the vectoriser, then fit it on the lower-cased documents and transform
# them in a single fit_transform call.
vectorize = TfidfVectorizer()
response = vectorize.fit_transform([text.lower() for text in (version_1, version_2)])

In [None]:
 # Print the fitted TF-IDF matrix returned by fit_transform above.
 # NOTE(review): presumably a sparse matrix, so this lists non-zero entries by index; map column indices to terms via the vectoriser's vocabulary — confirm.
 print(response)