# Stemming

In [1]:
import nltk
from nltk.stem.porter import *

In [2]:
stemming = PorterStemmer()

In [3]:
words = ['run','runs','running','ran','easily','fairly','easy', 'fair','history','historical']

In [4]:
for word in words:
    print(word + '--->'+stemming.stem(word))

run--->run
runs--->run
running--->run
ran--->ran
easily--->easili
fairly--->fairli
easy--->easi
fair--->fair
history--->histori
historical--->histor


In [26]:
sentence = "In formally outlining the crux of the proposed Digital India Act, 2023, the Minister of State, IT, Rajeev Chandrasekhar, made a case for a robust replacement of the IT Act, 2000, which is somewhat obsolete now. He ominously added a question that the government sought to revisit: “should there be a ‘safe harbour’ at all for all intermediaries?” This acquires significance as the government has been working towards increasing the compliance burden on Internet intermediaries, in particular in the IT Rules 2021 and its later amendments. These Rules themselves had put the onus on social media intermediaries to arbitrate on content on their platforms with regulations that were weighted in favour of the government of the day, and had invited legal appeals as digital news media platforms among others questioned the constitutionality of the Rules. Meanwhile, an amendment in October 2022 provided for government-appointed committees that will adjudicate on an individual user’s appeals against moderation decisions of these intermediaries. In January 2023, the IT Ministry proposed an amendment on the take down of social media/news content that has been marked as “fake” or “false” by the Press Information Bureau or any other government agency. These, in sum, had already put the safe harbour protections for intermediaries at much risk."

In [6]:
from nltk.corpus import stopwords

In [27]:
my_review = nltk.sent_tokenize(sentence)

In [28]:
my_review

['In formally outlining the crux of the proposed Digital India Act, 2023, the Minister of State, IT, Rajeev Chandrasekhar, made a case for a robust replacement of the IT Act, 2000, which is somewhat obsolete now.',
 'He ominously added a question that the government sought to revisit: “should there be a ‘safe harbour’ at all for all intermediaries?” This acquires significance as the government has been working towards increasing the compliance burden on Internet intermediaries, in particular in the IT Rules 2021 and its later amendments.',
 'These Rules themselves had put the onus on social media intermediaries to arbitrate on content on their platforms with regulations that were weighted in favour of the government of the day, and had invited legal appeals as digital news media platforms among others questioned the constitutionality of the Rules.',
 'Meanwhile, an amendment in October 2022 provided for government-appointed committees that will adjudicate on an individual user’s appeal

In [13]:
stemming = PorterStemmer()

In [18]:
for i in range(len(my_review)):
    words = nltk.word_tokenize(my_review[i])
    words = [stemming.stem(word) for word in words if word not in set(stopwords.words('english'))]
    my_review[i] = ' '.join(words)

In [19]:
my_review

['formal outlin crux propo digit india act , 2023 , minist state , , rajeev chandrasekhar , made case robust replac act , 2000 , somewhat obsolet .',
 'omin ad question govern sought revisit : “ ‘ safe harbour ’ intermediari ? ” thi acquir signif govern ha work toward increa complianc burden internet intermediari , particular rule 2021 later amend .',
 'rule themselv put onu social media intermediari arbitr content platform regul weight favour govern day , invit legal appeal digit news media platform among question constitut rule .',
 'meanwhil , amend octob 2022 provid government-appoint committ adjud individu user ’ appeal moder deci intermediari .',
 'januari 2023 , ministri propo amend take social media/new content ha mark “ fake ” “ fal ” press inform bureau ani govern agenc .',
 ', sum , alreadi put safe harbour protect intermediari much risk .']

# Lemmatization

In [23]:
words = ['run','runs','running','ran','easily','fairly','easy', 'fair','history','historical']

In [20]:
from nltk.stem import WordNetLemmatizer

In [21]:
lemm = WordNetLemmatizer()

In [25]:
for word in words:
    print(word + '--->'+lemm.lemmatize(word))

run--->run
runs--->run
running--->running
ran--->ran
easily--->easily
fairly--->fairly
easy--->easy
fair--->fair
history--->history
historical--->historical


In [29]:
for i in range(len(my_review)):
    words = nltk.word_tokenize(my_review[i])
    words = [lemm.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    my_review[i] = ' '.join(words)

In [30]:
my_review

['In formally outlining crux proposed Digital India Act , 2023 , Minister State , IT , Rajeev Chandrasekhar , made case robust replacement IT Act , 2000 , somewhat obsolete .',
 'He ominously added question government sought revisit : “ ‘ safe harbour ’ intermediary ? ” This acquires significance government working towards increasing compliance burden Internet intermediary , particular IT Rules 2021 later amendment .',
 'These Rules put onus social medium intermediary arbitrate content platform regulation weighted favour government day , invited legal appeal digital news medium platform among others questioned constitutionality Rules .',
 'Meanwhile , amendment October 2022 provided government-appointed committee adjudicate individual user ’ appeal moderation decision intermediary .',
 'In January 2023 , IT Ministry proposed amendment take social media/news content marked “ fake ” “ false ” Press Information Bureau government agency .',
 'These , sum , already put safe harbour protecti

# Bag of Words

In [56]:
sentence = "In formally outlining the crux of the proposed Digital India Act, 2023, the Minister of State, IT, Rajeev Chandrasekhar, made a case for a robust replacement of the IT Act, 2000, which is somewhat obsolete now. He ominously added a question that the government sought to revisit: “should there be a ‘safe harbour’ at all for all intermediaries?” This acquires significance as the government has been working towards increasing the compliance burden on Internet intermediaries, in particular in the IT Rules 2021 and its later amendments. These Rules themselves had put the onus on social media intermediaries to arbitrate on content on their platforms with regulations that were weighted in favour of the government of the day, and had invited legal appeals as digital news media platforms among others questioned the constitutionality of the Rules. Meanwhile, an amendment in October 2022 provided for government-appointed committees that will adjudicate on an individual user’s appeals against moderation decisions of these intermediaries. In January 2023, the IT Ministry proposed an amendment on the take down of social media/news content that has been marked as “fake” or “false” by the Press Information Bureau or any other government agency. These, in sum, had already put the safe harbour protections for intermediaries at much risk.Regulation of hate speech and disinformation on the Internet is a must and intermediaries, including digital news media and social media platforms, have an accountable role to play. The IT Rules’ specifications on giving users prior notice before removing content or disabling access, and for intermediaries to come up with periodic compliance reports are well taken. Social media intermediaries should not shut down users’ posts or communications except in the interests of public order and to avoid legal consequences. But care should be taken to ensure that requirements on intermediaries should not become needlessly onerous and punitive, which also vitiate the principle of safe harbour. There is a legitimate concern that the government is keener on regulating or taking down critical opinion or dissent in social media/news platforms than hate speech or disinformation, which in many cases has originated from representatives of the state. Safe harbour provisions, in particular Section 230 of the U.S. Communications Decency Act, 1996, that explicitly provided immunity to online services with respect to user-generated content had gone a long way in catalysing the Net’s development. While modern regulations to tackle issues related to misinformation, problematic content and the side effects of the new form of the Internet are a must, they should still retain first principles of safe harbour without whittling down their core."

### Pre-processing steps

In [59]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
my_review = nltk.sent_tokenize(sentence)

lemma = WordNetLemmatizer()
ps = PorterStemmer()

corpus = []

for i in range(len(my_review)):
    review = re.sub('[^a-zA-Z]',' ', my_review[i])
    review = review.lower()
    review = review.split()
    review = [lemma.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [60]:
corpus

['formally outlining crux proposed digital india act minister state rajeev chandrasekhar made case robust replacement act somewhat obsolete',
 'ominously added question government sought revisit safe harbour intermediary acquires significance government working towards increasing compliance burden internet intermediary particular rule later amendment',
 'rule put onus social medium intermediary arbitrate content platform regulation weighted favour government day invited legal appeal digital news medium platform among others questioned constitutionality rule',
 'meanwhile amendment october provided government appointed committee adjudicate individual user appeal moderation decision intermediary',
 'january ministry proposed amendment take social medium news content marked fake false press information bureau government agency',
 'sum already put safe harbour protection intermediary much risk regulation hate speech disinformation internet must intermediary including digital news medium so

# Building Bag of Words model

In [62]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500)
x = cv.fit_transform(corpus).toarray()

In [63]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 0]], dtype=int64)

In [65]:
import pandas as pd
df = pd.DataFrame(x)

In [67]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,162
0,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [68]:
df.shape

(12, 163)

In [None]:
# Bag of word 
# converting text to vector - 
    # binary bow
    # boolean bow
# creating more sparse matrix - zero value - dis-advantage
# we can not figure out which word is more significant - dis-advantage

# Text to Speech

In [84]:
# https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes

In [69]:
import os
os.getcwd()

'C:\\Users\\Lenovo\\Desktop\\AI&NLP_morning_8-10\\NLP'

In [1]:
#!pip install gTTS
# gTTS - google text to speech

Collecting gTTS
  Downloading gTTS-2.3.1-py3-none-any.whl (28 kB)
Installing collected packages: gTTS
Successfully installed gTTS-2.3.1


In [70]:
from gtts import gTTS

In [77]:
my_sentence = input('please enter your sentence here to convert into audio : \n>')

please enter your sentence here to convert into audio : 
>एक बार फिर से सजने वाला है इंडिया टुडे कॉन्क्लेव का वो मंच जिसका पूरा देश पूरी बेसब्री से इंतजार करता है. दिल्ली में 17 और 18 मार्च को इंडिया टुडे कॉन... https://www.aajtak.in/india/news/story/pm-narendra-modi-to-address-india-today-conclave-2023-in-delhi-ntc-1653190-2023-03-13


In [82]:
sentence = gTTS(text=my_sentence, lang='kn')

In [83]:
sentence.save('hindi_kannada_ascent.mp3')