### Natural Language Processing (NLP)

In [4]:
import pandas as pd
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS,CountVectorizer,TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier

In [8]:
# Toy review corpus. The text is deliberately noisy (digits, symbols,
# mixed case) so that the effect of the cleaning step is visible.
corpus = [
    'f5o@od is # good & good!',
    '& Food # is * tasty',
    'Quality is Good',
    'food is not good',
    'servi89ce is Poor poor means very poor',
    'it is to_o costly',
    'che^ap quality',
]
# Keep the individual document names bound too; other cells rebuild the
# corpus list from doc1..doc7.
doc1, doc2, doc3, doc4, doc5, doc6, doc7 = corpus
# One sentiment label per document, in corpus order.
target = ['pos'] * 3 + ['neg'] * 4
print(corpus)

['f5o@od is # good & good!', '& Food # is * tasty', 'Quality is Good', 'food is not good', 'servi89ce is Poor poor means very poor', 'it is to_o costly', 'che^ap quality']


## Text Preprocessing

In [25]:
def cleaning(doc, stop_words=None):
    """Normalise one document for bag-of-words vectorisation.

    The text is lower-cased, every character other than ``a-z`` and the
    space is deleted (deleted, not replaced by a space, so ``'f5o@od'``
    becomes ``'food'``), and stop words are dropped.

    Parameters
    ----------
    doc : str
        Raw document text.
    stop_words : iterable of str, optional
        Words to discard. When omitted, scikit-learn's
        ``ENGLISH_STOP_WORDS`` is used with ``'not'`` kept, because the
        negation carries the sentiment ("not good").

    Returns
    -------
    str
        The surviving words joined by single spaces; ``''`` if none survive.
    """
    text = re.sub('[^a-z ]', '', doc.lower())
    if stop_words is None:
        # A set gives O(1) membership tests; 'not' is deliberately retained.
        stop_words = set(ENGLISH_STOP_WORDS) - {'not'}
    else:
        stop_words = set(stop_words)
    # split() with no argument also collapses the runs of spaces that are
    # left behind where symbols were removed.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every document, then build the bag-of-words count matrix.
corpus1=list(map(cleaning,corpus))
cv=CountVectorizer()
X=cv.fit_transform(corpus1)
print(cv.get_feature_names_out())
X1=X.toarray()
# Random forests are stochastic (bootstrap samples, random feature subsets);
# fixing the seed makes the predictions and probabilities shown below
# reproducible across kernel restarts.
model=RandomForestClassifier(random_state=42)
model.fit(X1,target)

['cheap' 'costly' 'food' 'good' 'means' 'not' 'poor' 'quality' 'service'
 'tasty']


In [37]:
# Score two unseen reviews with the already-fitted vectorizer and model.
sample1 = 'Food quality is not good$'
sample2 = 'awesome food'
corpus = [sample1, sample2]
corpus1 = [cleaning(text) for text in corpus]
# transform (not fit_transform): reuse the vocabulary learned during fitting
corpus2 = cv.transform(corpus1).toarray()
print(model.predict(corpus2))
print(model.predict_proba(corpus2))

['neg' 'pos']
[[0.67 0.33]
 [0.42 0.58]]


In [41]:
# Document-frequency filtering: with an integer max_df, terms appearing in
# more than that many documents are dropped from the vocabulary.
corpus = [doc1, doc2, doc3, doc4, doc5, doc6, doc7]
corpus1 = [cleaning(text) for text in corpus]
cv = CountVectorizer(max_df=2, min_df=1, max_features=None)
X = cv.fit_transform(corpus1)
print(cv.get_feature_names_out())

['cheap' 'costly' 'means' 'not' 'poor' 'quality' 'service' 'tasty']


In [47]:
# ngram_range=(1, 2): the vocabulary holds single words and adjacent word pairs.
corpus = [doc1, doc2, doc3, doc4, doc5, doc6, doc7]
corpus1 = [cleaning(text) for text in corpus]
cv = CountVectorizer(ngram_range=(1, 2))
X = cv.fit_transform(corpus1)
print(cv.get_feature_names_out())

['cheap' 'cheap quality' 'costly' 'food' 'food good' 'food not'
 'food tasty' 'good' 'good good' 'means' 'means poor' 'not' 'not good'
 'poor' 'poor means' 'poor poor' 'quality' 'quality good' 'service'
 'service poor' 'tasty']


In [51]:
# binary=True: every non-zero count is recorded as 1 (presence/absence).
corpus = [doc1, doc2, doc3, doc4, doc5, doc6, doc7]
corpus1 = [cleaning(text) for text in corpus]
cv = CountVectorizer(binary=True)
X = cv.fit_transform(corpus1).toarray()
print(X)

[[0 0 1 1 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 1]
 [0 0 0 1 0 0 0 1 0 0]
 [0 0 1 1 0 1 0 0 0 0]
 [0 0 0 0 1 0 1 0 1 0]
 [0 1 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 1 0 0]]


In [53]:
# Let CountVectorizer do its own lower-casing and stop-word removal on the
# raw documents (no custom cleaning), to compare against the cleaned corpus.
corpus=[doc1,doc2,doc3,doc4,doc5,doc6,doc7]
# `sp` is not defined at notebook scope by any other cell, so build it here:
# scikit-learn's English stop words with 'not' kept as a feature.
sp=list(ENGLISH_STOP_WORDS)
sp.remove('not')
cv=CountVectorizer(lowercase=True,stop_words=sp)
X=cv.fit_transform(corpus)
print(cv.get_feature_names_out())

['ap' 'che' 'costly' 'f5o' 'food' 'good' 'means' 'not' 'od' 'poor'
 'quality' 'servi89ce' 'tasty' 'to_o']


In [65]:
# TF-IDF weighting of the cleaned corpus.
corpus=[doc1,doc2,doc3,doc4,doc5,doc6,doc7]
corpus1=list(map(cleaning,corpus))
tf=TfidfVectorizer()
X=tf.fit_transform(corpus1)
# Column labels must come from the vectorizer that produced X (tf); `cv` is
# whichever CountVectorizer happened to be fitted last and may not match.
print(tf.get_feature_names_out())
print(X.toarray())

['cheap' 'costly' 'food' 'good' 'means' 'not' 'poor' 'quality' 'service'
 'tasty']
[[0.         0.         0.4472136  0.89442719 0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.57866699 0.         0.         0.
  0.         0.         0.         0.81556393]
 [0.         0.         0.         0.64974959 0.         0.
  0.         0.76014832 0.         0.        ]
 [0.         0.         0.5008545  0.5008545  0.         0.70589627
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.30151134 0.
  0.90453403 0.         0.30151134 0.        ]
 [0.         1.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.76944876 0.         0.         0.         0.         0.
  0.         0.63870855 0.         0.        ]]


In [67]:
# Plain term counts on the cleaned corpus, for comparison with TF-IDF.
corpus = [doc1, doc2, doc3, doc4, doc5, doc6, doc7]
corpus1 = [cleaning(text) for text in corpus]
cv = CountVectorizer()
X = cv.fit_transform(corpus1)
print(cv.get_feature_names_out())

['cheap' 'costly' 'food' 'good' 'means' 'not' 'poor' 'quality' 'service'
 'tasty']


In [69]:
# Dense view of the fitted matrix X so the individual entries can be read.
print(X.toarray())

[[0 0 1 2 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 1]
 [0 0 0 1 0 0 0 1 0 0]
 [0 0 1 1 0 1 0 0 0 0]
 [0 0 0 0 1 0 3 0 1 0]
 [0 1 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 1 0 0]]


In [71]:
# Show the cleaned documents (output of cleaning() for each corpus entry).
print(corpus1)

['food good good', 'food tasty', 'quality good', 'food not good', 'service poor poor means poor', 'costly', 'cheap quality']


## Calculating TF-IDF using the formula (Term Frequency - Inverse Document Frequency)

Working of TfidfVectorizer:(Term Frequency Inverse Document Frequency)
----------------------------------------------------------------------

Example:
doc1='food is # good! _@ 2019'
doc2='& Food # is * tasty'
doc3='quality is Good'
doc4='service is Poor poor means very poor'
doc5='it is too costly'
doc6='cheap quality'


step-1:change all documents in lower case.


step-2:remove punctuation characters from all docs.


step-3:remove all single letter words.


step-4:if stop_words argument is provided,remove all
stop words from all docs.


step-5:collect unique words from corpus



step-6:arrange these words in natural (alphabetical) order
2019,cheap,costly,food,good,means,poor,quality,service,tasty

these are feature names.

step-7:for each word find out its term frequency and inverse document frequency


Term Frequency(tf) = frequency of word in particular document

Document Frequency (df) = for a term t, the number of documents in which t appears

Idf = ln [ (1 + n) / (1 + df(t)) ] + 1

here,
ln -> natural log(i.e log with base e)

n->no of documents


step-8:multiply tf and idf of this word 

	score=tf*idf

step-9:normalize this score by using l2 normalization(Unit Euclidean formula)

	norm_score(ns) = score / sqrt(sum of the squares of all scores in the document's vector)




In [106]:
import math

# Worked example by hand: a term with term frequency 1 that appears in
# 3 of the 7 documents.
tf = 1
df = 3
# Smoothed idf: ln((1 + n) / (1 + df)) + 1, with n = 7 documents.
idf = 1 + math.log((7 + 1) / (df + 1))
score = idf * tf
# Squared score, needed later for the L2 norm.
score1 = score ** 2

In [108]:
# Second term of the same document: term frequency 2, same idf as before.
tf2 = 2
score2 = idf * tf2
score3 = score2 ** 2

In [110]:
# Sum of the squared raw scores of the document's two terms.
new_score = score3 + score1
new_score

14.33373687519046

In [112]:
# Euclidean (L2) norm of the raw score vector: square root of the summed squares.
sqrt=math.sqrt(new_score)
sqrt

3.7859921916441484

In [118]:
# L2-normalise: divide each raw score by the vector norm.
ns = score / sqrt    # normalised score of 'food' (tf = 1)
ns1 = score2 / sqrt  # normalised score of 'good' (tf = 2)
print(ns, ns1, sep='\n')

0.4472135954999579
0.8944271909999159


In [120]:
# TF-IDF with binary=True: term frequency is treated as 0/1 before the idf
# weighting and normalisation are applied.
corpus=[doc1,doc2,doc3,doc4,doc5,doc6,doc7]
corpus1=list(map(cleaning,corpus))
tf=TfidfVectorizer(binary=True)
X=tf.fit_transform(corpus1)
# Take the column labels from the vectorizer that produced X (tf), not from
# an unrelated CountVectorizer left over in `cv`.
print(tf.get_feature_names_out())
print(X.toarray())

['cheap' 'costly' 'food' 'good' 'means' 'not' 'poor' 'quality' 'service'
 'tasty']
[[0.         0.         0.70710678 0.70710678 0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.57866699 0.         0.         0.
  0.         0.         0.         0.81556393]
 [0.         0.         0.         0.64974959 0.         0.
  0.         0.76014832 0.         0.        ]
 [0.         0.         0.5008545  0.5008545  0.         0.70589627
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.57735027 0.        ]
 [0.         1.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.76944876 0.         0.         0.         0.         0.
  0.         0.63870855 0.         0.        ]]


## Sentiment analysis (restaurant dataset)

In [135]:
# Load the tab-separated restaurant reviews file.
# NOTE(review): absolute, machine-specific path - the notebook will not run
# elsewhere until this points at a local copy of the dataset.
df = pd.read_csv(
    filepath_or_buffer='G:/dataset/sentiment/Restaurant_Reviews.txt',
    delimiter='\t',
)
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [143]:
# Review text becomes the corpus; the Liked column is the label.
corpus, target = df['Review'], df['Liked']

In [153]:
# Clean every review and build the bag-of-words count matrix.
corpus1 = [cleaning(review) for review in corpus]
cv = CountVectorizer()
X = cv.fit_transform(corpus1)
# Dense copy of the counts, used to fit the classifier.
X1 = X.toarray()
print(cv.get_feature_names_out())
print(len(cv.get_feature_names_out()))

['absolute' 'absolutely' 'absolutley' ... 'yum' 'yummy' 'zero']
1832


In [155]:
# Fit a 100-tree random forest on the review counts.
# random_state pins the bootstrap / feature sampling so that the predictions
# and probabilities reported below are reproducible on Restart & Run All.
model=RandomForestClassifier(n_estimators=100,random_state=42)
model.fit(X1,target)

In [157]:
# Sanity-check the trained model on two hand-written reviews.
sample1 = 'Food quality is not good$'
sample2 = 'awesome food'
corpus_test = [sample1, sample2]
corpus_test_new = [cleaning(text) for text in corpus_test]
# transform only: keep the vocabulary learned from the training reviews
X_test = cv.transform(corpus_test_new)
print(model.predict(X_test))
print(model.predict_proba(X_test))

[0 1]
[[0.73 0.27]
 [0.02 0.98]]


In [169]:
# Classify a review typed in by the user.
sample = input("Enter Your Valuable Feedback:- \n")
corpus_test = [sample]
corpus_test_new = [cleaning(text) for text in corpus_test]
X_test = cv.transform(corpus_test_new)
print(model.predict(X_test))
pred = model.predict_proba(X_test)
print(pred)
# pred[0][0] is the probability of the first class in model.classes_
# (label 0, i.e. not liked, for this dataset's 0/1 labels).
print("You Don't Like Restaurant" if pred[0][0] >= 0.50 else "You Liked Restaurant")

Enter Your Valuable Feedback:- 
 food is not good


[0]
[[0.78 0.22]]
You Don't Like Restaurant
