In [6]:
#Amazon_reviews sentiment prediction
import pandas as pd

# Load the raw Amazon reviews dataset from the CSV in the working directory.
REVIEWS_CSV_PATH = 'Amazon_reviews.csv'
reviews_df = pd.read_csv(REVIEWS_CSV_PATH)


In [14]:
#Every word is a token
#Tokenize - divide the whole text into tokens (words)

import nltk
from nltk.tokenize import word_tokenize 

# Tokenize a sample sentence; punctuation marks come back as separate tokens.
sample_sentence = 'the rain is dropping slowly. do you observe ?'
word_tokenize(sample_sentence)

['the', 'rain', 'is', 'dropping', 'slowly', '.', 'do', 'you', 'observe', '?']

In [29]:
#word_tokenize also returns punctuation such as '?', so use a regular-expression tokenizer instead
from nltk.tokenize import RegexpTokenizer

#Regular expressions are used to fetch data that matches a given pattern

import re

sent = 'India got independence on 15-08-1947 and constitution came into effect on 26-01-1950'

# Dates in the sentence have the shape xx-xx-xxxx: 1-2 digits, 1-2 digits, then 4 digits.
date_pattern = re.compile(r'\d{1,2}-\d{1,2}-\d{4}')
date_pattern.findall(sent)

#So lets use Regular expression as tokenizer instead of word_tokenizer



['15-08-1947', '26-01-1950']

In [31]:
# \w+ matches each run of one or more word characters, so punctuation
# such as '.' and '?' is never emitted as a token.
WORD_PATTERN = r'\w+'
tokenizer = RegexpTokenizer(WORD_PATTERN)

tokenizer.tokenize('the rain is dropping slowly. do you observe ?')

['the', 'rain', 'is', 'dropping', 'slowly', 'do', 'you', 'observe']

In [58]:
#remove stopwords.
from nltk.corpus import stopwords

# NLTK's English stop-word list, extended with the extra word 'much'.
sw_eng = [*stopwords.words('english'), 'much']


In [45]:
#Lemmatization

from nltk.stem import WordNetLemmatizer,PorterStemmer

lemmatizer, stemmer = WordNetLemmatizer(), PorterStemmer()

# Stemmer --> Raining = Rain, Flying = Fly. It does not always work as well as
#             WordNetLemmatizer, e.g. for irregular forms such as cactus / cacti.
# Lemmatizer --> knowledge based, as it works from WordNet (a kind of dictionary).

lemmatizer.lemmatize('seeing', pos='v')

'see'

In [46]:
#So we have to perform 3 steps for all reviews
# 1. tokenizer
# 2. Remove stopwords
# 3. Lemmatization

In [67]:
def preprocessing(review):
    """Turn one raw review into a space-separated string of cleaned tokens.

    Steps, in order: tokenize with the module-level ``tokenizer``, lower-case
    every token, drop tokens listed in ``sw_eng``, lemmatize the rest as verbs
    (``pos='v'``) with ``lemmatizer``, and join the result with single spaces.

    Args:
        review: Raw review text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) are treated as an empty
            review instead of raising inside the tokenizer.

    Returns:
        str: The cleaned review, or ``''`` when the input is not text or no
        token survives stop-word removal.
    """
    if not isinstance(review, str):
        # The tokenizer expects text; a missing cell would otherwise crash .apply().
        return ''

    # Set membership is O(1); the stop-word list would be scanned once per token.
    stop_words = set(sw_eng)

    # Lower-case once per token and reuse it for both the filter and the lemmatizer.
    lowered_tokens = (token.lower() for token in tokenizer.tokenize(review))
    lemma_tokens = [
        lemmatizer.lemmatize(token, pos='v')
        for token in lowered_tokens
        if token not in stop_words
    ]

    return ' '.join(lemma_tokens)

#preprocessing('Inflation has much higher rates')

In [79]:
#let's apply the above logic to all rows of the reviews CSV
#reviews_df['Review'].apply(preprocessing)  #---> Review is the column name in the CSV

In [72]:
#Train test split -- before that, separate the output column from the features
# DataFrame.pop removes 'Label' from reviews_df and returns it in a single step.
# The guard keeps this cell re-runnable: on a second run 'Label' is already gone,
# so the previously extracted y is kept instead of raising KeyError.
if 'Label' in reviews_df.columns:
    y = reviews_df.pop('Label')
elif 'y' not in globals():
    # Fresh kernel and no 'Label' column: fail here rather than later with a NameError.
    raise KeyError('Label')

In [78]:
#Train test split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    reviews_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [85]:
#apply preprocessing for all rows
import warnings
warnings.filterwarnings('ignore')

# assign() returns a new DataFrame, so the new column is never written into what
# may be a slice of reviews_df (the pattern behind pandas' SettingWithCopyWarning).
# Re-running the cell simply overwrites the column with the same values.
x_train = x_train.assign(preprocessed_text=x_train['Review'].apply(preprocessing))
x_test = x_test.assign(preprocessed_text=x_test['Review'].apply(preprocessing))

In [87]:
#Since pre_procesisng is done. before feeding the data to model, Build TFIDF matrix

from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer created with its default settings (no arguments passed).
tfidf=TfidfVectorizer()


In [97]:
# Fit the vectorizer on the training text only, then reuse that fitted
# vectorizer to transform the test text.
train_text = x_train['preprocessed_text']
test_text = x_test['preprocessed_text']

x_train_tfidf = tfidf.fit_transform(train_text)
x_test_tfidf = tfidf.transform(test_text)

#tfidf.get_feature_names_out()  --> to check token names (word format); older scikit-learn versions use get_feature_names()
#pd.DataFrame(x_train_tfidf.toarray())  --> most of the values are 0, as not all the terms are present in all the docs
#list(pd.DataFrame(x_train_tfidf.toarray()).iloc[0]) --> to check tfidf values for 1 row (document) based on its position; [0] alone would select column 0

In [105]:
#Naive Bayes model - relies on Bayes' theorem. Assumes all columns (features) are independent of each other - so let's go with naive Bayes
# - Multinomial naive Bayes --> go for it, since the data is text represented as tfidf values

from sklearn.naive_bayes import MultinomialNB

# Fit the classifier on the training TF-IDF matrix (fit() returns the estimator
# itself), then predict labels for the test TF-IDF matrix.
mnb = MultinomialNB().fit(x_train_tfidf, y_train)
mnb_pred = mnb.predict(x_test_tfidf)

In [106]:
#Confusion matrix

from sklearn.metrics import confusion_matrix,precision_score,recall_score

# Cross-tabulate the true test labels against the Naive Bayes predictions.
confusion_matrix(y_test,mnb_pred)

array([[ 9,  6],
       [ 2, 23]], dtype=int64)

In [107]:
# Recall: of the reviews that are actually positive, the fraction predicted positive.
recall_score(y_test,mnb_pred)

0.92

In [108]:
# Precision: of the reviews predicted positive, the fraction that are actually positive.
precision_score(y_test,mnb_pred)

0.7931034482758621