## Data Processing, Part 4

Data source: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [366]:
import pandas as pd
import numpy as np
import nltk

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score, accuracy_score
import pickle

In [368]:
# Make sure the NLTK stop-word corpus is available locally; it is read by stopwords.words('english') below.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aruns\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [392]:
# Load the IMDB review dataset (columns used below: 'review' text and 'sentiment' label).
dataset = pd.read_csv('./data/imdb_reviews.csv')

In [393]:
# Preview the first five rows.
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [396]:
# Binary target: 1 where sentiment is 'positive', 0 for every other value.
is_positive = dataset['sentiment'].eq('positive')
dataset['y'] = np.where(is_positive, 1, 0)

In [398]:
# Show column dtypes and non-null counts after adding the numeric target 'y'.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
 2   y          50000 non-null  int32 
dtypes: int32(1), object(2)
memory usage: 976.7+ KB


In [400]:
# English stop words from NLTK, de-duplicated into a set.
english_stopwords = stopwords.words('english')
stopset = set(english_stopwords)

In [402]:
# TF-IDF text features: lowercase the text, fold accented characters to ASCII,
# and drop the English stop words collected above.
vectorizer = TfidfVectorizer(
    stop_words=list(stopset),
    strip_accents='ascii',
    lowercase=True,
    use_idf=True,
)

In [404]:
# Learn the TF-IDF vocabulary on every review and encode the whole corpus.
# NOTE(review): the vectorizer is fitted before the train/test split below, so the
# test rows contribute to the vocabulary/IDF weights and the reported accuracy may
# be slightly optimistic.
X = vectorizer.fit_transform(dataset.review)
y = dataset.y
# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed instead of leaving the handle to the garbage collector.
with open('transform.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [406]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [408]:
# Multinomial Naive Bayes classifier trained on the TF-IDF training split.
clf = naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)

In [410]:
# Accuracy on the held-out split, expressed as a percentage.
y_pred = clf.predict(X_test)
100 * accuracy_score(y_test, y_pred)

86.59

In [412]:
# we will be using this model for sentiment analysis
# The context manager closes the file deterministically so the pickle is fully
# written to disk before anything tries to load it.
with open('nlp_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

## Testing our content-based similar-movie recommendation solution

In [417]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def create_similarity(path='./main_data.csv'):
    """Load the movie table and build its pairwise cosine-similarity matrix.

    Parameters
    ----------
    path : str, default './main_data.csv'
        CSV file to read. It must contain a text column named 'comb', which is
        tokenised into a bag-of-words count matrix.

    Returns
    -------
    data : pandas.DataFrame
        The frame read from ``path``.
    similarity : array
        Square matrix in which entry ``[i, j]`` is the cosine similarity between
        the word counts of row ``i`` and row ``j`` of ``data['comb']``.
    """
    data = pd.read_csv(path)
    # creating a count matrix
    count_matrix = CountVectorizer().fit_transform(data['comb'])
    # creating a similarity score matrix
    similarity = cosine_similarity(count_matrix)
    return data, similarity

In [419]:
# Load the movie table together with its row-by-row similarity matrix.
data, similarity = create_similarity()

In [421]:
m = 'a quiet place'
# `similarity` is addressed by row position, so resolve the title to a position
# (not an index label) to stay correct even if `data` has a non-default index.
matches = np.flatnonzero((data['movie_title'] == m).to_numpy())
if matches.size == 0:
    raise IndexError("movie title {!r} not found in data['movie_title']".format(m))
i = int(matches[0])
# Rank every *other* movie by similarity to the requested one, most similar first.
# The requested row is excluded explicitly rather than assuming it sorts first,
# which is not guaranteed when another row has an identical (or tied) score.
lst = sorted(
    ((j, score) for j, score in enumerate(similarity[i]) if j != i),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]
# Map the ten best positions back to titles.
l = [data['movie_title'].iloc[j] for j, _ in lst]

print(l)

['a quiet place part ii', 'animal crackers', 'wild mountain thyme', 'midnight cowboy', 'breach', 'open secret', 'hide and seek', 'needle in a timestack', 'resident evil: welcome to raccoon city', 'amigo']


## Test our Sentiment prediction model

In [424]:
# Single hand-written review used as a sanity check.
# Note: this rebinds X, which previously held the TF-IDF matrix of the full corpus.
X = ['john wick is one of my greatest movie']

In [426]:
# Encode with the already-fitted vectorizer (transform only, no re-fitting).
Xt = vectorizer.transform(X)

In [428]:
# Predicted label for the single sample: 1 = positive, 0 = negative.
clf.predict(Xt)[0]

1

In [430]:
# Second hand-written review, worded negatively, to check the opposite class.
X = ['john wick movie was horrible']

In [432]:
# Encode with the already-fitted vectorizer (transform only, no re-fitting).
Xt = vectorizer.transform(X)

In [434]:
# Predicted label for the single sample: 1 = positive, 0 = negative.
clf.predict(Xt)[0]

0

## Test Rotten Tomatoes critic review web scraping

In [437]:
import bs4 as bs
import urllib.request
import pandas as pd
import numpy as np
import requests
import pickle

In [439]:
# load the nlp model and tfidf vectorizer from disk
# NOTE: pickle.load can execute arbitrary code; only load these files when they
# are the ones written by the pickle.dump cells earlier in this notebook.
filename = 'nlp_model.pkl'
with open(filename, 'rb') as model_file:
    clf = pickle.load(model_file)
with open('transform.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)


# web scraping to get top-critic reviews from the Rotten Tomatoes site
title = 'anaconda'
url = 'https://www.rottentomatoes.com/m/{}/reviews?type=top_critics'.format(title)
# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen(url) as response:
    sauce = response.read()
soup = bs.BeautifulSoup(sauce,'lxml')
# Each review on the page is expected inside a <div class="review-text-container">.
soup_result = soup.find_all("div",{"class":"review-text-container"})

In [441]:
reviews_list = [] # list of reviews
reviews_status = [] # list of comments (good or bad)
for review_container in soup_result:
    review_tag = review_container.find("p", {"class":"review-text"})
    if review_tag is None:
        # Container without the expected <p>: skip it instead of failing with
        # AttributeError on None.text.
        continue
    review_text = review_tag.text.strip()
    if review_text:
        reviews_list.append(review_text)
        # passing the review to our model
        movie_vector = vectorizer.transform([review_text]) # Naive Bayes Sentiment Prediction model
        pred = clf.predict(movie_vector)
        # pred is a one-element array; read the label explicitly (1 = positive).
        reviews_status.append('Good' if pred[0] else 'Bad')

# combining reviews and comments into a dictionary
# (identical review texts collapse into a single key, the last status wins)
movie_reviews = dict(zip(reviews_list, reviews_status))

In [443]:
# Display the mapping of review text -> 'Good'/'Bad' for the critic reviews.
movie_reviews

{'The monster looks like a maniacal garden hose in a couple of sequences. Still, it delivers the necessary thrills and chills.': 'Bad',
 'Anaconda, directed by Luis Llosa with all of the subtlety of a snake-oil salesman, is in the great tradition of cinematic cheese, as processed as Kraft Singles slices.': 'Bad',
 'A silly and plodding Jaws rip-off about a 40-foot man-eating snake on the prowl in the Brazilian rain forest.': 'Bad',
 "One never questions the realism of the remarkable animatronic and computer-generated effects, but it's hard to credit a snake that screams.": 'Good',
 'Anaconda is about a snake that eats everybody. That about says it all.': 'Bad',
 'Anaconda is such a classic combination of feckless dramaturgy and rampant excess that giving way to giggles is the only sane response.': 'Bad',
 'Charmless, unfrightening, and even devoid of the requisite gratuitous nudity, Anaconda just plain bites.': 'Bad',
 "It's a slick, scary, funny Creature Feature, beautifully photograp

In [453]:

# web scraping to get audience (user) reviews from the Rotten Tomatoes site
title = 'the_marvels'
url = 'https://www.rottentomatoes.com/m/{}/reviews?type=user'.format(title)
# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen(url) as response:
    sauce = response.read()
soup = bs.BeautifulSoup(sauce,'lxml')
# Each review on the page is expected inside a <div class="review-text-container">.
soup_result = soup.find_all("div",{"class":"review-text-container"})

reviews_list = [] # list of reviews
reviews_status = [] # list of comments (good or bad)
for review_container in soup_result:
    review_tag = review_container.find("p", {"class":"audience-reviews__review js-review-text"})
    if review_tag is None:
        # Container without the expected <p>: skip it instead of failing with
        # AttributeError on None.text.
        continue
    review_text = review_tag.text.strip()
    if review_text:
        reviews_list.append(review_text)
        # passing the review to our model
        movie_vector = vectorizer.transform([review_text]) # Naive Bayes Sentiment Prediction model
        pred = clf.predict(movie_vector)
        # pred is a one-element array; read the label explicitly (1 = positive).
        reviews_status.append('Good' if pred[0] else 'Bad')

# combining reviews and comments into a dictionary
# (identical review texts collapse into a single key, the last status wins)
movie_reviews = dict(zip(reviews_list, reviews_status))

In [455]:
# Display the mapping of review text -> 'Good'/'Bad' for the audience reviews.
movie_reviews

{'I thought I\'d give this a chance. I didn\'t find it as cringe as some say but instead found it rather bland and boring. I once heard someone describing bad storey telling as "and then" stories, The Marvels is this type of storey telling.': 'Bad',
 'One of the better recent marvel movies and actually good. Loved Kamala in this movie.': 'Good',
 'the end credit scenes were more important than the actual plot. nothing important happened here.': 'Bad',
 "A fun space movie, in general.\n\nI would agree with the issues of tonal shifts, and for me it even had the character inconsistencies of Carol (Captain Marvel, Avengers) and Monica Rambeau (Wandavision), the two just felt different in this movie. Maybe it's really an issue of the tonal shifts.\n\nBut the action sequences had me on choke hold lmao. The action especially the parts when they switch places still leaves me in awe.\n\nI watched this on a regular cinema when it came out, and recently, again, on Disney+.": 'Good',
 'Not rally a