In [12]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re
import spacy
from collections import Counter
from gtts import gTTS
import IPython

lang_model = spacy.load("en_core_web_sm")

In [13]:
#Load dataset into pandas dataframe
input_df = pd.read_csv('/Users/ejbeazleigh/Downloads/Assessment2Data/DataForInfoRetrievExtract/DatasetIMDB.csv')

#define stop words
#Stored as a set: the notebook only ever tests membership (`word in sw`), which is
#a constant-time lookup on a set instead of a scan over the whole stop-word list.
sw = set(stopwords.words('english'))

In [14]:
#Create Preprocessing function
#In summary, this code snippet applies a preprocessing function to clean and tokenize the text in the 'Text' column 
#of a dataframe, and stores the preprocessed results in a new column 'clean_txt'. 
#This preprocessing step is often performed to prepare text data for further analysis stages in this report
def preprocess(row):
    """Clean and tokenise the review text stored under ``row['Text']``.

    The text has every character that is not a letter, digit or whitespace
    replaced by a space, then all digits deleted. It is tokenised with NLTK's
    ``word_tokenize``, lower-cased, and tokens present in the module-level
    stop-word collection ``sw`` are dropped.

    Parameters
    ----------
    row : pandas.Series or mapping
        One dataframe row; must provide a 'Text' entry.

    Returns
    -------
    list of str
        The lower-cased, stop-word-free tokens (empty for a missing cell).
    """
    text = row['Text']
    # Coerce BEFORE the first substitution. The original only wrapped the
    # input of the second re.sub in str(), so a missing or non-string cell
    # had already raised TypeError by then.
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'[0-9]', '', text)
    # Lower-case first, then filter, exactly as before.
    lowered = (word.lower() for word in word_tokenize(text))
    return [word for word in lowered if word not in sw]

#Apply Preprocessing function to dataframe
input_df['clean_txt'] = input_df[['Text']].apply(preprocess, axis='columns')

In [4]:
import pandas as pd
#This function finds and counts every review whose text (the second column of the CSV file) contains
#a given movie name. Because the difference between a review and a mention has not been defined yet,
#the count is the sum total of the two.
# Load the dataset into a pandas DataFrame
df = pd.read_csv('/Users/ejbeazleigh/Downloads/Assessment2Data/DataForInfoRetrievExtract/DatasetIMDB.csv')

# Define a function to search for mentions of a movie in the review text
def search_movie(movie_title, reviews_df=None):
    """Return every review text that contains ``movie_title``, ignoring case.

    Parameters
    ----------
    movie_title : str
        Title (or any substring) to look for.
    reviews_df : pandas.DataFrame, optional
        Frame whose second column holds the review text. When omitted, the
        module-level ``df`` is searched, as before.

    Returns
    -------
    list
        The matching review texts, in row order.
    """
    frame = df if reviews_df is None else reviews_df
    # Lower-case the title once instead of once per row.
    needle = movie_title.lower()
    # Pick the second column by position explicitly. `row[1]` on a row with
    # string labels only worked through pandas' deprecated fallback that
    # treats integer keys as positions.
    texts = frame.iloc[:, 1]
    return [review for review in texts if needle in review.lower()]
# Report how many reviews mention each title of interest.
for movie_title in ('The Lion King', 'Star Wars', 'Starship Troopers'):
    movie_reviews = search_movie(movie_title)
    print(f"Found {len(movie_reviews)} reviews mentioning '{movie_title}':")

Found 44 reviews mentioning 'The Lion King':
Found 78 reviews mentioning 'Star Wars':
Found 18 reviews mentioning 'Starship Troopers':


In [15]:
#In summary, this code snippet demonstrates the creation of an inverted index, adding documents to the index,
#and using the index to retrieve mentions and actual reviews based on a query. 
#The inverted index facilitates efficient search and retrieval of documents based on the presence of specific words.

def inverted_index(words):
    """Map each distinct word to the list of positions where it occurs.

    Positions are 0-based indices into ``words``, listed in ascending order.
    Keys appear in order of first occurrence.
    """
    positions_by_word = {}
    for position, token in enumerate(words):
        if token in positions_by_word:
            positions_by_word[token].append(position)
        else:
            positions_by_word[token] = [position]
    return positions_by_word

def inverted_index_add(inverted, doc_id, doc_index):
    """Merge one document's word positions into the corpus-wide index.

    ``inverted`` maps word -> {doc_id: positions}. It is updated in place
    (an existing entry for ``doc_id`` is overwritten) and also returned.
    """
    for term, positions in doc_index.items():
        if term not in inverted:
            inverted[term] = {}
        inverted[term][doc_id] = positions
    return inverted

def get_reviews(query):
    """Split the reviews matching ``query`` into mentions and actual reviews.

    An inverted index is built over ``input_df['clean_txt']``. For each
    non-stop-word in the query, a review counts as an *actual* review when the
    word occurs more than once in it and as a *mention* when it occurs exactly
    once. The per-word results are then intersected across ALL query words.

    Parameters
    ----------
    query : str
        Film title typed by the user.

    Returns
    -------
    (list, list)
        ``(mentions, actual)``: two ascending lists of values from
        ``input_df['Id']``. Both are empty when the query has no usable words
        or a word does not occur in any review.
    """
    # word -> {row position: [token positions within that review]}
    inverted_main = {}
    for idx, words in enumerate(input_df['clean_txt'].values):
        inverted_main = inverted_index_add(inverted_main, idx, inverted_index(words))

    # Normalise the query the same way the indexed text was cleaned
    # (punctuation -> space, digits removed, lower-cased). Otherwise words
    # such as 'The' or "King's" can never match an index key.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', query)
    cleaned = re.sub(r'[0-9]', '', cleaned).lower()
    # split() without an argument also swallows repeated spaces.
    query_words = [word for word in cleaned.split() if word not in sw]
    if not query_words:
        return [], []

    # `idx` above is a row *position*, so read the IDs positionally rather
    # than through the dataframe's index labels.
    ids = input_df['Id'].values

    mention_sets = []
    actual_sets = []
    for word in query_words:
        mentions = set()
        actual = set()
        # Unknown words simply have no postings instead of raising KeyError.
        for row_pos, locations in inverted_main.get(word, {}).items():
            if len(locations) > 1:
                actual.add(ids[row_pos])
            else:
                mentions.add(ids[row_pos])
        mention_sets.append(mentions)
        actual_sets.append(actual)

    # Intersect over every query word. The original hard-coded elements
    # [0] and [1], which failed for one word and ignored any third word.
    mentions = sorted(set.intersection(*mention_sets))
    actual = sorted(set.intersection(*actual_sets))
    return mentions, actual

In [8]:
#The get_director function extracts director names from the reviews based on certain patterns 
#and part-of-speech tagging. It checks for phrases like 'directed by' and the standalone 
#word 'director' to identify the director's name in the review text. 
#The extracted director names are returned as a list.

def get_director(reviewids):
    """Extract candidate director names from the reviews with the given IDs.

    For each review, the two tokens after every "directed by" are taken as a
    name when at least one of them is tagged as a proper noun (NNP). While no
    name has been found in any review processed so far, the two tokens after
    the lowercase word "director" are tried under the same rule.

    Parameters
    ----------
    reviewids : iterable
        Values of ``input_df['Id']`` identifying the reviews to scan.

    Returns
    -------
    list of str
        "First Last" style strings in the order found (may hold duplicates).
    """
    found = []
    for reviewid in reviewids:
        review = input_df.loc[input_df['Id'] == reviewid, 'Text'].iloc[0]
        # Coerce before the first substitution so a non-string cell cannot
        # make re.sub raise.
        clean = re.sub(r'[^a-zA-Z0-9\s]', ' ', str(review))
        clean = re.sub(r'[0-9]', '', clean)
        clean = word_tokenize(clean)
        # POS-tag lazily and at most once per review; the original re-tagged
        # the whole review on every match.
        tagged = None

        # Stop three tokens early: a match needs tokens i+2 and i+3 to exist.
        for i in range(len(clean) - 3):
            if clean[i].lower() == 'directed' and clean[i+1].lower() == 'by':
                if tagged is None:
                    tagged = nltk.pos_tag(clean)
                # Compare the TAG of token i+3. The original compared the
                # whole (word, tag) tuple to 'NNP', which is never true.
                if tagged[i+2][1] == 'NNP' or tagged[i+3][1] == 'NNP':
                    found.append(f'{tagged[i+2][0]} {tagged[i+3][0]}')

        # Fallback pattern; `found` accumulates across reviews, so this only
        # runs until the first name has been collected from any review.
        if len(found) == 0:
            # Stop two tokens early: a match needs tokens i+1 and i+2.
            for i in range(len(clean) - 2):
                if clean[i] == 'director':
                    if tagged is None:
                        tagged = nltk.pos_tag(clean)
                    if tagged[i+1][1] == 'NNP' or tagged[i+2][1] == 'NNP':
                        found.append(f'{tagged[i+1][0]} {tagged[i+2][0]}')
    return found

In [16]:
#The get_avg function calculates the average rating of the reviews corresponding to the provided review IDs. 
#It determines whether the average rating is considered "good" or "bad" based on a threshold of 0.6.
#0.6 is a personal preference that I adopt when using IMDb to decide whether I want to watch a film or not
#The function returns a tuple containing the average rating and the corresponding label ('good' or 'bad').


def get_avg(reviewids, threshold=0.6):
    """Average the ratings of the given reviews and label the result.

    Parameters
    ----------
    reviewids : iterable
        Values of ``input_df['Id']`` selecting the reviews to average.
    threshold : float, optional
        The average must be strictly greater than this to be labelled 'good'.
        Defaults to 0.6.

    Returns
    -------
    (float, str)
        The mean of the 'Rating' column rounded to two decimals, and 'good' or
        'bad'. With no matching reviews the result is ``(nan, 'bad')``.
    """
    reviewsdf = input_df[input_df['Id'].isin(reviewids)]
    # Nothing to average: return the same (nan, 'bad') the original produced,
    # without numpy's mean-of-empty RuntimeWarning.
    if reviewsdf.empty:
        return float('nan'), 'bad'
    avg = round(reviewsdf['Rating'].values.mean(), 2)
    return avg, ('good' if avg > threshold else 'bad')

In [21]:
#The below code takes user input for a film name, retrieves relevant information about the film from the dataset, 
#and presents the information both as a dataframe and as speech in an audio format
# Ask for a title, then gather the matching IDs, director names and rating.
film_name = input('Enter the name of a film: ')
mentions, reviews = get_reviews(film_name)
directors = get_director(reviews)
avg_score, goodbad = get_avg(reviews)

# Show everything as a one-row table; each value is stored as its string form.
pd.set_option('max_colwidth', None)  # keep the long ID lists from being truncated
outputdf = pd.DataFrame(columns=['Director of film','Review IDs','Mention IDs','Average rating (0.0 - 1.0)'])
outputdf.loc[len(outputdf)] = [str(value) for value in (directors, reviews, mentions, avg_score)]
display(outputdf)


# Build the spoken summary, save it as an mp3 file and play it back.
speech = f'''You have requested information about {film_name}, the directors of this movie are {' and '.join(directors)}, there is a total
of {len(reviews)} reviews of this movie in the database, there are {len(mentions)} mentions of this movie in other movies
reviews. The average review score for this movie is {avg_score}, which makes it a {goodbad} recommendation to watch'''
filename = f'TTS {film_name}.mp3'
tts = gTTS(speech, lang='en', tld='co.uk', slow=False)
tts.save(filename)
IPython.display.Audio(filename)

Enter the name of a film: the lion king


Unnamed: 0,Director of film,Review IDs,Mention IDs,Average rating (0.0 - 1.0)
0,"['Sam Weisman', 'Rob Minkoff']","[2261, 2661, 2670, 3558, 3568, 3768, 3792, 4464, 5513, 8103, 8264, 9751, 15087, 22231]","[2262, 2727, 2758, 3098, 3532, 3535, 4499, 4981, 5369, 5691, 5790, 5841, 6121, 7836, 8287, 9609, 9837, 10023, 10116, 12090, 12936, 12950, 13316, 18866, 18890, 23535, 24533, 25950, 28395]",0.69


In [22]:
# Ask for a title, then gather the matching IDs, director names and rating.
film_name = input('Enter the name of a film: ')
mentions, reviews = get_reviews(film_name)
directors = get_director(reviews)
avg_score, goodbad = get_avg(reviews)

# Show everything as a one-row table; each value is stored as its string form.
pd.set_option('max_colwidth', None)  # keep the long ID lists from being truncated
outputdf = pd.DataFrame(columns=['Director of film','Review IDs','Mention IDs','Average rating (0.0 - 1.0)'])
outputdf.loc[len(outputdf)] = [str(value) for value in (directors, reviews, mentions, avg_score)]
display(outputdf)


# Build the spoken summary, save it as an mp3 file and play it back.
speech = f'''You have requested information about {film_name}, the directors of this movie are {' and '.join(directors)}, there is a total
of {len(reviews)} reviews of this movie in the database, there are {len(mentions)} mentions of this movie in other movies
reviews. The average review score for this movie is {avg_score}, which makes it a {goodbad} recommendation to watch'''
filename = f'TTS {film_name}.mp3'
tts = gTTS(speech, lang='en', tld='co.uk', slow=False)
tts.save(filename)
IPython.display.Audio(filename)

Enter the name of a film: star wars


Unnamed: 0,Director of film,Review IDs,Mention IDs,Average rating (0.0 - 1.0)
0,"['Richard Marquand', 'George Lucas']","[5049, 5055, 5100, 5592, 6782, 6829, 6949, 6979, 7065, 7114, 7212, 7309, 7533, 7536, 13536, 13636, 18265, 18433, 18485, 25766]","[2035, 2582, 2985, 3742, 4125, 4486, 5542, 5616, 6348, 6417, 6971, 7015, 7163, 7624, 7676, 9638, 10230, 11025, 11468, 11641, 12057, 12547, 12602, 12890, 14953, 15031, 15420, 16619, 17316, 18779, 21544, 21610, 22584, 22991, 24553, 24561, 28787, 29024, 29716]",0.74


In [23]:
# Ask for a title, then gather the matching IDs, director names and rating.
film_name = input('Enter the name of a film: ')
mentions, reviews = get_reviews(film_name)
directors = get_director(reviews)
avg_score, goodbad = get_avg(reviews)

# Show everything as a one-row table; each value is stored as its string form.
pd.set_option('max_colwidth', None)  # keep the long ID lists from being truncated
outputdf = pd.DataFrame(columns=['Director of film','Review IDs','Mention IDs','Average rating (0.0 - 1.0)'])
outputdf.loc[len(outputdf)] = [str(value) for value in (directors, reviews, mentions, avg_score)]
display(outputdf)


# Build the spoken summary, save it as an mp3 file and play it back.
speech = f'''You have requested information about {film_name}, the director of this movie is {' and '.join(directors)}, there is a total
of {len(reviews)} reviews of this movie in the database, there are {len(mentions)} mentions of this movie in other movies
reviews. The average review score for this movie is {avg_score}, which makes it a {goodbad} recommendation to watch'''
filename = f'TTS {film_name}.mp3'
tts = gTTS(speech, lang='en', tld='co.uk', slow=False)
tts.save(filename)
IPython.display.Audio(filename)

Enter the name of a film: starship troopers


Unnamed: 0,Director of film,Review IDs,Mention IDs,Average rating (0.0 - 1.0)
0,['Paul Verhoeven'],"[9654, 9671, 9673, 11595, 26255]","[9915, 10859, 11468, 11808, 11987, 12012, 12224, 13316, 13629, 14580, 17219, 22372, 25567]",0.66
