In [1]:
import pandas as pd
import zipfile


In [2]:
class MovieFilter():
    """Load a movie CSV from a zip archive and look up rows by column values.

    The first member of the archive is read into ``self.df`` with its first
    column used as the index. ``search`` returns index labels and
    ``get_full_rows`` turns those labels back into full rows.
    """

    # Columns compared with ``==``, in the order their criteria are applied.
    _EXACT_MATCH_COLUMNS = ('title', 'release_date', 'popularity',
                            'vote_average', 'vote_count')

    def __init__(self, path):
        """Read the first file inside the zip archive into a DataFrame.

        Parameters
        ----------
        path : str, path-like or binary file object
            Anything ``zipfile.ZipFile`` accepts. The first entry of the
            archive must be a CSV whose first column is the row index.

        Raises
        ------
        IndexError
            If the archive contains no members.
        """
        # Context managers make sure both the archive and the member handle
        # are closed once the CSV has been parsed.
        with zipfile.ZipFile(path) as zf:
            with zf.open(zf.namelist()[0]) as csv_file:
                self.df = pd.read_csv(csv_file, index_col=0)

    def search(self, title=None, release_date=None, overview=None, popularity=None, vote_average=None, vote_count=None):
        """Return the index labels of rows that satisfy every given criterion.

        Criteria left as ``None`` are ignored; the remaining ones are combined
        with logical AND.

        Parameters
        ----------
        title, release_date, popularity, vote_average, vote_count : optional
            Compared to the column of the same name with ``==`` (exact match,
            including for the float columns).
        overview : str, optional
            Pattern searched for inside the ``overview`` column with
            ``Series.str.contains`` (so it is interpreted as a regular
            expression). The column is converted with ``astype(str)`` first,
            so missing values are stringified rather than skipped.

        Returns
        -------
        list
            Matching index labels in DataFrame order. Empty when nothing
            matches, and also when no criterion was supplied at all.
        """
        exact_criteria = {
            'title': title,
            'release_date': release_date,
            'popularity': popularity,
            'vote_average': vote_average,
            'vote_count': vote_count,
        }

        # ``mask`` stays None until the first criterion is applied. Tracking
        # "no filter yet" separately from "filters matched nothing" prevents a
        # later criterion from resurrecting rows after an empty intersection.
        mask = None

        for column in self._EXACT_MATCH_COLUMNS:
            wanted = exact_criteria[column]
            if wanted is None:
                continue
            column_mask = self.df[column] == wanted
            mask = column_mask if mask is None else mask & column_mask

        if overview is not None:
            overview_mask = self.df['overview'].astype(str).str.contains(overview)
            mask = overview_mask if mask is None else mask & overview_mask

        if mask is None:
            # No criteria supplied: keep the historical "empty result" contract.
            return []

        return self.df.index[mask.to_numpy(dtype=bool)].tolist()

    def get_full_rows(self, rows):
        """Return the complete DataFrame rows for the given index labels.

        Parameters
        ----------
        rows : list
            Index labels, typically the value returned by ``search``.

        Returns
        -------
        pandas.DataFrame
            The selected rows with all columns.
        """
        return self.df.loc[rows, :]


In [3]:
# Build the searchable movie table from the zipped CSV.
# NOTE(review): the name `filter` shadows Python's built-in `filter()` for the
# rest of the notebook; later cells reference this name.
filter = MovieFilter('Movie-Dataset-Latest (1).zip')

In [4]:
# Test 1: overview contains "love" AND vote_average equals 8.7
# (criteria that are not passed default to None and are ignored)
rows = filter.search(overview="love", vote_average=8.7)
filter.get_full_rows(rows)

Unnamed: 0,id,title,release_date,overview,popularity,vote_average,vote_count,video
0,19404,Dilwale Dulhania Le Jayenge,10/20/1995,"Raj is a rich, carefree, happy-go-lucky second...",25.884,8.7,3304,False
1,278,The Shawshank Redemption,9/23/1994,Framed in the 1940s for the double murder of h...,60.11,8.7,20369,False


In [5]:
# Test 2: overview contains "blood" AND vote_average equals 5.0
# (criteria that are not passed default to None and are ignored)
rows = filter.search(overview="blood", vote_average=5.0)
filter.get_full_rows(rows)

Unnamed: 0,id,title,release_date,overview,popularity,vote_average,vote_count,video
9169,346651,Pandemic,2/26/2016,After a virus of epic proportions overwhelms t...,64.096,5.0,251,False
9165,293572,The Houses October Built,10/10/2014,Beneath the fake blood and cheap masks of coun...,7.116,5.0,241,False


In [6]:
# Test 3: overview contains "love" AND vote_average equals 8.7
# NOTE(review): this repeats the query from test 1 unchanged.
rows = filter.search(overview="love", vote_average=8.7)
filter.get_full_rows(rows)

Unnamed: 0,id,title,release_date,overview,popularity,vote_average,vote_count,video
0,19404,Dilwale Dulhania Le Jayenge,10/20/1995,"Raj is a rich, carefree, happy-go-lucky second...",25.884,8.7,3304,False
1,278,The Shawshank Redemption,9/23/1994,Framed in the 1940s for the double murder of h...,60.11,8.7,20369,False


In [7]:
# Test 4: exact title match combined with the overview and vote_average criteria
rows = filter.search(
    title="The Shawshank Redemption",
    overview="love",
    vote_average=8.7,
)
filter.get_full_rows(rows)

Unnamed: 0,id,title,release_date,overview,popularity,vote_average,vote_count,video
1,278,The Shawshank Redemption,9/23/1994,Framed in the 1940s for the double murder of h...,60.11,8.7,20369,False
