In [7]:
!pip install omdb
from omdb import OMDBClient
import pandas as pd
import json
import numpy as np
import re

class ObtainData:
    """Fetch movie details from OMDB for a list of movie titles and preprocess them.

    Typical use: construct once, call ``fetchData()`` a single time to build the
    details CSV, then call ``preprocessData()`` to produce the cleaned CSV.

    State is kept on the instance (``dfMovieList``, ``omdbClient``, ``opPath``,
    ``dfOMDBdata``). The same objects are also published as module-level globals
    of the same names, as the original implementation did, so that code outside
    this class which reads those globals keeps working.
    """

    # Separators seen between start and end year, e.g. "2004–2008" or "2004-2008".
    _YEAR_SEPARATOR = re.compile('–|-')

    def __init__(self, ipMovieTitlePath, omdbKey, opMovieDetailsPath, defaultYear=2000):
        """Load the movie title list and create the OMDB client.

        Args:
            ipMovieTitlePath: Path to the headerless movie titles CSV with the
                columns MovieID, ReleaseYear, MovieTitle (latin-1 encoded).
                The file is downloaded from
                https://www.kaggle.com/netflix-inc/netflix-prize-data#movie_titles.csv
            omdbKey: API key used to access OMDB data.
            opMovieDetailsPath: Path of the CSV that ``fetchData`` writes and
                ``getDataFromCSV`` reads.
            defaultYear: Release year substituted where the title file has none.
        """
        global dfMovieList, omdbClient, opPath
        movieList = pd.read_csv(
            ipMovieTitlePath,
            header=None,
            names=['MovieID', 'ReleaseYear', 'MovieTitle'],
            encoding='latin-1',
        )
        # fetchData converts ReleaseYear with int(), which fails on NaN.
        movieList['ReleaseYear'] = movieList['ReleaseYear'].fillna(defaultYear)

        self.dfMovieList = movieList
        self.omdbClient = OMDBClient(apikey=omdbKey)
        self.opPath = opMovieDetailsPath
        self.dfOMDBdata = None

        # Legacy module-level aliases (see class docstring).
        dfMovieList = self.dfMovieList
        omdbClient = self.omdbClient
        opPath = self.opPath

    def fetchData(self, maxMovies=None):
        """Fetch details for every listed movie from OMDB and write them to ``opPath`` as CSV.

        This has to run just once; afterwards use ``getDataFromCSV``.

        Args:
            maxMovies: If given, only the first ``maxMovies`` titles are
                fetched. Useful when the API key has a per-day request limit.
        """
        global dfOMDBdata
        # Local import: pandas deprecated passing a literal JSON string to read_json,
        # so the JSON text is wrapped in a file-like object.
        from io import StringIO

        # Used to rename the output columns, so as to not rely on the column
        # names provided by the data obtained from the OMDB API.
        columnRenamingDict = {
            'NeflixMovieTitle': 'NeflixMovieTitle',
            'NeflixMovieID': 'NeflixMovieID',
            'NetflixReleaseYear': 'NetflixReleaseYear',
            'awards': 'Awards',
            'actors': 'Actors',
            'box_office': 'Box_Office',
            'country': 'Country',
            'director': 'Director',
            'dvd': 'DVD',
            'genre': 'Genre',
            'imdb_id': 'IMDBId',
            'imdb_rating': 'IMDBRating',
            'imdb_votes': 'IMDBVotes',
            'language': 'Language',
            'metascore': 'Metascore',
            'plot': 'Plot',
            'poster': 'Poster',
            'production': 'Production',
            'rated': 'Rated',
            'ratings': 'Ratings',
            'released': 'Released',
            'response': 'Response',
            'runtime': 'Runtime',
            'title': 'OMDBTitle',
            'tomato_consensus': 'TomatoConsensus',
            'tomato_fresh': 'TomatoFresh',
            'tomato_image': 'TomatoImage',
            'tomato_meter': 'TomatoMeter',
            'tomato_rating': 'TomatoRating',
            'tomato_reviews': 'TomatoReviews',
            'tomato_rotten': 'TomatoRotten',
            'tomato_url': 'TomatoUrl',
            'tomato_user_meter': 'TomatoUserMeter',
            'tomato_user_rating': 'TomatoUserRating',
            'tomato_user_reviews': 'TomatoUserReviews',
            'total_seasons': 'TotalSeasons',
            'type': 'Type',
            'website': 'Website',
            'writer': 'Writer',
            'year': 'Year',
        }

        moviesToFetch = self.dfMovieList
        if maxMovies is not None:
            moviesToFetch = moviesToFetch.head(maxMovies)

        # Collect per-movie frames and concatenate once at the end.
        # DataFrame.append was removed in pandas 2.0 and re-copied the
        # accumulated frame on every iteration.
        fetchedFrames = []
        for index, row in moviesToFetch.iterrows():
            print('Fecthing movie data.... count = '+str(index))
            movieDetails = self.omdbClient.get(
                title=row['MovieTitle'],
                year=int(row['ReleaseYear']),
                fullplot=True,
                tomatoes=True,
            )
            dfCurrMovie = pd.read_json(StringIO(json.dumps(movieDetails)))
            dfCurrMovie['NeflixMovieTitle'] = row['MovieTitle']
            dfCurrMovie['NeflixMovieID'] = row['MovieID']
            dfCurrMovie['NetflixReleaseYear'] = row['ReleaseYear']
            fetchedFrames.append(dfCurrMovie)

        # pd.concat raises on an empty list, so fall back to an empty frame.
        if fetchedFrames:
            fetched = pd.concat(fetchedFrames, ignore_index=True)
        else:
            fetched = pd.DataFrame()
        fetched = fetched.rename(columns=columnRenamingDict)
        fetched.to_csv(self.opPath, index=False)

        self.dfOMDBdata = fetched
        dfOMDBdata = fetched  # legacy module-level alias

    def getDataFromCSV(self):
        """Load the movie details CSV at ``opPath`` into ``dfOMDBdata``.

        Malformed lines are skipped with a warning instead of aborting the load.
        """
        global dfOMDBdata
        try:
            loaded = pd.read_csv(self.opPath, on_bad_lines='warn')
        except TypeError:
            # pandas older than 1.3 does not know on_bad_lines; use the
            # equivalent legacy flag there.
            loaded = pd.read_csv(self.opPath, error_bad_lines=False)

        self.dfOMDBdata = loaded
        dfOMDBdata = loaded  # legacy module-level alias

    @classmethod
    def _splitYearRange(cls, rawYear, isSeries):
        """Return ``(startYear, endYear)`` for one raw ``Year`` cell.

        Years are returned as strings. A missing year yields ``(nan, nan)``.
        Only series use the second part of a range; everything else gets the
        start year for both values.
        """
        if pd.isna(rawYear):
            return np.nan, np.nan
        # A column holding only plain years may have been parsed as numbers;
        # render 2004.0 as "2004" rather than "2004.0".
        if isinstance(rawYear, float) and rawYear.is_integer():
            rawYear = int(rawYear)
        yearParts = cls._YEAR_SEPARATOR.split(str(rawYear))
        startYear = yearParts[0]
        # An open-ended range such as "2010–" has an empty second part.
        if isSeries and len(yearParts) == 2 and yearParts[1] != "":
            return startYear, yearParts[1]
        return startYear, startYear

    def preprocessData(self, opPreprocessedPath='movieDetailsPreprocessed.csv'):
        """Reload the details CSV, clean it and write the result as CSV.

        Steps: drop the columns that are not needed, reduce ``Year`` to the
        start year, add ``StartYear`` and ``EndYear`` columns, and set a
        missing ``TotalSeasons`` to 1 for rows whose ``Type`` is "series".

        Args:
            opPreprocessedPath: Destination of the preprocessed CSV.

        Raises:
            KeyError: If one of the columns to drop is absent from the CSV.
        """
        self.getDataFromCSV()
        movieData = self.dfOMDBdata

        # Drop the columns which are not needed.
        dropColumns = [
            'NeflixMovieID', 'NeflixMovieTitle', 'NetflixReleaseYear', 'Box_Office',
            'TomatoConsensus', 'TomatoFresh', 'TomatoImage', 'TomatoMeter',
            'TomatoRating', 'TomatoReviews', 'TomatoRotten', 'TomatoUrl',
            'TomatoUserMeter', 'TomatoUserRating', 'TomatoUserReviews', 'Response',
            'Website', 'Plot', 'Poster', 'IMDBId',
        ]
        movieData.drop(dropColumns, axis=1, inplace=True)

        isSeries = movieData['Type'] == "series"

        # Split Year into StartYear and EndYear. Whole columns are assigned at
        # once instead of writing cell by cell.
        startYears = []
        endYears = []
        for rawYear, seriesFlag in zip(movieData['Year'], isSeries):
            startYear, endYear = self._splitYearRange(rawYear, seriesFlag)
            startYears.append(startYear)
            endYears.append(endYear)
        movieData['Year'] = startYears
        movieData['StartYear'] = startYears
        movieData['EndYear'] = endYears

        # A series without season information is treated as having one season.
        # NaN never compares equal to anything, so isna() is required here.
        missingSeasons = isSeries & movieData['TotalSeasons'].isna()
        movieData.loc[missingSeasons, 'TotalSeasons'] = 1

        movieData.to_csv(opPreprocessedPath, index=False)
        
# Driver: build the ObtainData helper and run the preprocessing step.
import os

# Input: movie title list. Output of the fetch step / input of preprocessing: movie details CSV.
ipMovieTitlePath = 'movie_titles.csv'
# NOTE(review): an API key is committed in source. Set the OMDB_API_KEY environment
# variable to override it; the literal is kept only as a fallback so existing runs
# behave the same. Consider rotating this key and removing the fallback.
apikey = os.environ.get('OMDB_API_KEY', '3ed5bc9c')
opMovieDetailsPath = 'movie_details.csv'
dataObj = ObtainData(ipMovieTitlePath,apikey,opMovieDetailsPath)
# The fetch step is commented out. Run it once initially to build the CSV at opMovieDetailsPath from OMDB.
# The data has already been extracted and stored at that path.
#dataObj.fetchData()
dataObj.preprocessData()


