# The objective of this code is to query, using pre-existing packages, movies associated with Shah Rukh Khan, and store them in a dataframe. This will then be used to develop a recommendation system.

In [18]:
#Importing necessary packages
import imdb
import pandas as pd
import numpy as np

In [19]:
#Getting Shah Rukh Khans person ID
ia = imdb.IMDb()
people = ia.search_person('Shah Rukh Khan (I)')

#The search can come back empty; fail with a clear message instead of a bare
#"list index out of range" when people[0] is read below
if not people:
    raise IndexError("IMDb search returned no results for 'Shah Rukh Khan (I)'")

#Using his ID to get the list of his movies
full_person = ia.get_person(people[0].getID(), info=["filmography"])

In [20]:
#Creating empty lists to store the information I want
title = []
kind = []
year = []
movieID = []

#.get() is used instead of [] so that a credit with a missing field (for
#example no 'year') yields None rather than raising KeyError mid-loop
for movie in full_person['filmography']['actor']:
    title.append(movie.get('title'))
    kind.append(movie.get('kind'))
    year.append(movie.get('year'))
    movieID.append(movie.movieID)

#Creating my dataframe
df_movie = pd.DataFrame({'MovieID': movieID, 'Title': title, 'Kind': kind, 'Year': year})

#Missing years become NaN so the numeric comparison below cannot fail on None
df_movie['Year'] = pd.to_numeric(df_movie['Year'], errors='coerce')

#Let's remove movies that have a year that's 2020 or above, and let's remove everything that isn't a movie
#(rows without a year compare False and are dropped as well)
is_released_movie = (df_movie['Kind'] == 'movie') & (df_movie['Year'] < 2020)

#.copy() makes the result an independent frame, so columns can be added to it later
df_movie = df_movie.loc[is_released_movie].copy()

#Every remaining row has a year, so it can be stored as a whole number again
df_movie['Year'] = df_movie['Year'].astype(int)

In [21]:
#Spot check: fetch the full IMDb record for the fifth row of the dataframe
ID = df_movie['MovieID'].iloc[4]
movie = ia.get_movie(ID)

143

In [22]:
#Now, let's get the imdb rating, as well as the genre, and add it to our dataframe
#Working on an independent copy so the new columns are not written into a slice
df_movie = df_movie.copy()

ratings = []
genres = []
runtimes = []

#One lookup per row, in row order, so the collected lists line up with the dataframe
for ID in df_movie['MovieID']:
    movie = ia.get_movie(ID)

    #.get() keeps a movie without a rating or genre from aborting the whole loop
    ratings.append(movie.get('rating', np.nan))
    genres.append(str(movie.get('genre', [])))

    #'runtime' comes back as a list (or None when unknown); keep the first entry only
    runtime_values = movie.get('runtime')
    runtimes.append(runtime_values[0] if runtime_values else np.nan)

df_movie['Rating'] = ratings
df_movie['Genre'] = genres
df_movie['Runtime'] = runtimes

#Here, I will drop movies that have a very small runtime
#Movies with an unknown runtime are NaN, compare False below, and are dropped too
df_movie["Runtime"] = pd.to_numeric(df_movie["Runtime"])
df_movie = df_movie.loc[df_movie['Runtime'] > 40].copy()

In [23]:
#One hot encoding the Movie Genre
Genres = ['Music', 'Comedy', 'Drama', 'Romance', 'War', 'Action', 'Crime', 'Thriller', 'Adventure', 'Sci-Fi',
 'Musical', 'Horror', 'Mystery', 'Family', 'Biography', 'History', 'Fantasy', 'Sport']

#The Genre column holds the text of a list, e.g. "['Comedy', 'Drama']".
#Matching the quoted name (e.g. "'Music'") instead of the bare word keeps
#'Music' from also being flagged for every 'Musical'.
genre_text = df_movie['Genre'].astype(str)

for genre in Genres:
    df_movie[genre] = genre_text.str.contains("'" + genre + "'", regex=False).astype(int)

df_movie = df_movie.drop(columns=['Genre', 'Kind'])

#Making the runtime into hours and minutes
df_movie['Runtime'] = pd.to_datetime(df_movie['Runtime'], unit='m').dt.strftime('%H:%M')

In [24]:
#Saving the cleaned dataframe to disk, without the index column
df_movie.to_csv(path_or_buf='SRKMovies.csv', index=False)

In [50]:
#Keeping only the comedies and ordering them from highest to lowest rating
is_comedy = df_movie['Comedy'] == 1
df_sorted = df_movie[is_comedy].sort_values(by='Rating', ascending=False)
df_sorted.head()

Unnamed: 0,MovieID,Title,Year,Rating,Runtime,Music,Comedy,Drama,Romance,War,...,Adventure,Sci-Fi,Musical,Horror,Mystery,Family,Biography,History,Fantasy,Sport
49,347304,Kal ho naa ho,2003,8.0,03:06,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
87,110222,Kabhi Haan Kabhi Naa,1994,7.8,02:38,1,1,1,1,0,...,0,0,1,0,0,0,0,0,0,0
67,172684,Kuch Kuch Hota Hai,1998,7.6,02:57,1,1,1,1,0,...,0,0,1,0,0,0,0,0,0,0
30,1182937,Rab Ne Bana Di Jodi,2008,7.2,02:47,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
72,118983,Dil To Pagal Hai,1997,7.0,02:59,1,1,1,1,0,...,0,0,1,0,0,0,0,0,0,0


In [51]:
#Printing one HTML table row per movie in df_sorted (rating, title, year, runtime)
import html

title = list(df_sorted.Title)
rating = list(df_sorted.Rating)
year = list(df_sorted.Year)
runtime = list(df_sorted.Runtime)


def _cell_text(value):
    #Turns any cell value into text that is safe inside an HTML element.
    #str() tolerates non-string values (e.g. a missing runtime); only &, < and >
    #are escaped, so apostrophes in titles are printed unchanged.
    return html.escape(str(value), quote=False)


for row_rating, row_title, row_year, row_runtime in zip(rating, title, year, runtime):
    print('<tr>')
    print('<th scope="row">' + _cell_text(row_rating) + '</th>')
    print('<td>' + _cell_text(row_title) + '</td>')
    print('<td>' + _cell_text(row_year) + '</td>')
    print('<td>' + _cell_text(row_runtime) + '</td>')
    print('</tr>')

<tr>
<th scope="row">8.0</th>
<td>Kal ho naa ho</td>
<td>2003</td>
<td>03:06</td>
</tr>
<tr>
<th scope="row">7.8</th>
<td>Kabhi Haan Kabhi Naa</td>
<td>1994</td>
<td>02:38</td>
</tr>
<tr>
<th scope="row">7.6</th>
<td>Kuch Kuch Hota Hai</td>
<td>1998</td>
<td>02:57</td>
</tr>
<tr>
<th scope="row">7.2</th>
<td>Rab Ne Bana Di Jodi</td>
<td>2008</td>
<td>02:47</td>
</tr>
<tr>
<th scope="row">7.0</th>
<td>Dil To Pagal Hai</td>
<td>1997</td>
<td>02:59</td>
</tr>
<tr>
<th scope="row">7.0</th>
<td>I'm Always Here</td>
<td>2004</td>
<td>02:59</td>
</tr>
<tr>
<th scope="row">6.9</th>
<td>Baadshah</td>
<td>1999</td>
<td>02:55</td>
</tr>
<tr>
<th scope="row">6.8</th>
<td>Raju Ban Gaya Gentleman</td>
<td>1992</td>
<td>02:32</td>
</tr>
<tr>
<th scope="row">6.8</th>
<td>Karan Arjun</td>
<td>1995</td>
<td>02:55</td>
</tr>
<tr>
<th scope="row">6.8</th>
<td>Yes Boss</td>
<td>1997</td>
<td>02:43</td>
</tr>
<tr>
<th scope="row">6.8</th>
<td>Deewana</td>
<td>1992</td>
<td>03:05</td>
</tr>
<tr>
<th scope="r