In [41]:
import json
import pandas as pd

# Load the source movie table; the cells below derive their datasets from this frame.
df = pd.read_csv('movies.csv')
df.head()

Unnamed: 0,rank,id,name,year,imbd_votes,imdb_rating,certificate,duration,genre,cast_id,cast_name,director_id,director_name,writter_name,writter_id
0,1,tt0111161,The Shawshank Redemption,1994,2601152,9.3,A,142,Drama,"nm0000209,nm0000151,nm0348409,nm0006669,nm0000...","Tim Robbins,Morgan Freeman,Bob Gunton,William ...",nm0001104,Frank Darabont,"Stephen King,Frank Darabont","nm0000175,nm0001104"
1,2,tt0068646,The Godfather,1972,1796656,9.2,A,175,"Crime,Drama","nm0000008,nm0000199,nm0001001,nm0000473,nm0144...","Marlon Brando,Al Pacino,James Caan,Diane Keato...",nm0000338,Francis Ford Coppola,"Mario Puzo,Francis Ford Coppola","nm0701374,nm0000338"
2,3,tt0468569,The Dark Knight,2008,2572662,9.0,UA,152,"Action,Crime,Drama","nm0000288,nm0005132,nm0001173,nm0000323,nm0350...","Christian Bale,Heath Ledger,Aaron Eckhart,Mich...",nm0634240,Christopher Nolan,"Jonathan Nolan,Christopher Nolan,David S. Goyer","nm0634300,nm0634240,nm0275286"
3,4,tt0071562,The Godfather Part II,1974,1237934,9.0,A,202,"Crime,Drama","nm0000199,nm0000134,nm0000380,nm0000473,nm0001...","Al Pacino,Robert De Niro,Robert Duvall,Diane K...",nm0000338,Francis Ford Coppola,"Francis Ford Coppola,Mario Puzo","nm0000338,nm0701374"
4,5,tt0050083,12 Angry Men,1957,768548,9.0,U,96,"Crime,Drama","nm0000020,nm0002011,nm0000842,nm0275835,nm0550...","Henry Fonda,Lee J. Cobb,Martin Balsam,John Fie...",nm0001486,Sidney Lumet,Reginald Rose,nm0741627


### 1. Year Based Custom Dataset
##### 1.1) CSV Based

In [32]:
# One row per release year: [year, number of movies, comma-joined movie ids].
# A single groupby pass replaces the original approach of re-filtering the whole
# frame twice for every distinct year. groupby sorts its keys by default, so the
# rows still come out in ascending year order, and ids keep their original
# in-frame order within each year.
data = [
    [year, len(movie_ids), ','.join(movie_ids)]
    for year, movie_ids in df.groupby('year')['id']
]

df_ = pd.DataFrame(data, columns = ['year','no_of_movies','movie_ids'])

# NOTE(review): assumes the 'datasets/' directory already exists.
df_.to_csv('datasets/year.csv', index = False)

df_.head()

Unnamed: 0,year,no_of_movies,movie_ids
0,1921,1,tt0012349
1,1924,1,tt0015324
2,1925,1,tt0015864
3,1926,1,tt0017925
4,1927,1,tt0017136


##### 1.2) JSON Based

In [47]:
# Map each release year (as a string key) to its movie count and the
# comma-joined ids of the movies released that year. Built in one grouped pass
# rather than filtering the full frame twice per year; groupby yields the years
# in ascending order.
dct = {
    str(year): {'freq': len(movie_ids), 'movie_ids': ','.join(movie_ids)}
    for year, movie_ids in df.groupby('year')['id']
}

# `dct` is kept as a dict (not rebound to its JSON text), and the context
# manager guarantees the file is closed even if serialisation fails.
with open('datasets/year.json', 'w') as fd:
    json.dump(dct, fd)

### 2. Basic Movies Info Dataset

In [60]:
# One [id, name, link] record per movie; the link is the IMDb title URL
# formed from the movie id.
data = [
    [movie_id, title, 'https://www.imdb.com/title/' + movie_id]
    for movie_id, title in zip(df['id'], df['name'])
]

df_ = pd.DataFrame(data, columns = ['id','name','link'])

df_.to_csv('datasets/basic_movie_info.csv', index = False)

### 3. Genre - Custom Dataset

##### 3.1) JSON Based

In [99]:
# Every distinct genre label found in the comma-separated 'genre' column.
# Sorted so the JSON key order is the same on every run (a bare set is not).
genre = sorted({label for labels in df['genre'] for label in labels.split(',')})

# genre label -> {movie id: movie name} for every movie tagged with that genre.
dct = {label: {} for label in genre}

for movie_id, title, labels in zip(df['id'], df['name'], df['genre']):
    # Match on the split tokens, not on the raw string: a substring test such as
    # `'Music' in 'Comedy,Musical'` is True and would file the movie under the
    # wrong genre.
    for label in labels.split(','):
        dct[label][movie_id] = title

# Context manager closes the file even if serialisation fails.
with open('datasets/genre.json', 'w') as fd:
    json.dump(dct, fd)

### 4. Members Involved
##### 4.1) Adding All Directors, Writers and Cast Members in One Column

In [146]:
# Join the director, writer and cast id strings of each movie into one
# comma-separated 'members_id' string (director first, then writers, then cast).
df['members_id'] = df['director_id'].str.cat([df['writter_id'], df['cast_id']], sep = ',')

df.head()

Unnamed: 0,rank,id,name,year,imbd_votes,imdb_rating,certificate,duration,genre,cast_id,cast_name,director_id,director_name,writter_name,writter_id,members_id
0,1,tt0111161,The Shawshank Redemption,1994,2601152,9.3,A,142,Drama,"nm0000209,nm0000151,nm0348409,nm0006669,nm0000...","Tim Robbins,Morgan Freeman,Bob Gunton,William ...",nm0001104,Frank Darabont,"Stephen King,Frank Darabont","nm0000175,nm0001104","nm0001104,nm0000175,nm0001104,nm0000209,nm0000..."
1,2,tt0068646,The Godfather,1972,1796656,9.2,A,175,"Crime,Drama","nm0000008,nm0000199,nm0001001,nm0000473,nm0144...","Marlon Brando,Al Pacino,James Caan,Diane Keato...",nm0000338,Francis Ford Coppola,"Mario Puzo,Francis Ford Coppola","nm0701374,nm0000338","nm0000338,nm0701374,nm0000338,nm0000008,nm0000..."
2,3,tt0468569,The Dark Knight,2008,2572662,9.0,UA,152,"Action,Crime,Drama","nm0000288,nm0005132,nm0001173,nm0000323,nm0350...","Christian Bale,Heath Ledger,Aaron Eckhart,Mich...",nm0634240,Christopher Nolan,"Jonathan Nolan,Christopher Nolan,David S. Goyer","nm0634300,nm0634240,nm0275286","nm0634240,nm0634300,nm0634240,nm0275286,nm0000..."
3,4,tt0071562,The Godfather Part II,1974,1237934,9.0,A,202,"Crime,Drama","nm0000199,nm0000134,nm0000380,nm0000473,nm0001...","Al Pacino,Robert De Niro,Robert Duvall,Diane K...",nm0000338,Francis Ford Coppola,"Francis Ford Coppola,Mario Puzo","nm0000338,nm0701374","nm0000338,nm0000338,nm0701374,nm0000199,nm0000..."
4,5,tt0050083,12 Angry Men,1957,768548,9.0,U,96,"Crime,Drama","nm0000020,nm0002011,nm0000842,nm0275835,nm0550...","Henry Fonda,Lee J. Cobb,Martin Balsam,John Fie...",nm0001486,Sidney Lumet,Reginald Rose,nm0741627,"nm0001486,nm0741627,nm0000020,nm0002011,nm0000..."


##### 4.2) Making Pairs of Directors, Writers and Cast Members

In [176]:
def make_pairs(dct, id_, name_):
    """Record id -> name pairs from two comma-separated strings into ``dct``.

    ``id_`` and ``name_`` are split on ',' and paired by position; each pair is
    stored in ``dct`` (overwriting any existing entry for that id).

    Returns the same ``dct`` object that was passed in, mutated in place.
    Raises IndexError if ``name_`` holds fewer entries than ``id_``; surplus
    names are ignored.
    """
    ids = id_.split(',')
    names = name_.split(',')

    for position, member_id in enumerate(ids):
        dct[member_id] = names[position]

    return dct

# id -> name lookups, one per role, accumulated over every movie row.
director_dct = {}
writter_dct  = {}
cast_dct     = {}

role_columns = zip(
    df['director_id'], df['director_name'],
    df['writter_id'],  df['writter_name'],
    df['cast_id'],     df['cast_name'],
)

for dir_ids, dir_names, wri_ids, wri_names, cast_ids, cast_names in role_columns:

    director_dct = make_pairs(director_dct, dir_ids, dir_names)     # Director Pair
    writter_dct  = make_pairs(writter_dct , wri_ids, wri_names)     # Writter Pair
    cast_dct     = make_pairs(cast_dct    , cast_ids, cast_names)   # Cast Pairs

print("Total Director     : " ,len(director_dct))
print("Total Writter      : " ,len(writter_dct))
print("Total Cast Members : " ,len(cast_dct))

Total Director     :  177
Total Writter      :  436
Total Cast Members :  3835


##### 4.3) Finding the Unique Members (Directors + Cast + Writers)

In [178]:
# Flatten every movie's comma-separated member ids into one list
# (ids repeat when a person appears in several movies or roles).
lst = [member_id for ids in df['members_id'] for member_id in ids.split(',')]

print("Total Members: ", len(lst))
print("Unique Members: ", len(set(lst)))

# De-duplicated member ids; order is whatever the set yields.
members = list(set(lst))

Total Members:  5270
Unique Members:  4302


##### 4.4) Making Pairs of Unique Members

In [191]:
# id -> name for every unique member. The name is taken from the first lookup
# that knows the id, tried in the order cast, director, writer; an id found in
# none of them is left out.
member_dct   = {}

for member_id in members:

    for lookup in (cast_dct, director_dct, writter_dct):
        if member_id in lookup:
            member_dct[member_id] = lookup[member_id]
            break

##### 4.5) Creating JSON for Directors, Cast, Writers

In [196]:
# Dump each role's id -> name mapping to its own JSON file.
# A single loop replaces three copy-pasted open/write/close sequences, and the
# context manager closes each file even if serialisation raises.
role_outputs = (
    ('datasets/directors.json', director_dct),
    ('datasets/writters.json',  writter_dct),
    ('datasets/cast.json',      cast_dct),
)

for path, mapping in role_outputs:
    with open(path, 'w') as fd:
        json.dump(mapping, fd)

##### 4.6) Creating CSV for all Members

In [201]:
# One [id, name] row per entry of member_dct, written out as a two-column CSV.
data = [[member_id, name] for member_id, name in member_dct.items()]

members_frame = pd.DataFrame(data , columns = ['id','name'])
members_frame.to_csv('datasets/members.csv',index = False)

##### 4.7) Creating JSON for all Members

In [209]:
# Shallow copy of the id -> name mapping for all members.
# NOTE(review): the section heading mentions JSON, but this cell writes no file.
dct = dict(member_dct)

print(len(dct))

# Live view of the member ids; the cells below iterate over it.
unique_members = dct.keys()

4302


### 5. Finding the person who has worked on the most Movies

In [231]:
# Number of movies each member id appears in. Ids are de-duplicated per movie
# (via set), so a person holding several roles in the same film counts once.
# A single pass over the movies replaces the original members x movies nested
# scan, which re-split every members_id string for every member.
movie_counts = {}
for member_ids in df['members_id']:
    for member_id in set(member_ids.split(',')):
        movie_counts[member_id] = movie_counts.get(member_id, 0) + 1

# Rows follow the order of unique_members: [id, movie count, name].
data = [
    [member_id, movie_counts.get(member_id, 0), dct[member_id]]
    for member_id in unique_members
]

df_ = pd.DataFrame(data, columns = ['id','freq','name'])

df_.to_csv('datasets/person_movie_specific.csv', index = False)

df_.sort_values(by = 'freq', ascending = False).head()

Unnamed: 0,id,freq,name
1467,nm0000134,9,Robert De Niro
4047,nm0000040,7,Stanley Kubrick
3797,nm0000217,7,Martin Scorsese
1643,nm0634240,7,Christopher Nolan
2841,nm0000148,7,Harrison Ford


### 6. Finding the person who has worked on the most Movies (Same Movie in Multiple Roles)

In [245]:
# Number of credits per member id: every occurrence in a movie's members_id
# string counts, so directing and writing the same film adds two.
# A single pass over the movies replaces the original members x movies nested
# scan, which split each members_id string twice per member.
role_counts = {}
for member_ids in df['members_id']:
    for member_id in member_ids.split(','):
        role_counts[member_id] = role_counts.get(member_id, 0) + 1

# Rows follow the order of unique_members: [id, credit count, name].
data = [
    [member_id, role_counts.get(member_id, 0), dct[member_id]]
    for member_id in unique_members
]

df_ = pd.DataFrame(data, columns = ['id','freq','name'])

df_.to_csv('datasets/person_role_specific.csv', index = False)

df_.sort_values(by = 'freq', ascending = False).head()

Unnamed: 0,id,freq,name
2062,nm0000122,15,Charles Chaplin
1643,nm0634240,14,Christopher Nolan
4047,nm0000040,14,Stanley Kubrick
2463,nm0000041,13,Akira Kurosawa
54,nm0000233,11,Quentin Tarantino


In [246]:
# Preview the movie frame (it now carries the derived members_id column).
df.head()

Unnamed: 0,rank,id,name,year,imbd_votes,imdb_rating,certificate,duration,genre,cast_id,cast_name,director_id,director_name,writter_name,writter_id,members_id
0,1,tt0111161,The Shawshank Redemption,1994,2601152,9.3,A,142,Drama,"nm0000209,nm0000151,nm0348409,nm0006669,nm0000...","Tim Robbins,Morgan Freeman,Bob Gunton,William ...",nm0001104,Frank Darabont,"Stephen King,Frank Darabont","nm0000175,nm0001104","nm0001104,nm0000175,nm0001104,nm0000209,nm0000..."
1,2,tt0068646,The Godfather,1972,1796656,9.2,A,175,"Crime,Drama","nm0000008,nm0000199,nm0001001,nm0000473,nm0144...","Marlon Brando,Al Pacino,James Caan,Diane Keato...",nm0000338,Francis Ford Coppola,"Mario Puzo,Francis Ford Coppola","nm0701374,nm0000338","nm0000338,nm0701374,nm0000338,nm0000008,nm0000..."
2,3,tt0468569,The Dark Knight,2008,2572662,9.0,UA,152,"Action,Crime,Drama","nm0000288,nm0005132,nm0001173,nm0000323,nm0350...","Christian Bale,Heath Ledger,Aaron Eckhart,Mich...",nm0634240,Christopher Nolan,"Jonathan Nolan,Christopher Nolan,David S. Goyer","nm0634300,nm0634240,nm0275286","nm0634240,nm0634300,nm0634240,nm0275286,nm0000..."
3,4,tt0071562,The Godfather Part II,1974,1237934,9.0,A,202,"Crime,Drama","nm0000199,nm0000134,nm0000380,nm0000473,nm0001...","Al Pacino,Robert De Niro,Robert Duvall,Diane K...",nm0000338,Francis Ford Coppola,"Francis Ford Coppola,Mario Puzo","nm0000338,nm0701374","nm0000338,nm0000338,nm0701374,nm0000199,nm0000..."
4,5,tt0050083,12 Angry Men,1957,768548,9.0,U,96,"Crime,Drama","nm0000020,nm0002011,nm0000842,nm0275835,nm0550...","Henry Fonda,Lee J. Cobb,Martin Balsam,John Fie...",nm0001486,Sidney Lumet,Reginald Rose,nm0741627,"nm0001486,nm0741627,nm0000020,nm0002011,nm0000..."


### 7. Basic Members Info

In [247]:
# Turn the member table from the previous section into [id, name, link], where
# link is the IMDb profile URL formed from the member id.
# The frame is derived rather than mutated in place: the original
# `del df_['freq']` raised KeyError when the cell was run a second time, whereas
# drop(..., errors='ignore') makes a re-run harmless.
df_ = (
    df_
    .drop(columns = 'freq', errors = 'ignore')
    .assign(link = lambda members: 'https://www.imdb.com/name/' + members['id'])
)

df_.to_csv('datasets/basic_member_info.csv', index = False)

In [249]:
# Display the generated profile links as a sanity check.
df_['link']

0       https://www.imdb.com/name/nm0949985
1       https://www.imdb.com/name/nm0238105
2       https://www.imdb.com/name/nm0502425
3       https://www.imdb.com/name/nm0843775
4       https://www.imdb.com/name/nm0908824
                       ...                 
4297    https://www.imdb.com/name/nm0360796
4298    https://www.imdb.com/name/nm0000984
4299    https://www.imdb.com/name/nm0565883
4300    https://www.imdb.com/name/nm8904193
4301    https://www.imdb.com/name/nm0002128
Name: link, Length: 4302, dtype: object