### This file focuses on finding a balance between movie records and number of attributes
 - The goal is to merge the offical IMDB data with budget and gross income information to ensure minimum modeling 
 - End result - A csv file named 'balanced.csv' that has only 9371 records and around 26 attributes (Can be lesser, some variables might be unrelevant)
 - The only downfall of this approach is that we are missing the actor attributes

In [1]:
import pandas as pd
import numpy as np

In [2]:
imdb = 'data/imdb_data.csv'
path1 = 'raw data/final_dataset.csv'
path2 = 'raw data/IMDB-Movie-Data.csv'
path3 = 'raw data/Movie_data.csv'
path4 = 'raw data/movies.csv'
path5 = 'raw data/imdb_movies.csv'

reading_list = [path1, path2, path3, path4, path5]

In [3]:
def initial_inspection(file):
    df = pd.read_csv(file)
    return df

#### Initial Inspection

In [4]:
index = 1
for i in reading_list:
    df = initial_inspection(i)
    print(f'path {index}: has shape {df.shape} and columns {df.columns}')
    print('------------------------------------------------------------------------------------------------------------')
    index += 1

path 1: has shape (7118, 8) and columns Index(['name', 'genre', 'score', 'director', 'actor_2_name', 'actor_1_name',
       'gross', 'budget'],
      dtype='object')
------------------------------------------------------------------------------------------------------------
path 2: has shape (1000, 12) and columns Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
       'Metascore'],
      dtype='object')
------------------------------------------------------------------------------------------------------------
path 3: has shape (17034, 22) and columns Index(['id', 'title', 'genres', 'original_language', 'overview', 'popularity',
       'production_companies', 'release_date', 'budget', 'revenue', 'runtime',
       'status', 'tagline', 'vote_average', 'vote_count', 'credits',
       'keywords', 'poster_path', 'backdrop_path', 'recommendations',
       'trailer_views', 'trailer_likes'],
    

In [5]:
imdb_df = pd.read_csv(imdb)
print(imdb_df.shape)
print(imdb_df.columns)

(324324, 11)
Index(['Unnamed: 0', 'titleId', 'primaryTitle', 'originalTitle', 'titleType',
       'startYear', 'genres', 'directors', 'writers', 'averageRating',
       'numVotes'],
      dtype='object')


  imdb_df = pd.read_csv(imdb)


In [6]:
selected_df = pd.read_csv(path3)
print(selected_df.shape)
print(selected_df.columns)

(17034, 22)
Index(['id', 'title', 'genres', 'original_language', 'overview', 'popularity',
       'production_companies', 'release_date', 'budget', 'revenue', 'runtime',
       'status', 'tagline', 'vote_average', 'vote_count', 'credits',
       'keywords', 'poster_path', 'backdrop_path', 'recommendations',
       'trailer_views', 'trailer_likes'],
      dtype='object')


#### Merge and fact checking

In [7]:
imdb_df.rename(columns={'genres': 'Genre'}, inplace=True)

In [8]:
imdb_df.iloc[0]

Unnamed: 0                     0
titleId                tt0000009
primaryTitle          Miss Jerry
originalTitle         Miss Jerry
titleType                  movie
startYear                   1894
Genre                    Romance
directors        Alexander Black
writers          Alexander Black
averageRating                5.4
numVotes                   218.0
Name: 0, dtype: object

In [9]:
year_released = []

for i in range(selected_df.shape[0]):
    try:
        year_released.append(selected_df.iloc[i,7][:4])
    except Exception as e:
        print(f'Unexpected error at index {i}: {e}')
        year_released.append(None)

Unexpected error at index 6412: 'float' object is not subscriptable
Unexpected error at index 6591: 'float' object is not subscriptable
Unexpected error at index 9601: 'float' object is not subscriptable
Unexpected error at index 10807: 'float' object is not subscriptable
Unexpected error at index 11433: 'float' object is not subscriptable
Unexpected error at index 11555: 'float' object is not subscriptable
Unexpected error at index 11664: 'float' object is not subscriptable
Unexpected error at index 11818: 'float' object is not subscriptable
Unexpected error at index 12007: 'float' object is not subscriptable
Unexpected error at index 12487: 'float' object is not subscriptable
Unexpected error at index 12500: 'float' object is not subscriptable
Unexpected error at index 12793: 'float' object is not subscriptable
Unexpected error at index 13267: 'float' object is not subscriptable
Unexpected error at index 13306: 'float' object is not subscriptable
Unexpected error at index 13361: 'flo

In [10]:
selected_df['year_released'] = year_released

In [11]:
movie_stg0 = imdb_df.merge(selected_df,
                           left_on=['primaryTitle', 'startYear'],
                           right_on=['title', 'year_released'])

In [12]:
movie_stg0.shape

(9371, 34)

In [13]:
movie_stg1 = movie_stg0.drop(['Unnamed: 0', 'titleType', 'id', 'titleId', 'tagline', 'poster_path', 'backdrop_path', 'recommendations'], axis=1)

In [14]:
movie_stg1.isna().sum()

primaryTitle               0
originalTitle              0
startYear                  0
Genre                      0
directors                 17
writers                  247
averageRating              0
numVotes                   0
title                      0
genres                    59
original_language          0
overview                  96
popularity                 0
production_companies     533
release_date               0
budget                     0
revenue                    0
runtime                   14
status                     0
vote_average               0
vote_count                 0
credits                   50
keywords                1465
trailer_views              0
trailer_likes              0
year_released              0
dtype: int64

In [15]:
movie_stg1.shape

(9371, 26)

#### Checking for redundent movie

In [None]:
len(movie_stg1[['primaryTitle', 'startYear']].drop_duplicates())

8854

In [28]:
redundant_dict = {}

for i in range(movie_stg1.shape[0]):
    movie_name = movie_stg1.iloc[i, 0]  # Use square brackets for iloc
    year = movie_stg1.iloc[i, 2]
    if (movie_name, year) not in redundant_dict:
        redundant_dict[(movie_name, year)] = (1, [i])  # Initialize with 1 occurrence and the index
    else:
        count, indices = redundant_dict[(movie_name, year)]  # Unpack the tuple
        redundant_dict[(movie_name, year)] = (count + 1, indices + [i])  # Update the count and append index

In [29]:
filtered_list = [(key, index) for key, (count, index) in redundant_dict.items() if count > 1]
for i in filtered_list:
    print(i)

(('The Big Swap', '1998'), [54, 55])
(('Reine & Mimmi i fjällen!', '1997'), [212, 213])
(('Soldier', '1998'), [230, 881])
(('An Ideal Husband', '1999'), [396, 601])
(('Life', '1999'), [404, 405])
(('Money No Enough', '1998'), [422, 423])
(('The Ride', '1997'), [432, 1321])
(('Ride with the Devil', '1999'), [468, 1007])
(('Men Cry Bullets', '1998'), [559, 560])
(('Last Night', '1998'), [581, 1313])
(('The Beach', '2000'), [628, 1591])
(('Limbo', '1999'), [633, 8274])
(('Gossip', '2000'), [714, 988])
(('Between Your Legs', '1999'), [717, 2609])
(('Australian Made: The Movie', '1987'), [816, 817])
(('O Astrapogiannos', '1970'), [824, 825, 826])
(('Monsieur Zivaco', '1967'), [838, 839])
(('Dil Kya Kare', '1999'), [906, 907])
(('Secret Society', '2000'), [912, 913])
(('Shool', '1999'), [932, 933])
(('Devdas', '2002'), [993, 1561])
(('Jungle', '2000'), [1083, 1084])
(('Connect 5', '1996'), [1093, 1094])
(('Jackpot', '2001'), [1143, 1144])
(('Todo menos la chica', '2002'), [1146, 1147])
(('Se

In [33]:
print(movie_stg1.iloc[212])

primaryTitle                                     Reine & Mimmi i fjällen!
originalTitle                                    Reine & Mimmi i fjällen!
startYear                                                            1997
Genre                                                              Comedy
directors                                                Magnus Skogsberg
writers                 Hans Rosenfeldt,Peter Emanuel Falck,Christian ...
averageRating                                                         2.6
numVotes                                                            901.0
title                                            Reine & Mimmi i fjällen!
genres                                                             Comedy
original_language                                                      sv
overview                Mimmi starts to get enough Reine just work all...
popularity                                                          1.761
production_companies                  

In [34]:
print(movie_stg1.iloc[213])

primaryTitle                                     Reine & Mimmi i fjällen!
originalTitle                                    Reine & Mimmi i fjällen!
startYear                                                            1997
Genre                                                              Comedy
directors                                                Magnus Skogsberg
writers                 Hans Rosenfeldt,Peter Emanuel Falck,Christian ...
averageRating                                                         2.6
numVotes                                                            901.0
title                                            Reine & Mimmi i fjällen!
genres                                                             Comedy
original_language                                                      sv
overview                Mimmi starts to get enough Reine just work all...
popularity                                                          1.057
production_companies                  

In [37]:
movie_stg2 = movie_stg1.drop_duplicates(['primaryTitle', 'startYear'])

In [38]:
movie_stg2.shape

(8854, 26)

In [39]:
movie_stg2.to_csv('data/balanced.csv')