## This file focuses on the rest of the raw data.
 - Including: final_dataset.csv, IMDB-Movie-Data.csv, Movie_data.csv, movies.csv

In [2]:
import pandas as pd
import numpy as np

In [None]:
# Relative paths to the four raw CSV files inspected in this notebook.
path1 = 'raw data/final_dataset.csv'
path2 = 'raw data/IMDB-Movie-Data.csv'
path3 = 'raw data/Movie_data.csv'
path4 = 'raw data/movies.csv'

In [None]:
# Helper functions
def edit_distance(str1, str2):
    """
    Computes the edit (Levenshtein) distance between two strings.

    Args:
      str1: The source string.
      str2: The target string.

    Returns:
      The minimum number of single-character insertions, removals and
      replacements needed to turn str1 into str2.
    """
    rows, cols = len(str1) + 1, len(str2) + 1

    # table[r][c] holds the distance between str1[:r] and str2[:c].
    table = [[0] * cols for _ in range(rows)]

    # Base cases: turning a prefix into the empty string (or the reverse)
    # costs one operation per character.
    for r in range(rows):
        table[r][0] = r
    for c in range(cols):
        table[0][c] = c

    for r, left_char in enumerate(str1, start=1):
        for c, right_char in enumerate(str2, start=1):
            if left_char == right_char:
                # Matching characters add no cost.
                table[r][c] = table[r - 1][c - 1]
                continue
            cheapest = min(
                table[r - 1][c],      # Remove
                table[r][c - 1],      # Insert
                table[r - 1][c - 1],  # Replace
            )
            table[r][c] = cheapest + 1

    return table[-1][-1]

def detect_outliers_iqr(data):
    """
    Detects outliers in numerical data using the IQR method.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile. NaN entries are ignored when
    the quartiles are computed and are never reported as outliers.

    Args:
      data: A list, tuple, numpy array or pandas Series of numerical data.

    Returns:
      A list of outlier values (empty when data is empty).

    Raises:
      TypeError: If data is not one of the supported container types.
    """
    if not isinstance(data, (list, tuple, np.ndarray, pd.Series)):
        raise TypeError(
            "Input data must be a list, tuple, numpy array or pandas Series."
        )
    if len(data) == 0:
        return []

    data = np.asarray(data)
    # nanpercentile skips NaN; plain percentile would return NaN for both
    # quartiles as soon as one value is missing, hiding every outlier.
    q1, q3 = np.nanpercentile(data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Comparisons against NaN are False, so missing values are never selected.
    outliers = data[(data < lower_bound) | (data > upper_bound)]
    return list(outliers)

def initial_inspection(file):
    """
    Loads a CSV file and prints a quick overview of its contents.

    The overview consists of the column index, the DataFrame shape and the
    first rows (DataFrame.head()), separated by horizontal rules.

    Args:
      file: Path or file-like object accepted by pandas.read_csv.

    Returns:
      The loaded pandas DataFrame.
    """
    divider = '-------------------------------------------------------------------------------------------------------------------------------------------------------'
    frame = pd.read_csv(file)

    print(f'Column attributes: {frame.columns}')
    print(divider)
    print(f'DF shape: {frame.shape}')
    print(divider)
    print(frame.head())

    return frame

### Initial Import

#### final_dataset.csv

This dataset has important attributes such as actor names, gross income, and budget information. A potential problem with this dataset is that it does not come with a date.

In [12]:
raw1 = initial_inspection(path1)

Column attributes: Index(['name', 'genre', 'score', 'director', 'actor_2_name', 'actor_1_name',
       'gross', 'budget'],
      dtype='object')
-------------------------------------------------------------------------------------------------------------------------------------------------------
DF shape: (7118, 8)
-------------------------------------------------------------------------------------------------------------------------------------------------------
                                             name      genre  score  \
0                                     The Shining      Drama    8.4   
1                                 The Blue Lagoon  Adventure    5.8   
2  Star Wars: Episode V - The Empire Strikes Back     Action    8.7   
3                                       Airplane!     Comedy    7.7   
4                                      Caddyshack     Comedy    7.3   

          director        actor_2_name      actor_1_name        gross  \
0  Stanley Kubrick      Shelley

#### IMDB-Movie-Data.csv

This data file also has some interesting variables, such as the movie description, full actor list, and movie runtime. However, there is a lot of fact-checking that we have to do during the merge.

In [13]:
raw2 = initial_inspection(path2)

Column attributes: Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
       'Metascore'],
      dtype='object')
-------------------------------------------------------------------------------------------------------------------------------------------------------
DF shape: (1000, 12)
-------------------------------------------------------------------------------------------------------------------------------------------------------
   Rank                    Title                     Genre  \
0     1  Guardians of the Galaxy   Action,Adventure,Sci-Fi   
1     2               Prometheus  Adventure,Mystery,Sci-Fi   
2     3                    Split           Horror,Thriller   
3     4                     Sing   Animation,Comedy,Family   
4     5            Suicide Squad  Action,Adventure,Fantasy   

                                         Description              Director  \
0  A group of i

#### Movie_data.csv 

In [None]:
raw3 = initial_inspection(path3) 

Column attributes: Index(['id', 'title', 'genres', 'original_language', 'overview', 'popularity',
       'production_companies', 'release_date', 'budget', 'revenue', 'runtime',
       'status', 'tagline', 'vote_average', 'vote_count', 'credits',
       'keywords', 'poster_path', 'backdrop_path', 'recommendations',
       'trailer_views', 'trailer_likes'],
      dtype='object')
-------------------------------------------------------------------------------------------------------------------------------------------------------
DF shape: (17034, 22)
-------------------------------------------------------------------------------------------------------------------------------------------------------
       id                              title  \
0  615656                  Meg 2: The Trench   
1  758323                The Pope's Exorcist   
2  667538   Transformers: Rise of the Beasts   
3  640146  Ant-Man and the Wasp: Quantumania   
4  677179                          Creed III   

     

#### movies.csv

In [15]:
raw4 = initial_inspection(path4)

Column attributes: Index(['name', 'rating', 'genre', 'year', 'released', 'score', 'votes',
       'director', 'writer', 'star', 'country', 'budget', 'gross', 'company',
       'runtime'],
      dtype='object')
-------------------------------------------------------------------------------------------------------------------------------------------------------
DF shape: (7668, 15)
-------------------------------------------------------------------------------------------------------------------------------------------------------
                                             name rating      genre  year  \
0                                     The Shining      R      Drama  1980   
1                                 The Blue Lagoon      R  Adventure  1980   
2  Star Wars: Episode V - The Empire Strikes Back     PG     Action  1980   
3                                       Airplane!     PG     Comedy  1980   
4                                      Caddyshack      R     Comedy  1980   

  

### Fact checking

We will check whether the important variables match across sources (if they do not, then at least one of the merged sources is not reliable).

In [41]:
# Let's start with raw1 and raw2 first.
# merge() defaults to an inner join, so only movies whose (name, director)
# pair exactly equals a (Title, Director) pair in raw2 are kept.
movie_stg0 = raw1.merge(raw2, left_on=['name', 'director'], right_on=['Title', 'Director'])
movie_stg0.shape

(573, 20)

Movies seem to match after the initial merge using a composite key (name and director), but there are only a few hundred matched movies.

In [42]:
movie_stg0.isna().sum()

name                   0
genre                  0
score                  0
director               0
actor_2_name           0
actor_1_name           0
gross                  0
budget                 0
Rank                   0
Title                  0
Genre                  0
Description            0
Director               0
Actors                 0
Year                   0
Runtime (Minutes)      0
Rating                 0
Votes                  0
Revenue (Millions)     7
Metascore             22
dtype: int64

In [53]:
# Remove irrelevant/redundant attributes. 'name' and 'director' were the join
# keys, so after the merge they are exact duplicates of 'Title' and 'Director'.
# The remaining raw1 columns and 'Rank' are dropped as not needed -- presumably
# superseded by raw2's Genre, Rating and Actors columns (TODO confirm).
movie_stg1 = movie_stg0.drop(['name', 'genre', 'score', 'director', 'actor_2_name', 'actor_1_name', 'Rank'], axis=1)

In [66]:
# Let's continue with raw3
# Capitalise raw1's 'budget' first so it does not collide with raw3's own
# 'budget' column in the join below.
movie_stg1.rename(columns={'budget': 'Budget'}, inplace=True)

# DataFrame.join defaults to a left join, here keyed on the title alone.
# NOTE(review): the recorded shapes grow from 573 rows before this join to 613
# rows after it, so some titles match more than one raw3 row -- consider
# adding the release year to the key or de-duplicating raw3 first.
movie_stg2 = movie_stg1.join(raw3.set_index('title'), on='Title')

In [68]:
movie_stg2.shape

(613, 34)

In [67]:
movie_stg2.isna().sum()

gross                    0
Budget                   0
Title                    0
Genre                    0
Description              0
Director                 0
Actors                   0
Year                     0
Runtime (Minutes)        0
Rating                   0
Votes                    0
Revenue (Millions)       7
Metascore               24
id                       6
genres                   6
original_language        6
overview                 6
popularity               6
production_companies     8
release_date             6
budget                   6
revenue                  6
runtime                  6
status                   6
tagline                 40
vote_average             6
vote_count               6
credits                  8
keywords                10
poster_path              6
backdrop_path            8
recommendations         16
trailer_views            6
trailer_likes            6
dtype: int64

In [70]:
movie_stg2.columns

Index(['gross', 'Budget', 'Title', 'Genre', 'Description', 'Director',
       'Actors', 'Year', 'Runtime (Minutes)', 'Rating', 'Votes',
       'Revenue (Millions)', 'Metascore', 'id', 'genres', 'original_language',
       'overview', 'popularity', 'production_companies', 'release_date',
       'budget', 'revenue', 'runtime', 'status', 'tagline', 'vote_average',
       'vote_count', 'credits', 'keywords', 'poster_path', 'backdrop_path',
       'recommendations', 'trailer_views', 'trailer_likes'],
      dtype='object')

In [89]:
movie_stg2.head()

Unnamed: 0,gross,Budget,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,...,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations,trailer_views,trailer_likes
0,291465373.0,90000000.0,The Departed,"Crime,Drama,Thriller",An undercover cop and a mole in the police att...,Martin Scorsese,"Leonardo DiCaprio, Matt Damon, Jack Nicholson,...",2006,151,8.5,...,Lies. Betrayal. Sacrifice. How far will you ta...,8.164,13185.0,Jack Nicholson-Leonardo DiCaprio-Matt Damon-Ma...,undercover-boston massachusetts-irish-american...,/nT97ifVT2J1yMQmeq20Qblg61T.jpg,/6WRrGYalXXveItfpnipYdayFkQB.jpg,11324-769-1124-16869-807-106646-98-500-857-640...,18113320.0,9019804.0
1,158964610.0,85000000.0,The Fast and the Furious: Tokyo Drift,"Action,Crime,Thriller",A teenager becomes a major competitor in the w...,Justin Lin,"Lucas Black, Zachery Ty Bryan, Shad Moss, Dami...",2006,104,6.0,...,"On the streets of Tokyo, speed needs no transl...",6.4,5782.0,Lucas Black-Nathalie Kelley-Sung Kang-Shad Mos...,car race-car journey-car mechanic-car garage-a...,/cm2ffqb3XovzA5ZSzyN3jnn8qv0.jpg,/dMARcKLrv0T7kVJ4iQR3vqTTdtT.jpg,584-13804-9799-51497-82992-168259-337339-98833...,5714873.0,2590683.0
2,109676311.0,40000000.0,The Prestige,"Drama,Mystery,Sci-Fi",Two stage magicians engage in competitive one-...,Christopher Nolan,"Christian Bale, Hugh Jackman, Scarlett Johanss...",2006,130,8.5,...,Are You Watching Closely?,8.203,13936.0,Hugh Jackman-Christian Bale-Michael Caine-Scar...,competition-obsession-magic-diary-dying and de...,/bdN3gXuIZYaJP7ftKK2sU0nPtEA.jpg,/mfJepkInUbiZ0mFXFhDNz8ko6Zr.jpg,77-11324-1422-807-27205-272-155-16869-550-629-...,-2292396.0,4893181.0
3,461991867.0,120000000.0,Cars,"Animation,Adventure,Comedy",A hot-shot race-car named Lightning McQueen ge...,John Lasseter,"Owen Wilson, Bonnie Hunt, Paul Newman, Larry t...",2006,117,7.1,...,Ahhh... it's got that new movie smell.,6.9,12346.0,Owen Wilson-Larry the Cable Guy-Bonnie Hunt-Pa...,car race-success-route 66-porsche-retirement-f...,/u4G8EkiIBZYx0wEg2xDlXZigTOZ.jpg,/sd4xN5xi8tKRPrJOWwNiZEile7f.jpg,49013-12-863-585-9806-862-10193-2062-953-14160...,20515320.0,11453550.0
4,456068181.0,65000000.0,300,"Action,Fantasy,War",King Leonidas of Sparta and a force of 300 men...,Zack Snyder,"Gerard Butler, Lena Headey, David Wenham, Domi...",2006,117,7.7,...,"Spartans, prepare for glory!",7.167,12330.0,Gerard Butler-Lena Headey-Dominic West-David W...,evisceration-javelin-shield-army-epic-based on...,/9W49fy5G7v9Ed3CXtvMi41YqZtt.jpg,/eGhjeUbzttA3E4flxdAm8gHz4h4.jpg,53182-14161-98-6479-10528-36557-27578-604-607-...,1832150.0,12364730.0


In [90]:
benchmark_distance = 12.15  # Average edit distance of the first 20 movies
distance_list = []

# Compare raw2's genre string ('Genre') with raw3's genre string ('genres') row
# by row. The columns are selected by label rather than by position, so the
# comparison keeps working if columns are added, dropped or reordered upstream.
# `i` is the positional row number, not the DataFrame index label.
for i, (genre1, genre2) in enumerate(zip(movie_stg2['Genre'], movie_stg2['genres'])):
    try:
        # Rows with a missing genre on either side cannot be compared; keep a
        # None placeholder so list positions stay aligned with the rows.
        # The null check stays inside the try: should a cell ever hold an
        # array-like value, its truth test raises ValueError, which is
        # reported below instead of aborting the loop.
        if pd.isnull(genre1) or pd.isnull(genre2):
            print(f"Null value at index {i}: Skipping this row.")
            distance_list.append(None)  # Append None for rows with null values
            continue

        # Compute the edit distance if both genres are valid
        distance_list.append(edit_distance(genre1, genre2))

    except ValueError:
        print(f"ValueError at index {i}: Skipping this row.")
        distance_list.append(None)
    except Exception as e:
        # Best-effort pass over the data: report the row and carry on.
        print(f"Unexpected error at index {i}: {e}")
        distance_list.append(None)


Null value at index 29: Skipping this row.
Null value at index 337: Skipping this row.
Null value at index 533: Skipping this row.
Null value at index 575: Skipping this row.
Null value at index 576: Skipping this row.
Null value at index 598: Skipping this row.


In [104]:
# Drop every None placeholder, not just the first one: list.remove() deletes a
# single occurrence, so one call would leave the other placeholders in place
# and np.mean() would then fail on them. Unlike remove(), this filter is also
# safe to re-run (remove() raises ValueError once no None is left).
distance_list = [d for d in distance_list if d is not None]

In [105]:
len(distance_list)

607

In [106]:
np.mean(distance_list)

np.float64(12.99835255354201)