## This file focuses on the rest of the raw data.
 - Including - final_dataset.csv, IMDB-Movie-Data.csv, Movie_data.csv, movies.csv
 - End result - A csv file named 'many_attributes.csv' that has only 505 records, but around 30 attributes

In [1]:
import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Locations of the four raw CSV inputs, all under the same 'raw data' folder.
RAW_DIR = 'raw data'
path1 = RAW_DIR + '/final_dataset.csv'
path2 = RAW_DIR + '/IMDB-Movie-Data.csv'
path3 = RAW_DIR + '/Movie_data.csv'
path4 = RAW_DIR + '/movies.csv'

In [3]:
# Helper functions
def edit_distance(str1, str2):
    """Return the edit (Levenshtein) distance between ``str1`` and ``str2``.

    The distance is the minimum number of single-character insertions,
    removals and replacements needed to turn ``str1`` into ``str2``.
    Only the previous and the current row of the DP table are kept.
    """
    # Row for the empty prefix of str1: reaching str2[:j] takes j insertions.
    prev_row = list(range(len(str2) + 1))

    for row_no, ch1 in enumerate(str1, start=1):
        # Column 0: reaching the empty string from str1[:row_no] takes row_no removals.
        cur_row = [row_no]
        for col_no, ch2 in enumerate(str2, start=1):
            if ch1 == ch2:
                # Characters match, no extra cost.
                cost = prev_row[col_no - 1]
            else:
                cost = 1 + min(
                    prev_row[col_no],      # Remove
                    cur_row[col_no - 1],   # Insert
                    prev_row[col_no - 1],  # Replace
                )
            cur_row.append(cost)
        prev_row = cur_row

    return prev_row[-1]

def detect_outliers(distance_list, threshold=2):
    """
    Detects outliers based on a specified threshold (in terms of standard deviations).

    An entry is an outlier when its distance lies more than ``threshold``
    standard deviations below or above the mean of all valid distances.
    Entries that are not 2-tuples, or whose distance is None, are skipped.
    A short summary (mean, standard deviation, bounds) is printed.

    Parameters:
    - distance_list: List of tuples (index, distance) where distance can be None.
    - threshold: Number of standard deviations to use as the outlier threshold.

    Returns:
    - outliers: List of tuples (index, distance) for outliers; an empty list
      when there are no valid entries.
    """
    # Filter out invalid entries (None, non-tuples, wrong length, missing distance).
    # The length check prevents an IndexError on tuples shorter than two items.
    valid_entries = [
        entry for entry in distance_list
        if isinstance(entry, tuple) and len(entry) == 2 and entry[1] is not None
    ]

    if not valid_entries:
        print("No valid distances to process.")
        return []

    # Only the distances are needed for the statistics.
    distances = [dist for _, dist in valid_entries]

    # Calculate mean and standard deviation (np.std defaults to the population std).
    mean_distance = np.mean(distances)
    std_distance = np.std(distances)

    # Define outlier thresholds
    lower_bound = mean_distance - threshold * std_distance
    upper_bound = mean_distance + threshold * std_distance

    # Detect outliers
    outliers = [(idx, dist) for idx, dist in valid_entries if dist < lower_bound or dist > upper_bound]

    # Print summary
    print(f"Mean Distance: {mean_distance:.2f}")
    print(f"Standard Deviation: {std_distance:.2f}")
    print(f"Lower Bound: {lower_bound:.2f}, Upper Bound: {upper_bound:.2f}")

    return outliers



def initial_inspection(file):
    """Read a CSV file, print a quick overview of it, and return the DataFrame.

    The overview consists of the column index, the frame's shape and the
    output of ``DataFrame.head()``, separated by dashed rules.

    Parameters:
    - file: Anything ``pd.read_csv`` accepts as its first argument.

    Returns:
    - The DataFrame read from ``file``.
    """
    frame = pd.read_csv(file)
    rule = '-------------------------------------------------------------------------------------------------------------------------------------------------------'

    # Each summary line is followed by a rule; the head preview comes last.
    for summary in (f'Column attributes: {frame.columns}', f'DF shape: {frame.shape}'):
        print(summary)
        print(rule)
    print(frame.head())
    return frame

### Initial Import

#### final_dataset.csv

This dataset has important attributes like actor names, gross income, and budget information. A potential problem with this dataset is that it does not come with a date.

In [4]:
# Read the CSV at path1 and print its columns, shape and head.
raw1 = initial_inspection(path1)

Column attributes: Index(['name', 'genre', 'score', 'director', 'actor_2_name', 'actor_1_name',
       'gross', 'budget'],
      dtype='object')
-------------------------------------------------------------------------------------------------------------------------------------------------------
DF shape: (7118, 8)
-------------------------------------------------------------------------------------------------------------------------------------------------------
                                             name      genre  score  \
0                                     The Shining      Drama    8.4   
1                                 The Blue Lagoon  Adventure    5.8   
2  Star Wars: Episode V - The Empire Strikes Back     Action    8.7   
3                                       Airplane!     Comedy    7.7   
4                                      Caddyshack     Comedy    7.3   

          director        actor_2_name      actor_1_name        gross  \
0  Stanley Kubrick      Shelley

#### IMDB-Movie-Data.csv

This data file also has some interesting variables, such as movie description, full actor list, and movie runtime. However, there is a lot of fact checking that we have to do during the merge.

In [5]:
# Read the CSV at path2 and print its columns, shape and head.
raw2 = initial_inspection(path2)

Column attributes: Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
       'Metascore'],
      dtype='object')
-------------------------------------------------------------------------------------------------------------------------------------------------------
DF shape: (1000, 12)
-------------------------------------------------------------------------------------------------------------------------------------------------------
   Rank                    Title                     Genre  \
0     1  Guardians of the Galaxy   Action,Adventure,Sci-Fi   
1     2               Prometheus  Adventure,Mystery,Sci-Fi   
2     3                    Split           Horror,Thriller   
3     4                     Sing   Animation,Comedy,Family   
4     5            Suicide Squad  Action,Adventure,Fantasy   

                                         Description              Director  \
0  A group of i

#### Movie_data.csv 

In [6]:
# Read the CSV at path3 and print its columns, shape and head.
raw3 = initial_inspection(path3) 

Column attributes: Index(['id', 'title', 'genres', 'original_language', 'overview', 'popularity',
       'production_companies', 'release_date', 'budget', 'revenue', 'runtime',
       'status', 'tagline', 'vote_average', 'vote_count', 'credits',
       'keywords', 'poster_path', 'backdrop_path', 'recommendations',
       'trailer_views', 'trailer_likes'],
      dtype='object')
-------------------------------------------------------------------------------------------------------------------------------------------------------
DF shape: (17034, 22)
-------------------------------------------------------------------------------------------------------------------------------------------------------
       id                              title  \
0  615656                  Meg 2: The Trench   
1  758323                The Pope's Exorcist   
2  667538   Transformers: Rise of the Beasts   
3  640146  Ant-Man and the Wasp: Quantumania   
4  677179                          Creed III   

     

#### movies.csv

In [7]:
# Read the CSV at path4 and print its columns, shape and head.
raw4 = initial_inspection(path4)

Column attributes: Index(['name', 'rating', 'genre', 'year', 'released', 'score', 'votes',
       'director', 'writer', 'star', 'country', 'budget', 'gross', 'company',
       'runtime'],
      dtype='object')
-------------------------------------------------------------------------------------------------------------------------------------------------------
DF shape: (7668, 15)
-------------------------------------------------------------------------------------------------------------------------------------------------------
                                             name rating      genre  year  \
0                                     The Shining      R      Drama  1980   
1                                 The Blue Lagoon      R  Adventure  1980   
2  Star Wars: Episode V - The Empire Strikes Back     PG     Action  1980   
3                                       Airplane!     PG     Comedy  1980   
4                                      Caddyshack      R     Comedy  1980   

  

### Fact checking

We will check whether the important variables match across sources (if they do not, then at least one of the merged sources is not reliable).

In [8]:
# Start with raw1 and raw2: inner-join them on the composite
# (title, director) key, then show the shape of the result.
movie_stg0 = raw1.merge(
    raw2,
    how='inner',
    left_on=['name', 'director'],
    right_on=['Title', 'Director'],
)
movie_stg0.shape

(573, 20)

The movies seem to match after the initial merge on a composite key (title and director), but there are only a few hundred matched movies.

In [9]:
# Count missing values per column in the raw1/raw2 merge result.
movie_stg0.isna().sum()

name                   0
genre                  0
score                  0
director               0
actor_2_name           0
actor_1_name           0
gross                  0
budget                 0
Rank                   0
Title                  0
Genre                  0
Description            0
Director               0
Actors                 0
Year                   0
Runtime (Minutes)      0
Rating                 0
Votes                  0
Revenue (Millions)     7
Metascore             22
dtype: int64

In [10]:
# Remove irrelevant/redundant attributes; drop() returns a new frame.
movie_stg1 = movie_stg0.drop(
    columns=[
        'name',
        'genre',
        'score',
        'director',
        'actor_2_name',
        'actor_1_name',
        'Rank',
    ]
)

In [11]:
# Let's continue with raw3
# Merge on the (title, budget) pair. The left-hand key is renamed to 'Budget'
# so that both budget columns remain in the merged result. rename() is chained
# (no inplace=True), so movie_stg1 itself is not mutated by this cell.
movie_stg2 = (
    movie_stg1
    .rename(columns={'budget': 'Budget'})
    .merge(raw3, left_on=['Title', 'Budget'], right_on=['title', 'budget'])
)

In [12]:
# Shape (rows, columns) of the frame after merging in raw3.
movie_stg2.shape

(505, 35)

In [13]:
# Count missing values per column after merging in raw3.
movie_stg2.isna().sum()

gross                    0
Budget                   0
Title                    0
Genre                    0
Description              0
Director                 0
Actors                   0
Year                     0
Runtime (Minutes)        0
Rating                   0
Votes                    0
Revenue (Millions)       7
Metascore               20
id                       0
title                    0
genres                   0
original_language        0
overview                 0
popularity               0
production_companies     2
release_date             0
budget                   0
revenue                  0
runtime                  0
status                   0
tagline                 22
vote_average             0
vote_count               0
credits                  0
keywords                 0
poster_path              0
backdrop_path            0
recommendations          0
trailer_views            0
trailer_likes            0
dtype: int64

In [14]:
# List the columns of the merged frame.
movie_stg2.columns

Index(['gross', 'Budget', 'Title', 'Genre', 'Description', 'Director',
       'Actors', 'Year', 'Runtime (Minutes)', 'Rating', 'Votes',
       'Revenue (Millions)', 'Metascore', 'id', 'title', 'genres',
       'original_language', 'overview', 'popularity', 'production_companies',
       'release_date', 'budget', 'revenue', 'runtime', 'status', 'tagline',
       'vote_average', 'vote_count', 'credits', 'keywords', 'poster_path',
       'backdrop_path', 'recommendations', 'trailer_views', 'trailer_likes'],
      dtype='object')

In [15]:
# Remove irrelevant/redundant attributes; drop() returns a new frame.
movie_stg3 = movie_stg2.drop(
    columns=['id', 'genres', 'overview', 'Budget', 'runtime']
)

In [16]:
# Count missing values per column in movie_stg3.
movie_stg3.isna().sum()

gross                    0
Title                    0
Genre                    0
Description              0
Director                 0
Actors                   0
Year                     0
Runtime (Minutes)        0
Rating                   0
Votes                    0
Revenue (Millions)       7
Metascore               20
title                    0
original_language        0
popularity               0
production_companies     2
release_date             0
budget                   0
revenue                  0
status                   0
tagline                 22
vote_average             0
vote_count               0
credits                  0
keywords                 0
poster_path              0
backdrop_path            0
recommendations          0
trailer_views            0
trailer_likes            0
dtype: int64

In [17]:
# Shape (rows, columns) of movie_stg3.
movie_stg3.shape

(505, 30)

In [18]:
# Let's continue with raw4
# List raw4's columns.
raw4.columns

Index(['name', 'rating', 'genre', 'year', 'released', 'score', 'votes',
       'director', 'writer', 'star', 'country', 'budget', 'gross', 'company',
       'runtime'],
      dtype='object')

The raw4 file doesn't seem to contain interesting or valuable information, so we will not use it.

In [19]:
# Write movie_stg3 to a CSV file. to_csv is called with its default
# arguments, so the DataFrame index is written as an extra leading column.
# NOTE(review): pass index=False if readers of this file do not expect that
# column — confirm against the code that loads it.
movie_stg3.to_csv('data/many_attributes.csv')