# Stage One: Data Overview

In [101]:
import pandas as pd

In [102]:
df = pd.read_csv('../data/movies_and_shows.csv', sep=";", header=1)

In [103]:
# Obtaining the first 10 rows of our data
df.head(10)

Unnamed: 0,name,Character,r0le,TITLE,Type,release Year,genres,imdb sc0re,imdb v0tes,Unnamed: 9
0,0,Robert De Niro,Travis Bickle,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
1,1,Jodie Foster,Iris Steensma,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
2,2,Albert Brooks,Tom,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
3,3,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
4,4,Cybill Shepherd,Betsy,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
5,5,Peter Boyle,Wizard,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
6,6,Leonard Harris,Senator Charles Palantine,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
7,7,Diahnne Abbott,Concession Girl,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
8,8,Gino Ardito,Policeman at Rally,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
9,9,Martin Scorsese,Passenger Watching Silhouette,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0


In [104]:
# Getting the info of our dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85579 entries, 0 to 85578
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          85579 non-null  int64  
 1   Character     85579 non-null  object 
 2   r0le          85579 non-null  object 
 3   TITLE         85579 non-null  object 
 4     Type        85578 non-null  object 
 5   release Year  85579 non-null  object 
 6   genres        85579 non-null  int64  
 7   imdb sc0re    85579 non-null  object 
 8   imdb v0tes    80970 non-null  float64
 9   Unnamed: 9    80853 non-null  float64
dtypes: float64(2), int64(2), object(6)
memory usage: 6.5+ MB


**Conclusions**

Each row in the table stores data about a movie or show. The columns can be divided into two categories: the first is about the roles held by different people who worked on the movie or show (role, name of the actor or director, and character if the row is about an actor); the second category is information about the movie or show itself (title, release year, genre, imdb figures).

It's clear that there is sufficient data to do the analysis and evaluate our assumption. However, to move forward, I need to preprocess the data.

# Stage Two: Data Processing

In [105]:
# When we printed the dataset info it was clear our column names needed some help. Lets look again
df.columns

Index(['name', 'Character', 'r0le', 'TITLE', '  Type', 'release Year',
       'genres', 'imdb sc0re', 'imdb v0tes', 'Unnamed: 9'],
      dtype='object')

In [106]:
# Lets rename our columns
df = df.rename(columns={
    '   name': 'name',
    'Character': 'character',
    'r0le': 'role',
    'TITLE': 'title',
    '  Type': 'type',
    'release Year': 'release_year',
    'imdb sc0re': 'imdb_score',
    'imdb v0tes': 'imdb_votes',
 })

In [107]:
# Lets check our column names again
df.columns

Index(['name', 'character', 'role', 'title', 'type', 'release_year', 'genres',
       'imdb_score', 'imdb_votes', 'Unnamed: 9'],
      dtype='object')

We also need to check our dataset for missing values and process those missing values:

In [108]:
# Getting the sum of the missing values in our dataset

df.isna().sum()

name               0
character          0
role               0
title              0
type               1
release_year       0
genres             0
imdb_score         0
imdb_votes      4609
Unnamed: 9      4726
dtype: int64

In [109]:
# dropping rows where columns with scores, and votes have missing values
df.dropna(inplace=True)
df

# Or I can simply reassign since the dropna method creates a new dataframe
# df = df.dropna()

Unnamed: 0,name,character,role,title,type,release_year,genres,imdb_score,imdb_votes,Unnamed: 9
0,0,Robert De Niro,Travis Bickle,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
1,1,Jodie Foster,Iris Steensma,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
2,2,Albert Brooks,Tom,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
3,3,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
4,4,Cybill Shepherd,Betsy,ACTOR,Taxi Driver,MOVIE,1976,"['drama', 'crime']",8.2,808582.0
...,...,...,...,...,...,...,...,...,...,...
85574,85574,Adelaida Buscato,Mar??a Paz,ACTOR,Lokillo,the movie,2021,['comedy'],3.8,68.0
85575,85575,Luz Stella Luengas,Karen Bayona,ACTOR,Lokillo,the movie,2021,['comedy'],3.8,68.0
85576,85576,In??s Prieto,Fanny,ACTOR,Lokillo,the movie,2021,['comedy'],3.8,68.0
85577,85577,Isabel Gaona,Cacica,ACTOR,Lokillo,MOVIE,2021,['comedy'],3.8,68.0


In [110]:
# Lets ensure we dont have any missing values now that we dropped all rows containing missing values
df.isna().sum()

name            0
character       0
role            0
title           0
type            0
release_year    0
genres          0
imdb_score      0
imdb_votes      0
Unnamed: 9      0
dtype: int64

We also need to check for and process duplicate values:

In [111]:
# Lets get the sum of our duplicates
df.duplicated().sum()

np.int64(0)