# Data Cleaning

## Imports

In [37]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Movies Duration

In this first part, we clean the dataset and reduce it to a much smaller CSV file, which the Streamlit app will use to plot the graph.

In [17]:
# Download the title.basics dump (tab-separated, gzipped) into a dataframe.
# The file is large, so the download is kept alone in this cell and does not
# have to be repeated when the cleaning cells below are re-run.

basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
basics_df = pd.read_csv(basics_url, sep="\t", low_memory=False)

In [18]:
# Keep only the non-adult titles whose titleType is 'movie'.
# isAdult is passed through pd.to_numeric so the filter gives the same answer
# whether read_csv inferred the column as strings ('0'/'1') or as numbers;
# comparing a numeric column to the string '0' would silently match no row.
# Values that cannot be parsed become NaN and are therefore excluded.

basics_not_adult = pd.to_numeric(basics_df['isAdult'], errors='coerce').eq(0)
basics_is_movie = basics_df['titleType'].eq('movie')
basics_df_movies = basics_df[basics_not_adult & basics_is_movie]

# Missing values are stored as the literal string '\N'; turn them into real NaN

basics_df_movies_nan = basics_df_movies.replace('\\N', np.nan)

# Only the release year and the duration are needed from here on;
# rows missing either of them are dropped

runtime = basics_df_movies_nan[['startYear', 'runtimeMinutes']]
runtime_dropna = runtime.dropna()


In [19]:
# Cast both columns to integers in a single pass, then keep the movies whose
# duration lies between 58 and 270 minutes (both bounds included)

runtime_int = runtime_dropna.astype({'runtimeMinutes': int, 'startYear': int})
runtime_int = runtime_int[runtime_int['runtimeMinutes'].between(58, 270)]

# Restrict the date range to releases from 1918 through 2021 (both included)

runtime_int = runtime_int[runtime_int['startYear'].between(1918, 2021)]

# Average duration per release year, rounded to 2 decimals.
# The result is a Series named 'runtimeMinutes' and indexed by startYear.

runtime_mean = runtime_int.groupby('startYear').mean()['runtimeMinutes'].round(2)


In [20]:
# Line chart of the yearly average duration.
# No x is passed: plotly express is expected to fall back on the Series index
# (startYear), which is why the labels mapping also renames 'startYear'.

duration_labels = {'startYear': 'Year', 'runtimeMinutes': 'Movie Duration in Minutes'}

fig = px.line(
    runtime_mean,
    y='runtimeMinutes',
    labels=duration_labels,
    title='Average movie duration per year',
    color_discrete_sequence=['green'],
    line_shape='spline',
)

# Figure-level styling: fixed size and the dark theme
fig.update_layout(template='plotly_dark', width=1300, height=600)
fig.show()

In [42]:
# Write the yearly averages to a zip-compressed csv file.
# The target sits in the relative 'data/' folder, which has to exist already.

duration_export_path = 'data/movies_duration.csv.zip'
runtime_mean.to_csv(duration_export_path, compression='zip')

## Top Rated Movies

In [22]:
# Download the title.ratings dump (tab-separated, gzipped) into a dataframe

ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings_df = pd.read_csv(ratings_url, sep="\t")

In [23]:
# basics_df_movies already holds the non-adult movies from title.basics,
# so only the columns relevant to the ratings view need to be picked here

rating_view_columns = ['tconst', 'primaryTitle', 'startYear', 'genres']
basics_df_stripped = basics_df_movies[rating_view_columns]

# Inner join with the ratings on the title id: titles without a rating are
# dropped. The result gets a clean 0..n-1 index.

movies_ratings = (
    basics_df_stripped
    .merge(ratings_df, how='inner', on='tconst')
    .reset_index(drop=True)
)

In [24]:
# Keep only the very best movies that were also decently popular: an average
# rating of at least 8.4 backed by at least 20000 votes. The vote floor also
# removes outliers whose rating rests on very few, possibly biased, votes.
# .copy() makes the result an independent frame, so the column assignments
# below do not write into a slice of movies_ratings (SettingWithCopyWarning).

is_top_rated = movies_ratings['averageRating'] >= 8.4
is_popular = movies_ratings['numVotes'] >= 20000
movies_ratings_top = movies_ratings[is_top_rated & is_popular].copy()

# Split the genres string on its first comma only: the first entry becomes the
# main genre, everything after it stays together as the secondary genres.
# The split runs on the filtered rows only, instead of splitting every row of
# movies_ratings and relying on index alignment to discard most of the work.
# reindex guarantees that both columns exist even when no remaining row
# contains a comma (str.split would then expand to a single column).

genre_parts = (
    movies_ratings_top['genres']
    .str.split(',', n=1, expand=True)
    .reindex(columns=[0, 1])
)
movies_ratings_top['mainGenre'] = genre_parts[0]
movies_ratings_top['secondaryGenres'] = genre_parts[1]


In [25]:
# 3D scatter of the top-rated movies: year on x, rating on y, main genre on z.
# Marker colour follows the rating and marker size the number of votes.

scatter_labels = {
    'startYear': 'Year',
    'averageRating': 'Rating',
    'mainGenre': 'Genre',
    'numVotes': 'Number of Votes',
}

fig = px.scatter_3d(
    movies_ratings_top,
    x='startYear',
    y='averageRating',
    z='mainGenre',
    color='averageRating',
    size='numVotes',
    size_max=25,
    opacity=0.8,
    hover_name='primaryTitle',
    labels=scatter_labels,
)

# Figure-level styling; the genre (z) axis is asked for 11 ticks
scatter_layout = dict(
    title='IMDB Top Rated Movies (>= 8.4) per Genre, Number of Votes and Year',
    template='plotly_dark',
    scene=dict(zaxis=dict(nticks=11)),
    width=1300,
    height=1000,
)
fig.update_layout(**scatter_layout)
fig.show()

In [26]:
# Count of top-rated movies per main genre, one coloured bar per genre

genre_palette = px.colors.qualitative.Pastel

fig = px.histogram(
    data_frame=movies_ratings_top,
    x='mainGenre',
    color='mainGenre',
    color_discrete_sequence=genre_palette,
    labels={'mainGenre': 'Genre'},
)

# Figure-level styling: title, fixed size and the dark theme
histogram_layout = dict(
    title='IMDB Top Rated Movies (>= 8.4) Genre Distribution',
    template='plotly_dark',
    width=1300,
    height=600,
)
fig.update_layout(**histogram_layout)
fig.show()

In [43]:
# Write the top-rated movies to a zip-compressed csv file.
# The target sits in the relative 'data/' folder, which has to exist already.

ratings_export_path = 'data/movies_ratings.csv.zip'
movies_ratings_top.to_csv(ratings_export_path, compression='zip')

## Most Active Actors

In [29]:
# Download two more dumps (tab-separated, gzipped): name.basics into name_df
# and title.principals into principals_df

name_basics_url = "https://datasets.imdbws.com/name.basics.tsv.gz"
title_principals_url = "https://datasets.imdbws.com/title.principals.tsv.gz"

name_df = pd.read_csv(name_basics_url, sep="\t")
principals_df = pd.read_csv(title_principals_url, sep="\t")

In [30]:
# Keep only the principals whose category is 'actor' or 'actress'

acting_categories = ['actor', 'actress']
actors = principals_df[principals_df['category'].isin(acting_categories)]

# Attach the name records with an inner join.
# No `on=` is given, so pandas joins on every column the two frames share.
# NOTE(review): presumably that is just the person identifier; confirm against
# the two schemas and consider naming the key explicitly.

list_actors = actors.merge(name_df, how='inner')

In [31]:
# Reuse the title.basics dataframe: bring the title attributes needed later
# (year, type, adult flag, duration, genres) onto every actor row.
# No `on=` is given, so pandas joins on every column the two frames share.
# NOTE(review): presumably only 'tconst' is shared; confirm and consider
# naming the key explicitly.

cols = ['tconst', 'startYear', 'titleType', 'isAdult', 'runtimeMinutes', 'genres']
title_attributes = basics_df[cols]
actors_titleType = list_actors.merge(title_attributes, how='inner')

In [32]:
# Remove the adult titles again (the analysis shows this flag is not a perfect
# filter). isAdult is passed through pd.to_numeric so the test behaves the same
# whether the column was read as strings ('0'/'1') or as numbers; comparing a
# numeric column to the string '0' would silently match no row.

actors_not_adult = pd.to_numeric(actors_titleType['isAdult'], errors='coerce').eq(0)
actors_titleType_no_adult = actors_titleType[actors_not_adult]

# Keep only the rows whose titleType is 'movie', turn the '\N' placeholders
# into NaN and narrow the frame to the three columns used below.
# np.nan is used rather than pd.NaT: NaT is the datetime missing marker,
# whereas these columns hold text and numbers, and the Movies Duration
# section of this notebook already uses np.nan for the same replacement.

actors_movies = actors_titleType_no_adult[actors_titleType_no_adult['titleType'] == 'movie']
actors_movies_nan = actors_movies.replace('\\N', np.nan)
actors_movies_filtre = actors_movies_nan[['primaryName', 'runtimeMinutes', 'startYear']]

# Drop the rows with a missing value, then cast year and duration to integers

actors_movies_clean = actors_movies_filtre.dropna()
actors_movies_clean = actors_movies_clean.astype({'startYear': int, 'runtimeMinutes': int})

# Same duration window as in the Movies Duration section (58 to 270 minutes,
# both included); the first year is 1920 so that the decades are complete

actors_movies_clean = actors_movies_clean[actors_movies_clean['runtimeMinutes'].between(58, 270)]
actors_movies_year = actors_movies_clean.loc[actors_movies_clean['startYear'] >= 1920]
actors_movies_year_final = actors_movies_year[['primaryName', 'startYear']]

In [33]:
# Plot the 5 most productive actors in movies for each decade from 1920-1929
# to 2020-2029, plus one panel over all years, on a grid of bar charts.
# The panels are built in a loop, and traces are added with add_trace(row=,
# col=) because Figure.append_trace is deprecated.


def top_actors(appearances, how_many=5):
    """Return the most frequent names of a frame as a two-column dataframe.

    Parameters
    ----------
    appearances : pandas.DataFrame
        Frame with a 'primaryName' column, one row per (actor, movie) pair.
    how_many : int, default 5
        Number of names to keep.

    Returns
    -------
    pandas.DataFrame
        Columns 'name' and 'count', ordered by decreasing count. It has fewer
        than `how_many` rows (possibly none) when there are not enough names.
    """
    return (
        appearances['primaryName']
        .value_counts()
        .head(how_many)
        .rename_axis('name')
        .reset_index(name='count')
    )


first_decade_start = 1920
decade_count = 11
grid_cols = 3

# One top-5 table and one title per decade; each title is derived from the
# decade bounds so the two can never drift apart
subplot = []
decade_titles = []
for depart in range(first_decade_start, first_decade_start + 10 * decade_count, 10):
    fin = depart + 9
    in_decade = actors_movies_year['startYear'].between(depart, fin)
    subplot.append(top_actors(actors_movies_year[in_decade]))
    decade_titles.append(f'{depart}-{fin}')

# Top 5 over the whole period
globa = top_actors(actors_movies_year)

# NOTE(review): the last panel title is French ("all periods combined") while
# the rest of the figure is in English; the text is deliberately left as is.
panels = subplot + [globa]
panel_titles = decade_titles + ['toutes périodes confondues']
grid_rows = -(-len(panels) // grid_cols)  # ceiling division: 12 panels -> 4 rows

fig = make_subplots(rows=grid_rows, cols=grid_cols, subplot_titles=panel_titles)

# Fill the grid row by row; plotly rows and columns are 1-based
for position, panel in enumerate(panels):
    row_offset, col_offset = divmod(position, grid_cols)
    fig.add_trace(
        go.Bar(
            x=panel['name'],
            y=panel['count'],
            marker_color=px.colors.qualitative.Plotly,
        ),
        row=row_offset + 1,
        col=col_offset + 1,
    )

fig.update_layout(
    template='plotly_dark',
    title='5 most present actors in movies per decade',
    showlegend=False,
    height=1250,
    width=1000
)
fig.show()

In [None]:
# Write the cleaned actor rows to a zip-compressed csv file.
# NOTE(review): unlike the other exports in this notebook, the file lands in
# the working directory rather than under 'data/'; confirm this is intended.
# NOTE(review): actors_movies_year still carries runtimeMinutes, while the
# narrower actors_movies_year_final built earlier is never used; confirm
# which of the two the consumer of this file expects.

actors_export_path = 'actors_movies_year.csv.zip'
actors_movies_year.to_csv(actors_export_path, compression='zip')

In [None]:
# Start of the same pipeline for series: select the actor rows whose titleType
# is 'tvSeries' (no adult filtering is applied at this step)

is_tv_series = actors_titleType['titleType'].eq('tvSeries')
actors_series = actors_titleType.loc[is_tv_series]