In [15]:
import os

import pandas as pd
import numpy as np

import plotly.express as px

In [2]:
files_path = "./ml-latest-small"

links_df = pd.read_csv(os.path.join(files_path, "links.csv"))
movies_df = pd.read_csv(os.path.join(files_path, "movies.csv"))
ratings_df = pd.read_csv(os.path.join(files_path, "ratings.csv"))
tags_df = pd.read_csv(os.path.join(files_path, "tags.csv"))

In [3]:
for name, df in zip(['links', 'movies', 'ratings', 'tags'], 
              [links_df, movies_df, ratings_df, tags_df]):

    print(f'{name} shape: {df.shape}')
    display(df.head())
    print()


links shape: (9742, 3)


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0



movies shape: (9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy



ratings shape: (100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931



tags shape: (3683, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200





In [4]:
ratings_df.loc[ratings_df.movieId == 1].userId.value_counts()

userId
1      1
380    1
382    1
385    1
389    1
      ..
214    1
216    1
217    1
219    1
610    1
Name: count, Length: 215, dtype: int64

In [5]:
movies_df.rename({"genres": "general_genres"}, axis=1, inplace=True)
movies_df['genres'] = movies_df['general_genres'].str.split('|')
movies_df = movies_df.explode('genres')

def cat_year(row):
    try:
        return int( row[-6:].replace("(", "").replace(")", "") )
    except:
        return np.nan
    
movies_df["year"] = movies_df.title.apply(cat_year)
movies_df['title'] = movies_df['title'].str.replace(r"\(.*\)", "", regex=True).str.strip()

In [6]:
movies_df.head()

Unnamed: 0,movieId,title,general_genres,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,Adventure,1995.0
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,Animation,1995.0
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,Children,1995.0
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,Comedy,1995.0
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,Fantasy,1995.0


In [7]:
movies_df.loc[movies_df.genres == "(no genres listed)"]

Unnamed: 0,movieId,title,general_genres,genres,year
8517,114335,La cravate,(no genres listed),(no genres listed),1957.0
8684,122888,Ben-hur,(no genres listed),(no genres listed),2016.0
8687,122896,Pirates of the Caribbean: Dead Men Tell No Tales,(no genres listed),(no genres listed),2017.0
8782,129250,Superfast!,(no genres listed),(no genres listed),2015.0
8836,132084,Let It Be Me,(no genres listed),(no genres listed),1995.0
8902,134861,Trevor Noah: African American,(no genres listed),(no genres listed),2013.0
9033,141131,Guardians,(no genres listed),(no genres listed),2016.0
9053,141866,Green Room,(no genres listed),(no genres listed),2015.0
9070,142456,The Brand New Testament,(no genres listed),(no genres listed),2015.0
9091,143410,Hyena Road,(no genres listed),(no genres listed),


In [8]:
movies_df.year.isna().sum()

# movies_df.to_csv("./movies_adj.csv", index=False, decimal=".")

16

In [9]:
ratings_adj_df = ratings_df.merge(movies_df[["movieId", "title", "year"]].drop_duplicates(), 
                                  on="movieId", how="inner")

display( ratings_adj_df.shape, ratings_df.shape )

display( ratings_adj_df.head() )

(100836, 6)

(100836, 4)

Unnamed: 0,userId,movieId,rating,timestamp,title,year
0,1,1,4.0,964982703,Toy Story,1995.0
1,5,1,4.0,847434962,Toy Story,1995.0
2,7,1,4.5,1106635946,Toy Story,1995.0
3,15,1,2.5,1510577970,Toy Story,1995.0
4,17,1,4.5,1305696483,Toy Story,1995.0


In [10]:
display( ratings_adj_df.describe() )

Unnamed: 0,userId,movieId,rating,timestamp,year
count,100836.0,100836.0,100836.0,100836.0,100818.0
mean,326.127564,19435.295718,3.501557,1205946000.0,1994.44499
std,182.618491,35530.987199,1.042529,216261000.0,14.361424
min,1.0,1.0,0.5,828124600.0,1902.0
25%,177.0,1199.0,3.0,1019124000.0,1990.0
50%,325.0,2991.0,3.5,1186087000.0,1997.0
75%,477.0,8122.0,4.0,1435994000.0,2003.0
max,610.0,193609.0,5.0,1537799000.0,2018.0


In [11]:
def quality(rating):

    if rating <= 2:
        return "not good"
    elif rating > 2 and rating <= 3:
        return "not so bad"
    elif rating > 3 and rating <= 4:
        return "good"
    else:
        return "very good"
    
rating_agg_df = ratings_adj_df.groupby("title").agg({
                    "rating": ["mean", "max", "min", "median"]
                }).reset_index()
rating_agg_df.columns = ["title", "rating_mean", "rating_max", "rating_min", "rating_median"]
rating_agg_df = rating_agg_df.sort_values(by="rating_mean", ascending=False)
rating_agg_df["quality_mean"] = rating_agg_df.rating_mean.apply(quality)

display( rating_agg_df.head() )

# rating_agg_df.to_csv("./rating_agg_infos.csv", index=False, decimal=".")

Unnamed: 0,title,rating_mean,rating_max,rating_min,rating_median,quality_mean
4529,Karlson Returns,5.0,5.0,5.0,5.0,very good
225,Adventures Of Sherlock Holmes And Dr. Watson: ...,5.0,5.0,5.0,5.0,very good
4512,Justice League: Doom,5.0,5.0,5.0,5.0,very good
2659,English Vinglish,5.0,5.0,5.0,5.0,very good
4489,Junior and Karlson,5.0,5.0,5.0,5.0,very good


In [12]:
tags_adj_df = tags_df.merge(movies_df[["movieId", "title"]].drop_duplicates(), 
                                  on=["movieId"], how="inner")

display( tags_adj_df.shape, tags_df.shape )

display( tags_adj_df.head() )

# tags_adj_df.to_csv("./tags_adj.csv", index=False, decimal=".")

(3683, 5)

(3683, 4)

Unnamed: 0,userId,movieId,tag,timestamp,title
0,2,60756,funny,1445714994,Step Brothers
1,2,60756,Highly quotable,1445714996,Step Brothers
2,2,60756,will ferrell,1445714992,Step Brothers
3,62,60756,comedy,1528934384,Step Brothers
4,62,60756,funny,1528934381,Step Brothers


In [13]:
tags_adj_df.title.unique()

array(['Step Brothers', 'Warrior', 'Wolf of Wall Street, The', ...,
       'Night of the Shooting Stars', 'Shame', 'Hard-Boiled'],
      dtype=object)

In [14]:
tags_agg_df = tags_adj_df.groupby(["title", "tag"]).movieId.count().reset_index()

display( tags_agg_df.loc[tags_agg_df.title == "Step Brothers"] )

Unnamed: 0,title,tag,movieId
3014,Step Brothers,Highly quotable,1
3015,Step Brothers,comedy,1
3016,Step Brothers,funny,3
3017,Step Brothers,will ferrell,3


## Analysing the data

In [16]:
movies_df.head()

Unnamed: 0,movieId,title,general_genres,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,Adventure,1995.0
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,Animation,1995.0
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,Children,1995.0
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,Comedy,1995.0
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,Fantasy,1995.0


In [17]:
genres_df = movies_df.genres.value_counts().reset_index()

display( genres_df )

Unnamed: 0,genres,count
0,Drama,4361
1,Comedy,3756
2,Thriller,1894
3,Action,1828
4,Romance,1596
5,Adventure,1263
6,Crime,1199
7,Sci-Fi,980
8,Horror,978
9,Fantasy,779


In [18]:
top5genres_freq = genres_df.sort_values("count", ascending=False).head(5).genres.values
display( top5genres_freq )

array(['Drama', 'Comedy', 'Thriller', 'Action', 'Romance'], dtype=object)

In [65]:
# Qual a frequencia dos generos que mais aparece na base do IMDB?
fig = px.bar(genres_df, x="genres", y="count", 
             title="Frequência de gênero de filmes da base IMDB",
             text="count")

fig.update_traces(textposition='auto')

fig.update_layout(template='ggplot2')

fig.update_xaxes(title_text="Gênero")
fig.update_yaxes(title_text="Frequência")

fig.show()

In [20]:
top_movies_genrer_df = movies_df.loc[movies_df.genres.isin(top5genres_freq), :]
display( top_movies_genrer_df.shape ) 

top_movies_genrer_df = top_movies_genrer_df.merge(rating_agg_df, on="title", how="left")
top_movies_genrer_df.drop("movieId", axis=1, inplace=True)
display( top_movies_genrer_df.shape ) 

display( top_movies_genrer_df.head(10) )

(13435, 5)

(13435, 9)

Unnamed: 0,title,general_genres,genres,year,rating_mean,rating_max,rating_min,rating_median,quality_mean
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,Comedy,1995.0,3.92093,5.0,0.5,4.0,good
1,Grumpier Old Men,Comedy|Romance,Comedy,1995.0,3.259615,5.0,0.5,3.0,good
2,Grumpier Old Men,Comedy|Romance,Romance,1995.0,3.259615,5.0,0.5,3.0,good
3,Waiting to Exhale,Comedy|Drama|Romance,Comedy,1995.0,2.357143,3.0,1.0,3.0,not so bad
4,Waiting to Exhale,Comedy|Drama|Romance,Drama,1995.0,2.357143,3.0,1.0,3.0,not so bad
5,Waiting to Exhale,Comedy|Drama|Romance,Romance,1995.0,2.357143,3.0,1.0,3.0,not so bad
6,Father of the Bride Part II,Comedy,Comedy,1995.0,3.071429,5.0,0.5,3.0,good
7,Heat,Action|Crime|Thriller,Action,1995.0,3.946078,5.0,1.0,4.0,good
8,Heat,Action|Crime|Thriller,Thriller,1995.0,3.946078,5.0,1.0,4.0,good
9,Sabrina,Comedy|Romance,Comedy,1995.0,3.392857,5.0,1.0,3.5,good


In [24]:
dfs = []
for g in top5genres_freq:

    df = top_movies_genrer_df.loc[top_movies_genrer_df.genres == g, :].copy()
    df = df.sort_values("rating_median", ascending=False)[["title", "genres", "rating_median", "year"]].head(5)

    dfs.append(df)

movies_by_genres_df = pd.concat(dfs)
movies_by_genres_df.title = movies_by_genres_df.title.apply(lambda row: row[:-6])
movies_by_genres_df['title'] = movies_by_genres_df['title'].str.replace(r"\(.*\)", "", regex=True)

display( movies_by_genres_df.head(10) )

Unnamed: 0,title,genres,rating_median,year
12951,SORI: Voice from the,Drama,5.0,2016.0
7907,'Salem,Drama,5.0,2004.0
2162,Me,Drama,5.0,1981.0
3904,Duel in t,Drama,5.0,1946.0
3909,Five Sense,Drama,5.0,1999.0
12194,,Comedy,5.0,2014.0
12286,The Fox and the H,Comedy,5.0,2006.0
12401,Saving,Comedy,5.0,2013.0
12398,Love and P,Comedy,5.0,1985.0
12396,George Carlin: Life Is Worth,Comedy,5.0,2005.0


In [66]:
# Rating baseado na mediana
fig = px.treemap(movies_by_genres_df, path=['genres', 'title'], 
                 title='Distribuição dos melhores filmes por Top 5 gêneros mais frequentes')

fig.update_layout(
    height=700, 
    width=1100,
    annotations=[
        dict(
            x=0,
            y=-0.07,
            showarrow=False,
            text="Nota: As notas dos filmes são tomadas com base na mediana.",
            xref="paper",
            yref="paper"
        )
    ]
)

fig.show()

In [32]:
top_movies = rating_agg_df.sort_values('rating_median', ascending=False).head(20).title.values
top_movies_df = movies_df.loc[movies_df.title.isin(top_movies), :].copy()
top_movies_df.year = top_movies_df.year.astype("int")

display( top_movies_df )

Unnamed: 0,movieId,title,general_genres,genres,year
2749,3687,Light Years,Adventure|Animation|Fantasy|Sci-Fi,Adventure,1988
2749,3687,Light Years,Adventure|Animation|Fantasy|Sci-Fi,Animation,1988
2749,3687,Light Years,Adventure|Animation|Fantasy|Sci-Fi,Fantasy,1988
2749,3687,Light Years,Adventure|Animation|Fantasy|Sci-Fi,Sci-Fi,1988
2936,3939,Slumber Party Massacre II,Horror,Horror,1987
2937,3940,Slumber Party Massacre III,Horror,Horror,1990
3110,4180,Reform School Girls,Action|Drama,Action,1986
3110,4180,Reform School Girls,Action|Drama,Drama,1986
3974,5607,Son of the Bride,Comedy|Drama,Comedy,2001
3974,5607,Son of the Bride,Comedy|Drama,Drama,2001


In [67]:
# Rating baseado na mediana
fig = px.treemap(top_movies_df, path=["year", "title", "genres"], 
                 title="Distribuição de generos por filmes para os Top 20 filmes em cada ano")

fig.update_layout(
    height=700, 
    width=1100,
    annotations=[
        dict(
            x=0,
            y=-0.07,
            showarrow=False,
            text="Nota: As notas dos filmes são tomadas com base na mediana. Podemos ter filmes com a mesma nota não listados aqui.",
            xref="paper",
            yref="paper"
        )
    ]
)

fig.show()

In [68]:
# Quais os piores filmes?

worst_movies = rating_agg_df.sort_values('rating_median', ascending=True).head(50)

# Add a new column 'title_length' to the DataFrame
worst_movies['title_length'] = worst_movies['title'].apply(len)

# Sort the DataFrame by 'title_length'
worst_movies = worst_movies.sort_values('title_length', ascending=True)

# Create the bar chart
fig = px.bar(worst_movies, x='title', y='rating_median', title='50 piores filmes avaliados')

# Set the style to ggplot2
fig.update_layout(template='ggplot2')
fig.update_layout(
    # height=700, 
    # width=1100,
    annotations=[
        dict(
            x=-0.083,
            y=-4.45,
            showarrow=False,
            text="Nota: As notas dos filmes são tomadas com base na mediana.",
            xref="paper",
            yref="paper"
        )
    ]
)

# Update x and y labels
fig.update_xaxes(title_text='Título do filme', tickangle=-90)
fig.update_yaxes(title_text='Nota')

# Show the plot
fig.show()



In [29]:
# Qual a quantidade de filmes produzidos ao longo dos anos? 
# O numero de avaliacoes tambem cresceu na mesma proporcao?

In [61]:
movies_per_year_df = movies_df.drop_duplicates().year.value_counts().reset_index()
movies_per_year_df.year = movies_per_year_df.year.astype("int")

rating_per_year_df = ratings_adj_df.groupby(["year"]).rating.median().reset_index()
rating_per_year_df.year = rating_per_year_df.year.astype("int")

movies_per_year_df = movies_per_year_df.merge(rating_per_year_df, on="year", how="left")

In [62]:
movies_per_year_df.isna().sum()

year      0
count     0
rating    0
dtype: int64

In [69]:
# A quantidade de filmes aumentou, mas e a qualidade dos mesmos? 
# Cresceu proporcionalmente?
fig = px.scatter(movies_per_year_df, x='year', y='count',
                 title="Evolução da produção de filmes ao longo dos anos", 
                 color="rating",
                 color_continuous_scale=px.colors.sequential.Plasma,
                 labels={"rating":"Nota"})

fig.update_layout(template="ggplot2")

fig.update_xaxes(title_text="Ano", tickangle=-45, dtick=4)
fig.update_yaxes(title_text="Quantidade")

fig.show()