In [1]:
import os

import pandas as pd
import numpy as np

In [2]:
# Directory holding the four CSV files read below.
files_path = "./ml-latest-small"

# Read the files in a fixed order and unpack them into one DataFrame each.
links_df, movies_df, ratings_df, tags_df = [
    pd.read_csv(os.path.join(files_path, csv_name))
    for csv_name in ("links.csv", "movies.csv", "ratings.csv", "tags.csv")
]

In [3]:
# Show the dimensions and the first rows of every raw table, one after another.
for name, df in {
    'links': links_df,
    'movies': movies_df,
    'ratings': ratings_df,
    'tags': tags_df,
}.items():
    print(f'{name} shape: {df.shape}')
    display(df.head())
    print()


links shape: (9742, 3)


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0



movies shape: (9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy



ratings shape: (100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931



tags shape: (3683, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200





In [4]:
# How many ratings each user gave to movieId 1.
ratings_df.loc[ratings_df["movieId"].eq(1), "userId"].value_counts()

userId
1      1
380    1
382    1
385    1
389    1
      ..
214    1
216    1
217    1
219    1
610    1
Name: count, Length: 215, dtype: int64

In [5]:
# Keep the raw pipe-separated genre string as `general_genres` and expand the
# frame to one row per (movie, genre) in a fresh `genres` column.
# The guard makes the cell safe to re-run: renaming a second time would create
# two `general_genres` columns and break the `.str.split` call.
if "general_genres" not in movies_df.columns:
    movies_df = (
        movies_df
        .rename(columns={"genres": "general_genres"})
        .assign(genres=lambda frame: frame["general_genres"].str.split("|"))
        .explode("genres")
    )

def cat_year(row):
    """Extract the release year from a title that ends in ``(YYYY)``.

    Parameters
    ----------
    row : str
        Movie title, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    int or float
        The four-digit year as ``int``, or ``np.nan`` when ``row`` is not a
        string or does not end in a parenthesised four-digit year.
    """
    import re  # local import so the function does not depend on another cell

    if not isinstance(row, str):
        return np.nan

    # Anchor on the trailing "(YYYY)" (optional whitespace after it) instead of
    # blindly slicing the last six characters: an all-digit title such as
    # "2046" must not be mistaken for a year.
    match = re.search(r"\((\d{4})\)\s*$", row)
    return int(match.group(1)) if match else np.nan
    
# Derive the release year of every row from its title.
movies_df["year"] = movies_df["title"].map(cat_year)

In [8]:
# Preview movies_df after the genre explode and the year extraction.
movies_df.head()

Unnamed: 0,movieId,title,general_genres,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure,1995.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Animation,1995.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Children,1995.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Comedy,1995.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Fantasy,1995.0


In [9]:
# Rows whose genre is the "(no genres listed)" placeholder.
movies_df[movies_df["genres"] == "(no genres listed)"]

Unnamed: 0,movieId,title,general_genres,genres,year
8517,114335,La cravate (1957),(no genres listed),(no genres listed),1957.0
8684,122888,Ben-hur (2016),(no genres listed),(no genres listed),2016.0
8687,122896,Pirates of the Caribbean: Dead Men Tell No Tal...,(no genres listed),(no genres listed),2017.0
8782,129250,Superfast! (2015),(no genres listed),(no genres listed),2015.0
8836,132084,Let It Be Me (1995),(no genres listed),(no genres listed),1995.0
8902,134861,Trevor Noah: African American (2013),(no genres listed),(no genres listed),2013.0
9033,141131,Guardians (2016),(no genres listed),(no genres listed),2016.0
9053,141866,Green Room (2015),(no genres listed),(no genres listed),2015.0
9070,142456,The Brand New Testament (2015),(no genres listed),(no genres listed),2015.0
9091,143410,Hyena Road,(no genres listed),(no genres listed),


In [8]:
# movies_df.to_csv("./movies_adj.csv", index=False, decimal=".")

In [10]:
# movies_df holds one row per (movie, genre); drop the repeats so the lookup
# has distinct (movieId, title, year) rows before attaching it to the ratings.
movie_lookup = movies_df[["movieId", "title", "year"]].drop_duplicates()
ratings_adj_df = pd.merge(ratings_df, movie_lookup, how="inner", on="movieId")

In [11]:
# Sanity check: compare the shape after the merge with the shape before it.
ratings_adj_df.shape, ratings_df.shape

((100836, 6), (100836, 4))

In [12]:
# Preview the ratings enriched with title and year.
ratings_adj_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,year
0,1,1,4.0,964982703,Toy Story (1995),1995.0
1,5,1,4.0,847434962,Toy Story (1995),1995.0
2,7,1,4.5,1106635946,Toy Story (1995),1995.0
3,15,1,2.5,1510577970,Toy Story (1995),1995.0
4,17,1,4.5,1305696483,Toy Story (1995),1995.0


In [13]:
# Summary statistics of the numeric columns of the enriched ratings.
ratings_adj_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp,year
count,100836.0,100836.0,100836.0,100836.0,100818.0
mean,326.127564,19435.295718,3.501557,1205946000.0,1994.44499
std,182.618491,35530.987199,1.042529,216261000.0,14.361424
min,1.0,1.0,0.5,828124600.0,1902.0
25%,177.0,1199.0,3.0,1019124000.0,1990.0
50%,325.0,2991.0,3.5,1186087000.0,1997.0
75%,477.0,8122.0,4.0,1435994000.0,2003.0
max,610.0,193609.0,5.0,1537799000.0,2018.0


In [14]:
# Per-title rating statistics, best average first.
# NOTE(review): the grouping key is the title, so distinct movieIds that share
# a title are pooled into one row - confirm this is intended.
rating_agg_df = (
    ratings_adj_df
    .groupby("title", as_index=False)
    .agg(
        rating_mean=("rating", "mean"),
        rating_max=("rating", "max"),
        rating_min=("rating", "min"),
        rating_median=("rating", "median"),
    )
    .sort_values(by="rating_mean", ascending=False)
)

In [15]:
def quality(rating):
    """Map a numeric rating onto a coarse quality label.

    The thresholds are inclusive upper bounds, checked in ascending order:
    ``<= 2`` -> "not good", ``<= 3`` -> "not so bad", ``<= 4`` -> "good".
    Any value that passes none of them gets "very good".
    """
    for upper_bound, label in ((2, "not good"), (3, "not so bad"), (4, "good")):
        if rating <= upper_bound:
            return label
    return "very good"

# Label every title according to its average rating.
rating_agg_df["quality_mean"] = rating_agg_df["rating_mean"].map(quality)

In [16]:
# Titles with the highest average rating (the frame is sorted by rating_mean, descending).
rating_agg_df.head()

Unnamed: 0,title,rating_mean,rating_max,rating_min,rating_median,quality_mean
3354,Gena the Crocodile (1969),5.0,5.0,5.0,5.0,very good
8951,True Stories (1986),5.0,5.0,5.0,5.0,very good
1986,Cosmic Scrat-tastrophe (2015),5.0,5.0,5.0,5.0,very good
5269,Love and Pigeons (1985),5.0,5.0,5.0,5.0,very good
7033,Red Sorghum (Hong gao liang) (1987),5.0,5.0,5.0,5.0,very good


In [16]:
# rating_agg_df.to_csv("./rating_agg_infos.csv", index=False, decimal=".")

In [17]:
# Attach the movie title to every tag; movies_df is de-duplicated first because
# it holds one row per (movie, genre).
title_lookup = movies_df[["movieId", "title"]].drop_duplicates()
tags_adj_df = pd.merge(tags_df, title_lookup, how="inner", on=["movieId"])

# Shapes after and before the merge, for a quick comparison.
print(tags_adj_df.shape, tags_df.shape)

(3683, 5) (3683, 4)


In [18]:
# Preview the tags enriched with the movie title.
tags_adj_df.head()

Unnamed: 0,userId,movieId,tag,timestamp,title
0,2,60756,funny,1445714994,Step Brothers (2008)
1,2,60756,Highly quotable,1445714996,Step Brothers (2008)
2,2,60756,will ferrell,1445714992,Step Brothers (2008)
3,62,60756,comedy,1528934384,Step Brothers (2008)
4,62,60756,funny,1528934381,Step Brothers (2008)


In [20]:
# tags_adj_df.to_csv("./tags_adj.csv", index=False, decimal=".")

In [19]:
# Count how often each tag was applied to each title.
# The resulting `movieId` column holds that count, not an identifier.
tags_agg_df = tags_adj_df.groupby(["title", "tag"], as_index=False)["movieId"].count()

In [20]:
# Tag counts for a single title, as a spot check.
tags_agg_df[tags_agg_df["title"] == "Step Brothers (2008)"]

Unnamed: 0,title,tag,movieId
3018,Step Brothers (2008),Highly quotable,1
3019,Step Brothers (2008),comedy,1
3020,Step Brothers (2008),funny,3
3021,Step Brothers (2008),will ferrell,3


In [21]:
# Analysing the data

import plotly.express as px

In [22]:
# Reminder of the movies_df layout before the genre analysis.
movies_df.head()

Unnamed: 0,movieId,title,general_genres,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure,1995.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Animation,1995.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Children,1995.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Comedy,1995.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Fantasy,1995.0


In [23]:
# Number of movie rows per genre, most frequent first.
# The column names are set explicitly so that the later `genres_df.genres` and
# "count" accesses do not depend on how the installed pandas version names the
# output of value_counts().
genres_df = (
    movies_df["genres"]
    .value_counts()
    .rename_axis("genres")
    .reset_index(name="count")
)

genres_df

Unnamed: 0,genres,count
0,Drama,4361
1,Comedy,3756
2,Thriller,1894
3,Action,1828
4,Romance,1596
5,Adventure,1263
6,Crime,1199
7,Sci-Fi,980
8,Horror,978
9,Fantasy,779


In [64]:
# Names of the five most frequent genres, as a NumPy array.
top5genres_freq = (
    genres_df.sort_values("count", ascending=False)["genres"].head(5).values
)
top5genres_freq

array(['Drama', 'Comedy', 'Thriller', 'Action', 'Romance'], dtype=object)

In [24]:
# How frequent are the genres that appear most often in the IMDB dataset?
fig = (
    px.bar(
        genres_df,
        x="genres",
        y="count",
        title="Frequencia de generos de filmes da base IMDB",
    )
    .update_layout(template='ggplot2')
    .update_xaxes(title_text="Genero")
    .update_yaxes(title_text="Frequencia")
)

fig.show()

In [69]:
# Keep only the rows that belong to one of the five most frequent genres.
top_movies_genrer_df = movies_df[movies_df["genres"].isin(top5genres_freq)]
print(top_movies_genrer_df.shape)

# Bring in the per-title rating statistics and drop the id column.
top_movies_genrer_df = (
    top_movies_genrer_df
    .merge(rating_agg_df, how="left", on="title")
    .drop(columns="movieId")
)
print(top_movies_genrer_df.shape)

(13435, 5)
(13435, 9)


In [75]:
# Show the five most frequent genres again before iterating over them.
top5genres_freq

array(['Drama', 'Comedy', 'Thriller', 'Action', 'Romance'], dtype=object)

In [88]:
# For each of the five most frequent genres, keep the five rows with the
# highest median rating.
dfs = [
    top_movies_genrer_df
    .loc[top_movies_genrer_df.genres == g, :]
    .sort_values("rating_median", ascending=False)
    [["title", "genres", "rating_median", "year"]]
    .head(5)
    for g in top5genres_freq
]

movies_by_genres_df = pd.concat(dfs)

# Strip every parenthesised chunk (release year, alternative titles) and the
# surrounding whitespace. The previous version first chopped the last six
# characters unconditionally, which truncates titles that carry no "(YYYY)"
# suffix, and then used a greedy pattern that also deleted any text sitting
# between two separate parentheticals.
movies_by_genres_df["title"] = (
    movies_by_genres_df["title"]
    .str.replace(r"\s*\([^)]*\)", "", regex=True)
    .str.strip()
)

movies_by_genres_df.head(10)

Unnamed: 0,title,genres,rating_median,year
5547,"Four Seasons, The",Drama,5.0,1981.0
10419,Enter the Void,Drama,5.0,2009.0
10606,Faster,Drama,5.0,2010.0
8036,"Marriage of Maria Braun, The",Drama,5.0,1979.0
2626,"Ballad of Narayama, The",Drama,5.0,1983.0
3375,Man Bites Dog,Comedy,5.0,1992.0
12784,Dragons: Gift of the Night Fury,Comedy,5.0,2011.0
11077,Miss Nobody,Comedy,5.0,2010.0
12775,Formula of Love,Comedy,5.0,1984.0
11089,Trailer Park Boys,Comedy,5.0,1999.0


In [94]:
fig = px.treemap(movies_by_genres_df, path=['genres', 'title'], 
                 title='Distribuicao dos melhores filmes por Top5 generos mais frequentes')

fig.update_layout(height=700, width=1100)
fig.show()

In [95]:
# Quais os piores filmes?

# The 50 titles with the lowest average rating.
# `.assign` builds a new frame instead of writing into the result of `.head()`,
# which can trigger a SettingWithCopyWarning.
worst_movies = (
    rating_agg_df
    .sort_values('rating_mean', ascending=True)
    .head(50)
    .assign(
        # Remove parenthesised chunks (year, alternative titles) and stray
        # whitespace so the tick labels stay short.
        title=lambda frame: frame['title']
        .str.replace(r"\s*\([^)]*\)", "", regex=True)
        .str.strip()
    )
)

# Plot the same statistic that selected the rows and that the y-axis label
# announces ("Nota media" = average rating); the median was plotted before.
fig = px.bar(worst_movies, x='title', y='rating_mean', title='50 piores filmes avaliados')

# Set the style to ggplot2
fig.update_layout(template='ggplot2')

# Update x and y labels
fig.update_xaxes(title_text='Titulo do filme', tickangle=-90)
fig.update_yaxes(title_text='Nota media')

# Show the plot
fig.show()


In [29]:
# How many movies were produced over the years?
# Did the number of ratings grow in the same proportion?

In [28]:
# Spot check: titles containing "nlight", including one with no year suffix.
movies_df[movies_df["title"].str.contains("nlight")]

Unnamed: 0,movieId,title,general_genres,genres,year
3988,5625,Moonlight Mile (2002),Drama|Romance,Drama,2002.0
3988,5625,Moonlight Mile (2002),Drama|Romance,Romance,2002.0
8483,113225,Magic in the Moonlight (2014),Comedy|Drama|Romance,Comedy,2014.0
8483,113225,Magic in the Moonlight (2014),Comedy|Drama|Romance,Drama,2014.0
8483,113225,Magic in the Moonlight (2014),Comedy|Drama|Romance,Romance,2014.0
9367,162414,Moonlight,Drama,Drama,


In [46]:
# Number of distinct movies per release year.
# movies_df has one row per (movie, genre), so a plain drop_duplicates() removes
# nothing and every movie would be counted once per genre. De-duplicate on
# movieId so each movie contributes exactly one row.
movies_per_year_df = (
    movies_df.drop_duplicates(subset="movieId")["year"]
    .value_counts()  # rows with a missing year are left out by value_counts
    .rename_axis("year")
    .reset_index(name="count")
)
movies_per_year_df["year"] = movies_per_year_df["year"].astype("int")

In [47]:
# Median rating of all ratings given to movies of each release year.
rating_per_year_df = (
    ratings_adj_df
    .groupby("year", as_index=False)["rating"]
    .median()
    .astype({"year": "int"})
)

In [48]:
# Inspect the median rating per release year.
rating_per_year_df

Unnamed: 0,year,rating
0,1902,3.5
1,1903,2.5
2,1908,4.0
3,1915,2.0
4,1916,3.5
...,...,...
101,2014,3.5
102,2015,3.5
103,2016,3.5
104,2017,4.0


In [49]:
# Add the median rating of each release year to the per-year movie counts.
# Dropping a `rating` column left over from an earlier run keeps the cell
# re-runnable; otherwise a second merge would produce `rating_x` / `rating_y`.
# The former `rename({"rating"})` call was removed: it passed a set rather than
# a mapping, its result was discarded, and the plot below needs the column to
# stay named "rating".
movies_per_year_df = (
    movies_per_year_df
    .drop(columns="rating", errors="ignore")
    .merge(rating_per_year_df, on="year", how="left")
)

In [50]:
# Count missing values per column after the left merge.
movies_per_year_df.isna().sum()

year      0
count     0
rating    0
dtype: int64

In [58]:
# A quantidade de filmes aumentou, mas e a qualidade dos mesmos? 
# Cresceu proporcionalmente?
fig = px.scatter(movies_per_year_df, x='year', y='count',
                 title="Evolucao da producao de filmes ao longo dos anos", 
                 color="rating",
                 color_continuous_scale=px.colors.sequential.Plasma,
                 labels={"rating":"Nota"})

fig.update_layout(template="ggplot2")

fig.update_xaxes(title_text="Ano", tickangle=-45, dtick=4)
fig.update_yaxes(title_text="Quantidade")

fig.show()