# Netflix EDA

Este proyecto intenta mostrar cómo realizar un análisis temporal de un dataset sobre series y películas de Netflix utilizando únicamente Python.

Para este proyecto se utilizan las bibliotecas:
- pandas
- matplotlib
- plotly

Y los insights producto del análisis son:
- 
- 
- 

In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

## Carga de datos

In [55]:
# Read the titles CSV from the data folder (path is relative to this notebook).
titles_path = "../data/titles.csv"
df = pd.read_csv(titles_path)

# Preview the first rows to confirm the file was parsed as expected.
df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.01,7.3
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,534486.0,15.461,7.811
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.6


In [56]:
# Size of the loaded table as (rows, columns).
df.shape

(5850, 15)

In [57]:
# Data type pandas inferred for each column.
df.dtypes

id                       object
title                    object
type                     object
description              object
release_year              int64
age_certification        object
runtime                   int64
genres                   object
production_countries     object
seasons                 float64
imdb_id                  object
imdb_score              float64
imdb_votes              float64
tmdb_popularity         float64
tmdb_score              float64
dtype: object

In [58]:
# Number of missing values per column before any cleaning.
df.isna().sum()

id                         0
title                      1
type                       0
description               18
release_year               0
age_certification       2619
runtime                    0
genres                     0
production_countries       0
seasons                 3744
imdb_id                  403
imdb_score               482
imdb_votes               498
tmdb_popularity           91
tmdb_score               311
dtype: int64

## Limpieza de datos

In [60]:
# Movies carry NaN in `seasons`; zero is a more meaningful value for them.
is_movie = df['type'] == 'MOVIE'
df.loc[is_movie, 'seasons'] = 0

# Season counts are stored as floats. Any value that is still missing is
# set to 1, and the column is then cast to integers.
seasons_filled = df['seasons'].fillna(1)
df['seasons'] = seasons_filled.astype(int)

In [61]:
# The dataset also has 2619 missing age certifications; label them with an
# explicit "Desconocida" (unknown) placeholder instead of leaving NaN.
age_certification = df['age_certification']
df['age_certification'] = age_certification.fillna(value='Desconocida')

In [62]:
# imdb_score, imdb_votes, tmdb_popularity and tmdb_score also contain a fair
# number of nulls. To keep every record, fill the gaps with each column's
# mean: these rows amount to about 10% of the dataset and are worth keeping
# for the later analysis.
rating_columns = ['imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score']

# `mean()` returns one value per column; `fillna` matches them by column label.
column_means = df[rating_columns].mean()
df[rating_columns] = df[rating_columns].fillna(column_means)

In [63]:
# After the cleanup, check how many records are usable for the analysis by
# counting the missing values left in each column.
df.isna().sum()

id                        0
title                     1
type                      0
description              18
release_year              0
age_certification         0
runtime                   0
genres                    0
production_countries      0
seasons                   0
imdb_id                 403
imdb_score                0
imdb_votes                0
tmdb_popularity           0
tmdb_score                0
dtype: int64

In [65]:
# The dataset is in good shape: all that is left is to drop the single record
# without a title and give a placeholder description to rows that lack one.
#
# `subset` is passed as a list because a bare string is only accepted by
# pandas >= 1.5. The explicit `.copy()` makes the filtered frame independent,
# so the column assignment below does not raise SettingWithCopyWarning on
# pandas versions without copy-on-write.
df = df.dropna(subset=['title']).copy()
df['description'] = df['description'].fillna("Sin descripcion")

In [66]:
# Final check of missing values per column after all cleaning steps.
df.isna().sum()

id                        0
title                     0
type                      0
description               0
release_year              0
age_certification         0
runtime                   0
genres                    0
production_countries      0
seasons                   0
imdb_id                 403
imdb_score                0
imdb_votes                0
tmdb_popularity           0
tmdb_score                0
dtype: int64

In [67]:
# Preview the first rows of the cleaned DataFrame.
df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],1,,6.510861,23439.382474,0.6,6.829175
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],0,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],0,tt0068473,7.7,107673.0,10.01,7.3
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],0,tt0071853,8.2,534486.0,15.461,7.811
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,Desconocida,150,"['war', 'action']","['GB', 'US']",0,tt0061578,7.7,72662.0,20.398,7.6
