# 1. Cargando Datos

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Definir un dataframe para llamar el dataset
df = pd.read_csv("./data/disney_movies.csv")

In [3]:
#Ejecutar
df.head()

Unnamed: 0,movie_title,release_date,genre,mpaa_rating,total_gross,inflation_adjusted_gross
0,Snow White and the Seven Dwarfs,1937-12-21,Musical,G,184925485,5228953251
1,Pinocchio,1940-02-09,Adventure,G,84300000,2188229052
2,Fantasia,1940-11-13,Musical,G,83320000,2187090808
3,Song of the South,1946-11-12,Adventure,G,65000000,1078510579
4,Cinderella,1950-02-15,Drama,G,85000000,920608730


# 2. Explorando Datos

In [4]:
# ¿Qué columnas tienen los datos?
df.columns

Index(['movie_title', 'release_date', 'genre', 'mpaa_rating', 'total_gross',
       'inflation_adjusted_gross'],
      dtype='object')

In [5]:
# ¿Qué tamaño tienen los datos?
df.shape

(579, 6)

In [6]:
# ¿Existen valores nulos en los datos?
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 579 entries, 0 to 578
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   movie_title               579 non-null    object
 1   release_date              579 non-null    object
 2   genre                     562 non-null    object
 3   mpaa_rating               523 non-null    object
 4   total_gross               579 non-null    int64 
 5   inflation_adjusted_gross  579 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 27.3+ KB


In [7]:
# ¿Cómo se distribuyen las variables numéricas?
df.describe()

Unnamed: 0,total_gross,inflation_adjusted_gross
count,579.0,579.0
mean,64701790.0,118762500.0
std,93013010.0,286085300.0
min,0.0,0.0
25%,12788860.0,22741230.0
50%,30702450.0,55159780.0
75%,75709030.0,119202000.0
max,936662200.0,5228953000.0


In [8]:
# ¿Cómo se comportan las variables categóricas?
df.describe(include=['O'])

Unnamed: 0,movie_title,release_date,genre,mpaa_rating
count,579,579,562,523
unique,573,553,12,5
top,The Jungle Book,1997-12-25,Comedy,PG
freq,3,3,182,187


# 3. Limpiando los Datos

In [9]:
# Eliminar filas duplicadas
df.drop_duplicates()

Unnamed: 0,movie_title,release_date,genre,mpaa_rating,total_gross,inflation_adjusted_gross
0,Snow White and the Seven Dwarfs,1937-12-21,Musical,G,184925485,5228953251
1,Pinocchio,1940-02-09,Adventure,G,84300000,2188229052
2,Fantasia,1940-11-13,Musical,G,83320000,2187090808
3,Song of the South,1946-11-12,Adventure,G,65000000,1078510579
4,Cinderella,1950-02-15,Drama,G,85000000,920608730
...,...,...,...,...,...,...
574,The Light Between Oceans,2016-09-02,Drama,PG-13,12545979,12545979
575,Queen of Katwe,2016-09-23,Drama,PG,8874389,8874389
576,Doctor Strange,2016-11-04,Adventure,PG-13,232532923,232532923
577,Moana,2016-11-23,Adventure,PG,246082029,246082029


In [10]:
# Comprobando la película repetida
title = df[ df["movie_title"] == "The Jungle Book"]
title

Unnamed: 0,movie_title,release_date,genre,mpaa_rating,total_gross,inflation_adjusted_gross
13,The Jungle Book,1967-10-18,Musical,Not Rated,141843000,789612346
194,The Jungle Book,1994-12-25,Adventure,PG,44342956,88930321
567,The Jungle Book,2016-04-15,Adventure,PG,364001123,364001123


In [11]:
# Visualizando las 10 primeras filas de datos
df.head(10)

Unnamed: 0,movie_title,release_date,genre,mpaa_rating,total_gross,inflation_adjusted_gross
0,Snow White and the Seven Dwarfs,1937-12-21,Musical,G,184925485,5228953251
1,Pinocchio,1940-02-09,Adventure,G,84300000,2188229052
2,Fantasia,1940-11-13,Musical,G,83320000,2187090808
3,Song of the South,1946-11-12,Adventure,G,65000000,1078510579
4,Cinderella,1950-02-15,Drama,G,85000000,920608730
5,"20,000 Leagues Under the Sea",1954-12-23,Adventure,,28200000,528279994
6,Lady and the Tramp,1955-06-22,Drama,G,93600000,1236035515
7,Sleeping Beauty,1959-01-29,Drama,,9464608,21505832
8,101 Dalmatians,1961-01-25,Comedy,G,153000000,1362870985
9,The Absent Minded Professor,1961-03-16,Comedy,,25381407,310094574


In [12]:
# Aplicando el método de copiar el dato de la fila siguiente para rellenar nulos
df.mpaa_rating = df.mpaa_rating.fillna(method='bfill')
df.head(10)

Unnamed: 0,movie_title,release_date,genre,mpaa_rating,total_gross,inflation_adjusted_gross
0,Snow White and the Seven Dwarfs,1937-12-21,Musical,G,184925485,5228953251
1,Pinocchio,1940-02-09,Adventure,G,84300000,2188229052
2,Fantasia,1940-11-13,Musical,G,83320000,2187090808
3,Song of the South,1946-11-12,Adventure,G,65000000,1078510579
4,Cinderella,1950-02-15,Drama,G,85000000,920608730
5,"20,000 Leagues Under the Sea",1954-12-23,Adventure,G,28200000,528279994
6,Lady and the Tramp,1955-06-22,Drama,G,93600000,1236035515
7,Sleeping Beauty,1959-01-29,Drama,G,9464608,21505832
8,101 Dalmatians,1961-01-25,Comedy,G,153000000,1362870985
9,The Absent Minded Professor,1961-03-16,Comedy,G,25381407,310094574


In [13]:
df.genre.describe()

count        562
unique        12
top       Comedy
freq         182
Name: genre, dtype: object

In [17]:
# Eliminando filas con datos nulos para género de película
df = df.dropna()

In [20]:
# Visualizando datos con ceros en la taquilla
df[ df["total_gross"] == 0]

Unnamed: 0,movie_title,release_date,genre,mpaa_rating,total_gross,inflation_adjusted_gross
27,Amy,1981-03-20,Drama,PG,0,0
29,Condorman,1981-08-07,Action,PG,0,0


In [21]:
# Visualizando datos con ceros en la inflación ajustada
df[ df["inflation_adjusted_gross"] == 0]

Unnamed: 0,movie_title,release_date,genre,mpaa_rating,total_gross,inflation_adjusted_gross
27,Amy,1981-03-20,Drama,PG,0,0
29,Condorman,1981-08-07,Action,PG,0,0


In [28]:
# INTENTANDO ELIMINAR O REEMPLAZAR CEROS 
df.total_gross = df.total_gross.fillna(df.total_gross.mean())

df[20:25]

Unnamed: 0,movie_title,release_date,genre,mpaa_rating,total_gross,inflation_adjusted_gross
21,The Rescuers,1977-06-22,Adventure,PG,48775599,159743914
27,Amy,1981-03-20,Drama,PG,0,0
28,The Fox and the Hound,1981-07-10,Comedy,PG,43899231,133118889
29,Condorman,1981-08-07,Action,PG,0,0
30,Night Crossing,1982-02-05,Drama,PG,4500000,12903059


In [None]:
df.inflation_adjusted_gross = df.inflation_adjusted_gross.fillna(df.inflation_adjusted_gross.mean())


## 2. Aplicar un método para limpiar nulos

In [None]:
df.mpaa_rating = df.mpaa_rating.fillna(method='bfill')
df.genre = df.genre.fillna(method='bfill')
df.head(10)

# Organizar Datos

In [None]:
# Colocar la columna de fecha como índice
df.set_index("release_date").sort_values(by="release_date")


# Filtrar Datos

In [None]:
# Filtrar datos por género 
musical = df[ df["genre"] == "Musical"]
musical = musical.set_index("release_date")
musical = musical.sort_values(by="release_date")
musical

In [None]:
plt.hist(df['total_gross'], 15, color="yellow", ec="black")