# Analisis exploratorio de las variables en relación a los Spoilers en comentarios de IMDB

## 1. Procesamiento de datos

In [1]:
# Importamos las librerias generales para el proyecto
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Lectura de los datos iniciales
movies_details = pd.read_json("./data/movies/IMDB_movie_details.json", lines=True)
movies_reviews = pd.read_json("./data/movies/IMDB_reviews.json", lines=True)
movies_info = pd.read_csv('./data/movies_info/IMDb movies.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Eliminamos columnas inncesarias o repetidas para hacer 
movies_details_pre = movies_details[['movie_id', 'rating','release_date']]
movies_info_pre = movies_info[['imdb_title_id', 'original_title',
       'genre', 'duration', 'country', 'language', 'director', 'writer',
       'production_company', 'actors', 'votes',
       'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics']]

In [4]:
# Combinamos los datasets para tener los datos completos
movies_data = movies_details_pre.merge(movies_info_pre, right_on="imdb_title_id", left_on="movie_id")

# Corregimos errores de repetición y de formato de los valores
movies_data = movies_data.drop(columns=["imdb_title_id"])
movies_data["budget"] = movies_data["budget"].str.split(" ", n = 1, expand = True)[1]
movies_data["budget"] = movies_data["budget"].astype('float64')
movies_data["usa_gross_income"] = movies_data["usa_gross_income"].str.split(" ", n = 1, expand = True)[1]
movies_data["usa_gross_income"] = movies_data["usa_gross_income"].astype('float64')
movies_data["worlwide_gross_income"] = movies_data["worlwide_gross_income"].str.split(" ", n = 1, expand = True)[1]
movies_data["worlwide_gross_income"] = movies_data["worlwide_gross_income"].astype('float64')

In [5]:
# Revisamos la existencia de valores NaN en las distintas columnas
movies_data.isnull().any()

movie_id                 False
rating                   False
release_date             False
original_title           False
genre                    False
duration                 False
country                  False
language                 False
director                 False
writer                   False
production_company       False
actors                   False
votes                    False
budget                    True
usa_gross_income          True
worlwide_gross_income     True
metascore                 True
reviews_from_users        True
reviews_from_critics     False
dtype: bool

In [6]:
# Comprobamos la forma de los datos para rellenar los valores NaN
movies_data.describe()

Unnamed: 0,rating,duration,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
count,1534.0,1534.0,1534.0,1436.0,1476.0,1517.0,1431.0,1533.0,1534.0
mean,7.049022,116.722947,264761.1,64972360.0,90176590.0,201845800.0,62.636618,658.522505,224.512386
std,0.956515,22.759115,268268.1,276373900.0,97066650.0,256166900.0,17.657524,676.416985,162.416418
min,2.4,67.0,1413.0,6.0,509.0,576.0,9.0,5.0,1.0
25%,6.4,101.0,90804.25,15750000.0,25562400.0,35281790.0,50.0,256.0,106.25
50%,7.1,113.0,186388.5,35000000.0,58403910.0,115900900.0,63.0,468.0,178.0
75%,7.8,128.0,345549.8,79250000.0,124269000.0,264771000.0,75.0,828.0,311.75
max,9.5,321.0,2278845.0,10000000000.0,936662200.0,2790439000.0,100.0,8232.0,999.0


In [7]:
# Rellenamos los valores con la mediana para que no se vean afectados por los "outliers" y comprobamos nuevamente
movies_data = movies_data.fillna(movies_data.median())
movies_data.isnull().any()

  movies_data = movies_data.fillna(movies_data.median())


movie_id                 False
rating                   False
release_date             False
original_title           False
genre                    False
duration                 False
country                  False
language                 False
director                 False
writer                   False
production_company       False
actors                   False
votes                    False
budget                   False
usa_gross_income         False
worlwide_gross_income    False
metascore                False
reviews_from_users       False
reviews_from_critics     False
dtype: bool