# Librerias

In [99]:
# Comencemos importando nuestras librerias
import pandas as pd
import numpy as np
import os
import ast
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

pd.options.display.float_format = '{:.2f}'.format

# Presentación DataSet

In [100]:
nueva_ruta = 'C:\\Users\\ReyesLuis\\Downloads\\Curso Data Science\\Movie Recommendation System'
os.chdir(nueva_ruta)

# Levanto el archivo 
df_movies = pd.read_csv("tmdb_5000_movies.csv",sep = ',')
# Reordenemos el dataframe
df_movies = df_movies[['id', 'budget', 'genres', 'homepage', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries',
                'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count']]
# Importamos el segundo archivo
df_credits = pd.read_csv("tmdb_5000_credits.csv",sep = ',')

#Unamos los dos dataframes
df_movies = pd.merge(df_movies, df_credits, on="id", how="outer")

#Cambiemos el indice
df_movies.index = df_movies['id']
df_movies = df_movies.drop('id', axis='columns')

# Vamos algunos de sus datos
print(df_movies.head())


           budget                                             genres  \
id                                                                     
19995   237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
285     300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
206647  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
49026   250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
49529   260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                            homepage  \
id                                                     
19995                    http://www.avatarmovie.com/   
285     http://disney.go.com/disneypictures/pirates/   
206647   http://www.sonypictures.com/movies/spectre/   
49026             http://www.thedarkknightrises.com/   
49529           http://movies.disney.com/john-carter   

                                                 keywords original_language  \
id                     

In [101]:
#Descripción de las columnas
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4803 entries, 19995 to 25975
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   keywords              4803 non-null   object 
 4   original_language     4803 non-null   object 
 5   original_title        4803 non-null   object 
 6   overview              4800 non-null   object 
 7   popularity            4803 non-null   float64
 8   production_companies  4803 non-null   object 
 9   production_countries  4803 non-null   object 
 10  release_date          4802 non-null   object 
 11  revenue               4803 non-null   int64  
 12  runtime               4801 non-null   float64
 13  spoken_languages      4803 non-null   object 
 14  status                4803 non-null   object 
 15  tagline              

## Columnas 

* budget - El presupuesto con el que se hizo la película.
* genres - Los géneros de la película, acción, comedia, suspense, etc.
* homepage - Un enlace a la página de inicio de la película.
* keywords - Las palabras clave o etiquetas relacionadas con la película.
* original_language - El idioma en el que se hizo la película.
* original_title - El título de la película antes de la traducción o adaptación.
* overview - Una breve descripción de la película.
* popularity - Una cantidad numérica que especifica la popularidad de la película.
* production_companies - La casa productora de la película.
* production_countries - El país en el que se produjo.
* release_date - La fecha en que fue lanzada.
* revenue - Los ingresos mundiales generados por la película..
* runtime - El tiempo de ejecución de la película en minutos.
* status - "Released" or "Rumored".
* tagline - Eslogan de la película.
* title - Título de la película.
* vote_average - Calificaciones promedio que recibió la película.
* vote_count - el recuento de votos recibidos.
* cast - El nombre de los actores principales y secundarios.
* crew - El nombre del Director, Editor, Compositor, Escritor, etc.

In [102]:
#Como podemos apreciar tenemos dos columnas con los mismos datos luego de la unión. Eliminemos una de ellas.
print(df_movies[['title_x','title_y']].head())

#Eliminemos la columna repetida
df_movies = df_movies.drop(['title_y'], axis=1)

#Cambiamos el nombre
df_movies.rename(columns={'title_x': 'title'}, inplace=True)

                                         title_x  \
id                                                 
19995                                     Avatar   
285     Pirates of the Caribbean: At World's End   
206647                                   Spectre   
49026                      The Dark Knight Rises   
49529                                John Carter   

                                         title_y  
id                                                
19995                                     Avatar  
285     Pirates of the Caribbean: At World's End  
206647                                   Spectre  
49026                      The Dark Knight Rises  
49529                                John Carter  


In [103]:
#Veamos ahora que campos nos quedan en null
print(df_movies.isnull().sum())

budget                     0
genres                     0
homepage                3091
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                       0
crew                       0
dtype: int64


In [104]:
# Eliminamos los registros con nulos de release_date y overview ya que son pocos
df_movies = df_movies.dropna(subset=['release_date', 'overview'])

# Tambien nos aseguramos que el campo release_date sea considerado como una fecha como tal
df_movies['release_date'] = pd.to_datetime(df_movies['release_date'])

# Creamos una nueva columna con true o false, si la película posee o no página web
df_movies['website'] = df_movies['homepage'].notna()

# Eliminamos la columna homepage
del df_movies['homepage']

# Mantenemos la columna tagline de momento

print(df_movies.isnull().sum())

budget                    0
genres                    0
keywords                  0
original_language         0
original_title            0
overview                  0
popularity                0
production_companies      0
production_countries      0
release_date              0
revenue                   0
runtime                   0
spoken_languages          0
status                    0
tagline                 840
title                     0
vote_average              0
vote_count                0
cast                      0
crew                      0
website                   0
dtype: int64


In [105]:
# Nueva variable weight average o puntuación ponderada
# Si bien tenemos el voto promedio ya como dato (vote_average), no es lo mismo una pelicula con 1 voto y 9 de voto promedio que otra con 500 votos y un voto promedio de 8.5
v= df_movies['vote_count'] # Cantidad de votos
R=df_movies['vote_average'] # Voto Promedio
C=df_movies['vote_average'].mean()
m=df_movies['vote_count'].quantile(0.7)

df_movies['weighted_average'] = (R*v+C*m)/(v+m)

# Nueva variable profit o rentabilidad de cada película
df_movies['profit'] = df_movies['revenue'] - df_movies['budget']

# Nueva variable season o temporada, en base a la fecha de lanzamiento de la pelicula determinamos a que temporada pertenece (temporadas USA)
def get_season(date):
    if (date.month == 3 and date.day >= 20) or (date.month in [4, 5]) or (date.month == 6 and date.day < 21):
        return 'spring'
    elif (date.month == 6 and date.day >= 21) or (date.month in [7, 8]) or (date.month == 9 and date.day < 22):
        return 'summer'
    elif (date.month == 9 and date.day >= 22) or (date.month in [10, 11]) or (date.month == 12 and date.day < 21):
        return 'autumn'
    else:
        return 'winter'

# Aplicar la función a la columna 'release_date' para obtener la estación del año
df_movies['season'] = df_movies['release_date'].apply(get_season)

df_movies[['original_title','weighted_average','profit','season']].head()

Unnamed: 0_level_0,original_title,weighted_average,profit,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
19995,Avatar,7.15,2550965087,autumn
285,Pirates of the Caribbean: At World's End,6.81,661000000,spring
206647,Spectre,6.28,635674609,autumn
49026,The Dark Knight Rises,7.51,834939099,summer
49529,John Carter,6.1,24139100,winter


In [106]:
#Veamos algunos datos generales de los campos númericos
print(df_movies.describe())

            budget  popularity                   release_date       revenue  \
count      4799.00     4799.00                           4799       4799.00   
mean   29065933.22       21.51  2002-12-25 03:47:26.843092480   82329203.47   
min           0.00        0.00            1916-09-04 00:00:00          0.00   
25%      800000.00        4.69            1999-07-11 12:00:00          0.00   
50%    15000000.00       12.93            2005-09-30 00:00:00   19184015.00   
75%    40000000.00       28.35            2011-02-14 00:00:00   92956519.00   
max   380000000.00      875.58            2017-02-03 00:00:00 2787965087.00   
std    40732511.64       31.82                            NaN  162907644.10   

       runtime  vote_average  vote_count  weighted_average        profit  
count  4799.00       4799.00     4799.00           4799.00       4799.00  
mean    106.90          6.09      690.79              6.22   53263270.25  
min       0.00          0.00        0.00              4.68 -165

In [107]:
#Veamos cuantos campos en cero tenemos
nun_missing = (df_movies[['budget','popularity','revenue','runtime','vote_average','vote_count','weighted_average','profit']] == 0).sum()
print(nun_missing)

budget              1036
popularity             0
revenue             1423
runtime               34
vote_average          61
vote_count            60
weighted_average       0
profit               891
dtype: int64


In [108]:
promedios = df_movies[['runtime','vote_average','vote_count']].mean()
print(promedios)

#Reemplacemos todos los campos en cero por NaN
df_movies[['budget','popularity','revenue','runtime','vote_average','vote_count','weighted_average']] = df_movies[['budget','popularity','revenue','runtime','vote_average','vote_count','weighted_average']].replace(0, np.nan)

df_movies[['runtime','vote_average','vote_count']] = df_movies[['runtime','vote_average','vote_count']].fillna(promedios)

nun_missing = (df_movies[['budget','popularity','revenue','runtime','vote_average','vote_count','weighted_average','profit']] == 0).sum()
print(nun_missing)

runtime        106.90
vote_average     6.09
vote_count     690.79
dtype: float64
budget                0
popularity            0
revenue               0
runtime               0
vote_average          0
vote_count            0
weighted_average      0
profit              891
dtype: int64


In [109]:
print(df_movies.isnull().sum())

budget                  1036
genres                     0
keywords                   0
original_language          0
original_title             0
overview                   0
popularity                 0
production_companies       0
production_countries       0
release_date               0
revenue                 1423
runtime                    0
spoken_languages           0
status                     0
tagline                  840
title                      0
vote_average               0
vote_count                 0
cast                       0
crew                       0
website                    0
weighted_average           0
profit                     0
season                     0
dtype: int64


In [110]:
# Pasemos los campos con datos en formato Json a lista
def convert(texto):
    lista=[]
    for i in ast.literal_eval(texto):
        lista.append(i['name'])
    return lista

df_movies['genres'] = df_movies['genres'].apply(convert)
df_movies['keywords'] = df_movies['keywords'].apply(convert)
df_movies['spoken_languages'] = df_movies['spoken_languages'].apply(convert)

def convert(texto):
    lista=[]
    contador=0
    for i in ast.literal_eval(texto):
        if contador<3:
            lista.append(i['name'])
        contador+=1
    return lista

# Nos quedamos solo con los 3 principales actores de la pelicula
df_movies['cast'] = df_movies['cast'].apply(convert)

def convert(texto):
    lista=[]
    contador=0
    for i in ast.literal_eval(texto):
        if contador<1:
            lista.append(i['name'])
        contador+=1
    return lista

# Filtramos para quedarnos con la principal compañia productora, principal pais y el director
df_movies['production_companies'] = df_movies['production_companies'].apply(convert)
df_movies['production_countries'] = df_movies['production_countries'].apply(convert)
df_movies['crew'] = df_movies['crew'].apply(convert)

df_movies.head()

Unnamed: 0_level_0,budget,genres,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,...,tagline,title,vote_average,vote_count,cast,crew,website,weighted_average,profit,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19995,237000000.0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.44,"[Ingenious Film Partners, Twentieth Century Fo...","[United States of America, United Kingdom]",2009-12-10,...,Enter the World of Pandora.,Avatar,7.2,11800.0,"[Sam Worthington, Zoe Saldana, Sigourney Weave...","[Stephen E. Rivkin, Rick Carter, Christopher B...",True,7.15,2550965087,autumn
285,300000000.0,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.08,"[Walt Disney Pictures, Jerry Bruckheimer Films...",[United States of America],2007-05-19,...,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500.0,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[Dariusz Wolski, Gore Verbinski, Jerry Bruckhe...",True,6.81,661000000,spring
206647,245000000.0,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.38,"[Columbia Pictures, Danjaq, B24]","[United Kingdom, United States of America]",2015-10-26,...,A Plan No One Escapes,Spectre,6.3,4466.0,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[Thomas Newman, Sam Mendes, Anna Pinnock, John...",True,6.28,635674609,autumn
49026,250000000.0,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31,"[Legendary Pictures, Warner Bros., DC Entertai...",[United States of America],2012-07-16,...,The Legend Ends,The Dark Knight Rises,7.6,9106.0,"[Christian Bale, Michael Caine, Gary Oldman, A...","[Hans Zimmer, Charles Roven, Christopher Nolan...",True,7.51,834939099,summer
49529,260000000.0,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",en,John Carter,"John Carter is a war-weary, former military ca...",43.93,[Walt Disney Pictures],[United States of America],2012-03-07,...,"Lost in our world, found in another.",John Carter,6.1,2124.0,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[Andrew Stanton, Andrew Stanton, John Lasseter...",True,6.1,24139100,winter
