In [None]:
# Importar librerías 
import ast

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
# Notebook-wide display settings: let pandas show up to 50 columns of a frame
# and draw every figure with the ggplot style sheet.
pd.set_option('display.max_columns', 50)
plt.style.use('ggplot')

In [None]:
# Importar la base de datos a utilizar (movies_dataset.csv) y leer los datos
df = pd.read_csv('Dataset/movies_dataset.csv')

In [None]:
df.shape # Ver cuantos renglones y columnas hay en df

In [None]:
df.head(5) # Ver las 5 primeros renglones del df

In [None]:
df.columns

In [None]:
# Keep the two model inputs (overview, title) plus the columns that later cells of this
# notebook read: 'genres' (un-nested below) and 'popularity' / 'vote_average' (EDA plots).
# Selecting only overview/title here makes those cells fail with KeyError on a fresh
# Restart & Run All.
df = df[[
    'genres',
    'overview',
    'popularity',
    'title',
    'vote_average',
]]

In [None]:
df.dtypes # Data type of each column

In [None]:
df.describe() # Descriptive statistics of df

In [None]:
# Un-nest the genres column: each cell holds the text of a list of dictionaries.
# ast.literal_eval parses only Python literals, so (unlike eval) the text read from the
# CSV cannot execute arbitrary code while it is being parsed.
df2 = pd.json_normalize(df['genres'].map(ast.literal_eval))

In [None]:
df2.head()  # First rows of the un-nested genres frame

In [None]:
# Split every positional column of df2 (0, 1, 2, ... — one per genre slot) into flat
# 'id<k>' / 'name<k>' columns, then drop the nested originals.
# The slot columns are read from df2 itself instead of assuming exactly 8 of them, so the
# cell keeps working if the maximum number of genres per movie in the dataset changes.
col = [c for c in df2.columns if isinstance(c, (int, np.integer))]  # nested slot columns

Id = []     # names of the generated id columns, reused by later cells
Names = []  # names of the generated name columns, reused by later cells
for i in col:
    nid = 'id' + str(i)
    nam = 'name' + str(i)
    Id.append(nid)
    Names.append(nam)
    # Normalize each slot once and reuse the result for both fields
    slot = pd.json_normalize(df2[i])
    df2[nid] = slot['id']
    df2[nam] = slot['name']

# The nested columns are no longer needed once they have been flattened
df2 = df2.drop(columns=col)

In [None]:
df2['id_genres'] = df2[Id].apply(
    lambda x: ',' .join(x.dropna().astype(str)),
    axis = 1)

In [None]:
df2['name_genres'] = df2[Names].apply(
    lambda x: ',' .join(x.dropna().astype(str)),
    axis = 1)

In [None]:
df2

In [None]:
# Turn empty strings (rows where no genre value was present) into NaN so they count as
# missing values.
# The result is assigned back to the column: replace(..., inplace=True) called on
# df2['col'] acts on an intermediate Series, which pandas warns about and which no longer
# updates df2 once Copy-on-Write is enabled.
df2['id_genres'] = df2['id_genres'].replace('', np.nan)
df2['name_genres'] = df2['name_genres'].replace('', np.nan)

In [None]:
df2

In [None]:
df2.drop(columns = Id, inplace = True)
df2.drop(columns = Names, inplace = True)
df2

In [None]:
df2.isna().sum()

In [None]:
# Se concatenan las columnas nuevas al df original
df_final = pd.concat([df, df2], axis=1)

In [None]:
df_final

In [None]:
# Se eliminan la columna genres que ya fue desanidada
df_final.drop(columns=['genres', 'id_genres'], inplace=True)

In [None]:
df_final.dtypes

In [None]:
df_final['popularity'] = pd.to_numeric(df_final['popularity'], errors='coerce')


In [None]:
# Verificar si hay valores nulos
df_final.isna().sum()

In [None]:
# Eliminar valores duplicados
# df.loc[df.duplicated()]
df_final.duplicated().sum()

In [None]:
df.duplicated().sum()

In [None]:
df_final = df_final.drop_duplicates().reset_index(drop=True)


In [None]:
df = df.drop_duplicates().reset_index(drop=True)


In [None]:
df_final

In [None]:
df

In [None]:
# Distribución de características
# Serie que indica la frecuencia de ocurrencia de cada título de película en el DataFrame
ax = df_final['title'].value_counts().head(10).plot(kind = 'bar', title="Top 10 Movie Title")
ax.set_xlabel("Movie Title")
ax.set_ylabel("Count")

In [None]:
ax = df_final['popularity'].value_counts().head(10).plot(kind = 'bar', title='Popularity')
ax.set_xlabel('Popularity')
ax.set_ylabel("Count")

In [None]:
ax = df_final['vote_average'].value_counts().head(10).plot(kind = 'bar', title='Vote Average')
ax.set_xlabel('Vote Average')
ax.set_ylabel("Count")

In [None]:
ax = df_final['name_genres'].value_counts().head(10).plot(kind = 'bar', title='Name Genres')
ax.set_xlabel('Name Genres')
ax.set_ylabel("Count")

In [None]:
# Relación de características
df_final.plot(kind = 'scatter', 
              x = 'vote_average',
              y = 'popularity',
              title = 'vote_average VS popularity')
plt.show()

In [None]:
sns.scatterplot(x = 'vote_average',
              y = 'popularity',
              data=df_final)

In [None]:
df_final[['popularity', 'vote_average']].dropna().corr() # Ver la correlación netre variables

In [None]:
# Number of movies for each popularity value, ordered by popularity
popularity_counts = df_final['popularity'].value_counts().sort_index()

# Line plot of those counts
x_values, y_values = popularity_counts.index, popularity_counts.values
plt.plot(x_values, y_values)
plt.xlabel('Popularity')
plt.ylabel('Number of Movies')
plt.show()

In [None]:
# Optional: save df_final to a CSV file (disabled; the last cell of the notebook writes df instead)
#df_final.to_csv('Dataset/movies_ML.csv', index=False)

In [None]:
# Keep only the model inputs and cap the table at the first 9,000 distinct rows.
# Selecting the columns and dropping duplicates here, instead of relying on cells far
# above, guarantees the table has exactly these two columns whatever else df carries.
MODEL_COLUMNS = ['overview', 'title']
N_ROWS = 9000  # head() is deterministic; df.sample(n=N_ROWS, random_state=...) would give a random subset

df = (
    df[MODEL_COLUMNS]
    .drop_duplicates()
    .head(N_ROWS)
    .reset_index(drop=True)  # renumber the rows of the subset from 0
)
df

In [None]:
# Write the model dataset into the same folder the input CSV was read from.
# A forward slash is used because the backslash in 'Dataset\movies_ML.csv' is an invalid
# escape sequence ('\m') and is a path separator only on Windows; elsewhere it would
# create a file literally named 'Dataset\movies_ML.csv' in the working directory.
df.to_csv('Dataset/movies_ML.csv', index=False)