# Procesamiento de Datos

##### Importación de Librerías y Carga del Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Locations of the raw input file and of the cleaned output file
data_path = "../../data/movies.csv"
clean_data_path = "../../data/movies_clean.csv"

# Read the raw CSV using the ISO-8859-1 encoding
raw_frame = pd.read_csv(filepath_or_buffer=data_path, encoding="ISO-8859-1")

# Parse 'releaseDate' into datetimes; errors="coerce" turns unparseable
# values into NaT instead of raising
release_dates = pd.to_datetime(raw_frame["releaseDate"], errors="coerce")
df = raw_frame.assign(releaseDate=release_dates)

# Preview the first rows (last expression, so the notebook displays it)
df.head()

##### Revisión General del Dataset

In [None]:
# Print a concise summary of the DataFrame
df.info()

# Count the missing values in each column; kept as the last expression so
# the notebook displays the resulting Series
missing_per_column = df.isna().sum()
missing_per_column

##### Clasificación Automática de Variables

In [None]:
# Classify every column by statistical variable type, inferred from its dtype.
# Dtype-family predicates are used instead of exact dtype names so that
# columns stored as int32, float32, nullable integers, booleans, categoricals
# or the dedicated string dtype are not silently left out of the table.
classification = {}

for column in df.columns:
    dtype = df[column].dtype
    if pd.api.types.is_integer_dtype(dtype):
        classification[column] = "Cuantitativa Discreta"
    elif pd.api.types.is_float_dtype(dtype):
        classification[column] = "Cuantitativa Continua"
    else:
        # Text and datetime columns (as before), plus any other non-numeric
        # dtype, are treated as nominal.
        classification[column] = "Cualitativa Nominal"

# Manual overrides: these columns are classified by their meaning rather than
# by the dtype pandas inferred when reading the file.
continuous_vars = ["budget", "revenue", "runtime", "popularity", "voteAvg", "actorsPopularity"]
discrete_vars = ["castWomenAmount", "castMenAmount"]

classification.update(dict.fromkeys(continuous_vars, "Cuantitativa Continua"))
classification.update(dict.fromkeys(discrete_vars, "Cuantitativa Discreta"))

# Show the classification as a two-column table (last expression of the cell)
pd.DataFrame(list(classification.items()), columns=["Variable", "Tipo"])

##### Conversión de Variables Numéricas y Categóricas

In [None]:
# Coerce all continuous columns to numeric dtypes, column by column;
# errors="coerce" replaces values that cannot be parsed with NaN
df[continuous_vars] = df[continuous_vars].apply(pd.to_numeric, errors="coerce")

# Lower-case the categorical text columns to avoid inconsistencies caused by
# different capitalisation of the same value
text_columns = ("genres", "productionCompany", "productionCountry", "originalLanguage")
for text_column in text_columns:
    lowered = df[text_column].str.lower()
    df[text_column] = lowered

##### Visualización de Distribución de Variables Numéricas

In [None]:
# Draw one histogram with a KDE overlay for each continuous variable.
for var in continuous_vars:
    # Open a fresh figure on every iteration: plt.show() finishes the current
    # figure, so a single figure created before the loop would apply the
    # requested size to the first histogram only.
    plt.figure(figsize=(10, 5))
    sns.histplot(df[var].dropna(), kde=True, bins=30)
    plt.title(f"Distribución de {var}")
    plt.xlabel(var)
    plt.ylabel("Frecuencia")
    plt.show()

##### Pruebas de Normalidad (Shapiro-Wilk y Kolmogorov-Smirnov)

In [None]:
# Run Shapiro-Wilk and Kolmogorov-Smirnov normality tests on every continuous
# variable and collect the p-values in a table.
normality_results = []

for var in continuous_vars:
    data = df[var].dropna()
    n_obs = len(data)

    # Shapiro-Wilk needs at least 3 observations. It is also skipped from
    # 5000 observations on (unchanged threshold; presumably because SciPy
    # documents that its p-value may be inaccurate for very large samples).
    if 3 <= n_obs < 5000:
        _, shapiro_p_value = stats.shapiro(data)
        shapiro_text = f"{shapiro_p_value:.6f}"
    else:
        shapiro_text = "N/A"

    # Kolmogorov-Smirnov: compare against a normal distribution with the
    # sample's own mean and standard deviation. A bare 'norm' means N(0, 1),
    # which rejects any variable that is not already standardized, whatever
    # its shape.
    # NOTE: because the parameters are estimated from the same sample, the
    # resulting p-value is only approximate (it tends to be too large).
    sample_std = data.std(ddof=1)  # NaN for fewer than 2 observations
    if sample_std > 0:
        ks_p_value = stats.kstest(data, "norm", args=(data.mean(), sample_std)).pvalue
    else:
        # Empty, single-value or constant sample: the test is not defined
        ks_p_value = float("nan")

    normality_results.append({
        "Variable": var,
        "Shapiro-Wilk p-valor": shapiro_text,
        "Kolmogorov-Smirnov p-valor": ks_p_value,
    })

# Show the results as a table (last expression of the cell)
pd.DataFrame(normality_results)

##### Tablas de Frecuencias de Variables Cualitativas

In [None]:
# Categorical columns whose most frequent values are reported
qualitative_vars = [
    "genres",
    "productionCompany",
    "productionCountry",
    "originalLanguage",
]

# Print a header followed by the ten most frequent values of each column
for var in qualitative_vars:
    print(f"\n🔹 {var}:")
    top_ten = df[var].value_counts().head(10)
    print(top_ten)