In [None]:
import pandas as pd
import numpy as np
import os
!pip install --gdown --quiet
import gdown
import zipfile
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# 1. Download and extract the data
# The zip archive is fetched from Google Drive by file id, unpacked into
# `folder`, and the CSV named `nombre_archivo` is loaded into `msrp_df`.
file_id = '1CcnzFSViob2EJ9beSazDBCw29WgIm1_u'
output = 'data.zip'
folder = 'data'
nombre_archivo = 'data.csv'

try:
    print("Descargando archivo zip desde Google Drive...")
    downloaded = gdown.download(f"https://drive.google.com/uc?id={file_id}", output, quiet=False)
    # gdown releases that report a failed download by returning None (instead
    # of raising) would otherwise let a stale data.zip from a previous run be
    # extracted silently.
    if downloaded is None:
        raise RuntimeError(f"No se pudo descargar {output} desde Google Drive (id={file_id})")

    print(f"Extrayendo archivos a {folder}...")
    os.makedirs(folder, exist_ok=True)
    with zipfile.ZipFile(output, 'r') as zip_ref:
        zip_ref.extractall(folder)

    csv_path = os.path.join(folder, nombre_archivo)
    print(f"Cargando dataset desde {csv_path}...")
    msrp_df = pd.read_csv(csv_path)

except Exception as e:
    # Surface a short message in the cell output, then re-raise so the
    # notebook stops instead of continuing without data.
    print(f"Error: {str(e)}")
    raise



In [23]:
# 2. Análisis de datos
def analyze_data(msrp_df):
    """Print a basic exploratory summary of a DataFrame.

    Prints, in order: the ``info()`` report, the ``describe()`` statistics,
    the number of missing values per column, and the overall percentage of
    missing cells.

    Args:
        msrp_df: The DataFrame to summarize.

    Returns:
        The same DataFrame object that was passed in.
    """
    print("\n=== Información del DataFrame ===")
    msrp_df.info()

    print("\n=== Estadísticas descriptivas ===")
    summary_stats = msrp_df.describe()
    print(summary_stats)

    print("\n=== Valores faltantes ===")
    nulls_per_column = msrp_df.isna().sum()
    print(nulls_per_column)

    # Overall share of missing cells: total nulls over rows * columns.
    cell_count = np.prod(msrp_df.shape)
    missing_ratio = nulls_per_column.sum() / cell_count
    print(f"\nPorcentaje de valores faltantes: {missing_ratio*100:.2f}%")

    return msrp_df

In [None]:
# 3. Limpieza de datos
def _fill_with_mode(series):
    """Fill missing values of ``series`` with its most frequent value.

    The series is returned unchanged when it has no mode (i.e. it contains no
    non-missing values), so an all-NaN input does not raise.
    """
    modes = series.mode()
    if modes.empty:
        return series
    return series.fillna(modes.iloc[0])


def clean_data(msrp_df):
    """Impute missing values in the columns used by the analysis.

    Imputation rules:
        * ``Engine HP``: column median.
        * ``Engine Cylinders`` / ``Number of Doors``: column mode.
        * ``Market Category``: the literal ``'Unknown'``.
        * ``Engine Fuel Type``: mode of the rows sharing the same ``Make``;
          values stay missing when that group has no known fuel type.

    Args:
        msrp_df: DataFrame containing the columns listed above plus ``Make``.

    Returns:
        A new DataFrame with the imputations applied. The input frame is not
        modified, so re-running the calling cell always starts from the same
        data.
    """
    cleaned = msrp_df.copy()

    cleaned['Engine HP'] = cleaned['Engine HP'].fillna(cleaned['Engine HP'].median())
    for col in ('Engine Cylinders', 'Number of Doors'):
        cleaned[col] = _fill_with_mode(cleaned[col])
    cleaned['Market Category'] = cleaned['Market Category'].fillna('Unknown')

    # dropna=False keeps rows whose Make is missing in their own group;
    # with the default, those rows are excluded from the transform and the
    # assignment would overwrite their existing fuel type with NaN.
    cleaned['Engine Fuel Type'] = (
        cleaned.groupby('Make', dropna=False)['Engine Fuel Type'].transform(_fill_with_mode)
    )

    return cleaned

In [25]:
# 4. Visualización (guardará imágenes en lugar de mostrarlas)
def generate_visualizations(msrp_df):
    """Write the sorted unique values of every object-dtype column to a report.

    Despite its name, this function draws no figures: it creates the
    ``output`` directory if needed and writes ``output/unique_values.txt``
    with one section per object-dtype column.

    Args:
        msrp_df: DataFrame whose object-dtype columns are listed.

    Returns:
        None.
    """
    os.makedirs("output", exist_ok=True)

    # Unique values for the categorical (object-dtype) columns
    object_columns = msrp_df.select_dtypes(include=['object']).columns
    # Explicit encoding so the report is written the same way on every platform.
    with open("output/unique_values.txt", "w", encoding="utf-8") as f:
        for col in object_columns:
            # sort_values places missing values last instead of comparing them
            # with strings; an in-place ndarray sort raises TypeError on a
            # column that mixes strings and NaN.
            unique_values = msrp_df[col].drop_duplicates().sort_values().to_numpy()
            f.write(f"Columna: {col}\n")
            f.write(f"{unique_values}\n")
            f.write("-"*30 + "\n")

In [27]:
# This code has been moved to the main execution block in cell lF0gFKq9qS1G

In [None]:
# Ejecución principal
if __name__ == "__main__":
    # 1. Data: `msrp_df` is expected to have been loaded already by the
    #    download cell at the top of the notebook.

    # 2. Initial analysis
    print("\n=== Primeras filas del dataset ===")
    print(msrp_df.head())

    msrp_df = analyze_data(msrp_df)

    # 3. Cleaning
    msrp_df = clean_data(msrp_df)

    # 4. Save the cleaned data
    os.makedirs("data", exist_ok=True)
    clean_path = "data/msrp_clean.csv"
    msrp_df.to_csv(clean_path, index=False)
    print(f"\nDatos limpios guardados en: {clean_path}")

    # 5. Generate the unique-values report
    generate_visualizations(msrp_df)

    # Boxplots for the numeric columns (MSRP and Year are excluded).
    # Create the target directory here as well so this section does not
    # depend on generate_visualizations having created it.
    os.makedirs("output", exist_ok=True)
    df_num = msrp_df.drop(columns=['MSRP','Year'], errors='ignore')
    num_cols = df_num.select_dtypes(include=['int64', 'float64']).columns
    # Keep only the columns whose maximum value is greater than 1.
    filtered_cols = [col for col in num_cols if df_num[col].max() > 1]

    # MinMaxScaler cannot be fitted on a frame with zero columns, so the
    # plotting step is skipped entirely when nothing passes the filter.
    if filtered_cols:
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(df_num[filtered_cols])
        scaled_df = pd.DataFrame(scaled_data, columns=filtered_cols)

        for col in filtered_cols:
            fig, ax = plt.subplots(figsize=(8, 6))
            scaled_df.boxplot(column=col, ax=ax)
            ax.set_title(f'Boxplot de {col} (escalado 0-1)')
            ax.set_ylabel('Valor escalado')
            fig.tight_layout()
            fig.savefig(f"output/boxplot_{col}.png")
            # Close each figure after saving so figures do not accumulate.
            plt.close(fig)

    print("\nAnálisis completado. Resultados guardados en /output")

Descargando archivo zip desde Google Drive...


Downloading...
From: https://drive.google.com/uc?id=1CcnzFSViob2EJ9beSazDBCw29WgIm1_u
To: /content/data.zip
100%|██████████| 115k/115k [00:00<00:00, 3.81MB/s]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  msrp_df['Engine HP'].fillna(msrp_df['Engine HP'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  

Extrayendo archivos a data...
Dataset loaded from data/data.csv...

=== Primeras filas del dataset ===
  Make       Model  Year             Engine Fuel Type  Engine HP  \
0  BMW  1 Series M  2011  premium unleaded (required)      335.0   
1  BMW    1 Series  2011  premium unleaded (required)      300.0   
2  BMW    1 Series  2011  premium unleaded (required)      300.0   
3  BMW    1 Series  2011  premium unleaded (required)      230.0   
4  BMW    1 Series  2011  premium unleaded (required)      230.0   

   Engine Cylinders Transmission Type     Driven_Wheels  Number of Doors  \
0               6.0            MANUAL  rear wheel drive              2.0   
1               6.0            MANUAL  rear wheel drive              2.0   
2               6.0            MANUAL  rear wheel drive              2.0   
3               6.0            MANUAL  rear wheel drive              2.0   
4               6.0            MANUAL  rear wheel drive              2.0   

                         Market

In [29]:
# 3. Cleaning: impute the missing values of the loaded dataset
msrp_df = clean_data(msrp_df)

# 4. Persist the cleaned dataset as CSV (without the index column)
clean_path = "data/msrp_clean.csv"
os.makedirs("data", exist_ok=True)
msrp_df.to_csv(path_or_buf=clean_path, index=False)
print(f"\nDatos limpios guardados en: {clean_path}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  msrp_df['Engine HP'].fillna(msrp_df['Engine HP'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  msrp_df['Engine Cylinders'].fillna(msrp_df['Engine Cylinders'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never 


Datos limpios guardados en: data/msrp_clean.csv


In [30]:
# 5. Generate visualizations: calls generate_visualizations on the current
#    msrp_df, then prints a completion message pointing at /output.
generate_visualizations(msrp_df)
print("\nAnálisis completado. Resultados guardados en /output")


Análisis completado. Resultados guardados en /output
