In [1]:
# 1) Importación de librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import os, zipfile

In [2]:
# 2) Load the dataset
# The CSV is read from the notebook's working directory.
csv_path = "CarPrice_Assignment.csv"
df = pd.read_csv(csv_path)
print(df.shape)  # (rows, columns)
df.head()

(205, 26)


Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [3]:
# 3) Variable classification
# Numeric columns are selected by dtype; object-dtype columns are treated as categorical.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
categorical_cols = list(df.select_dtypes(include=['object']).columns)

# A categorical column is considered binary when it has exactly two distinct values.
distinct_per_categorical = df[categorical_cols].nunique()
binary_cols = distinct_per_categorical[distinct_per_categorical == 2].index.tolist()

# Ordinal columns are chosen by meaning, not dtype; keep only those present in the frame.
ordinal_candidates = [
    name
    for name in ('symboling', 'doornumber', 'cylindernumber')
    if name in df.columns
]

def var_type(col):
    """Return the type label assigned to column name ``col``.

    Membership is checked against the module-level lists in priority order:
    ``ordinal_candidates``, ``binary_cols``, ``numeric_cols``,
    ``categorical_cols``. The first list containing ``col`` decides the label;
    a name found in none of them is labelled ``'desconocida'``.
    """
    # Order matters: a column present in several lists takes the first label.
    labelled_groups = (
        (ordinal_candidates, 'ordinal'),
        (binary_cols, 'binaria'),
        (numeric_cols, 'numérica'),
        (categorical_cols, 'categórica'),
    )
    for members, label in labelled_groups:
        if col in members:
            return label
    return 'desconocida'

# One row per column of df: its name, assigned type label, dtype and distinct-value count.
all_columns = df.columns
var_summary = pd.DataFrame({
    "variable": all_columns,
    "tipo": list(map(var_type, all_columns)),
    "dtype": df.dtypes.astype(str).to_numpy(),
    "n_unique": df.nunique().to_numpy(),
})
var_summary

Unnamed: 0,variable,tipo,dtype,n_unique
0,car_ID,numérica,int64,205
1,symboling,ordinal,int64,6
2,CarName,categórica,object,147
3,fueltype,binaria,object,2
4,aspiration,binaria,object,2
5,doornumber,ordinal,object,2
6,carbody,categórica,object,5
7,drivewheel,categórica,object,3
8,enginelocation,binaria,object,2
9,wheelbase,numérica,float64,53


In [4]:
# 4) Descriptive statistics
# Transposed so that each numeric column is a row and each statistic a column.
desc_num = df.loc[:, numeric_cols].describe().transpose()
desc_num

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
car_ID,205.0,103.0,59.322565,1.0,52.0,103.0,154.0,205.0
symboling,205.0,0.834146,1.245307,-2.0,0.0,1.0,2.0,3.0
wheelbase,205.0,98.756585,6.021776,86.6,94.5,97.0,102.4,120.9
carlength,205.0,174.049268,12.337289,141.1,166.3,173.2,183.1,208.1
carwidth,205.0,65.907805,2.145204,60.3,64.1,65.5,66.9,72.3
carheight,205.0,53.724878,2.443522,47.8,52.0,54.1,55.5,59.8
curbweight,205.0,2555.565854,520.680204,1488.0,2145.0,2414.0,2935.0,4066.0
enginesize,205.0,126.907317,41.642693,61.0,97.0,120.0,141.0,326.0
boreratio,205.0,3.329756,0.270844,2.54,3.15,3.31,3.58,3.94
stroke,205.0,3.255415,0.313597,2.07,3.11,3.29,3.41,4.17


In [5]:
# 5) Frequency of categorical values
# Keep the ten most frequent values of every categorical column.
cat_stats = {c: df[c].value_counts().head(10) for c in categorical_cols}

# Save to CSV. The rows go through pandas' CSV writer instead of hand-built
# f-strings so that a category containing a comma, quote or newline is quoted
# rather than producing a malformed row.
top_categories = pd.DataFrame(
    [(c, k, v) for c, vc in cat_stats.items() for k, v in vc.items()],
    columns=["columna", "categoria", "frecuencia"],
)
top_categories.to_csv("top_categorias_por_variable.csv", index=False, encoding="utf-8")
print("Archivo generado: top_categorias_por_variable.csv")

Archivo generado: top_categorias_por_variable.csv


In [6]:
# 6) Histograms of the numeric columns
# One PNG per numeric column is written into plots_hist/.
os.makedirs("plots_hist", exist_ok=True)
for col in numeric_cols:
    fig, ax = plt.subplots()
    df[col].hist(ax=ax, bins=20, edgecolor='black')
    ax.set_title(f"Histograma de {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frecuencia")
    fig.savefig(f"plots_hist/hist_{col}.png")
    # Close each figure once saved so open figures do not accumulate in the loop.
    plt.close(fig)

In [7]:
# 7) Correlation matrix
# Pairwise correlations between the numeric columns, drawn as a heatmap.
corr = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(corr, cmap="coolwarm", aspect="auto")
fig.colorbar(im)

# Label both axes with the column names; x labels are vertical.
tick_positions = range(len(corr.columns))
ax.set_xticks(tick_positions)
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(corr.columns)
ax.set_title("Matriz de correlación")
fig.tight_layout()
fig.savefig("correlation_matrix.png")
plt.close(fig)

# Correlation of every numeric column with price, strongest positive first.
price_corr = corr['price'].sort_values(ascending=False)
price_corr.to_csv("correlaciones_con_price.csv")
print("Archivos generados: correlation_matrix.png, correlaciones_con_price.csv")


Archivos generados: correlation_matrix.png, correlaciones_con_price.csv


In [8]:
# 8) Numeric variables vs price (scatterplots)

target = 'price'
# The six numeric columns with the largest absolute correlation with the target
# (the target itself is dropped first).
top_corrs = corr[target].drop(target).abs().sort_values(ascending=False).head(6).index

os.makedirs("plots_scatter", exist_ok=True)
for col in top_corrs:
    fig, ax = plt.subplots()
    ax.scatter(df[col], df[target])
    ax.set_xlabel(col)
    # Labels, title and filename are derived from `target` so they stay
    # consistent with the plotted data if the target column is changed.
    ax.set_ylabel(target)
    ax.set_title(f"{col} vs {target}")
    fig.savefig(f"plots_scatter/scatter_{col}_vs_{target}.png")
    # Close each figure once saved so open figures do not accumulate in the loop.
    plt.close(fig)


In [9]:
# 9) Categorical variables vs price (boxplots and medians)

os.makedirs("plots_box", exist_ok=True)
categoricas_sel = ['fueltype','aspiration','carbody','drivewheel','enginetype','cylindernumber','doornumber','enginelocation']
summary_by_cat = {}

for c in categoricas_sel:
    if c not in df.columns:
        continue  # skip selected columns that this dataset does not have

    # Boxplot of price for each category of c
    df.boxplot(column='price', by=c, grid=False)
    plt.title(f"Distribución de price por {c}")
    plt.suptitle("")  # blank out the figure-level super-title
    plt.xticks(rotation=45)
    # bbox_inches="tight" grows the saved area to fit the rotated tick labels
    # and the x-label, which could otherwise be cut off at the figure edge.
    plt.savefig(f"plots_box/box_{c}_vs_price.png", bbox_inches="tight")
    plt.close()

    # Median price per category
    summary_by_cat[c] = df.groupby(c)['price'].median()

# Write the medians through pandas' CSV writer instead of hand-built f-strings
# so that a category containing a comma or quote is quoted rather than
# producing a malformed row. na_rep="nan" keeps the text the f-string version
# wrote for a missing median.
median_rows = [
    (col_name, cat, val)
    for col_name, ser in summary_by_cat.items()
    for cat, val in ser.items()
]
median_table = pd.DataFrame(median_rows, columns=["columna", "categoria", "median_price"])
median_table.to_csv("median_price_by_category.csv", index=False, encoding="utf-8", na_rep="nan")
print("Archivo generado: median_price_by_category.csv")


Archivo generado: median_price_by_category.csv


In [10]:
# 10) Bundle every generated plot into one zip archive
plot_dirs = ("plots_hist", "plots_scatter", "plots_box")

with zipfile.ZipFile("EDA_plots.zip", mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
    for plot_dir in plot_dirs:
        # Walk each plot folder and add every file found, keeping its relative path.
        for current_dir, _subdirs, filenames in os.walk(plot_dir):
            for filename in filenames:
                archive.write(os.path.join(current_dir, filename))
print("Archivo generado: EDA_plots.zip")


Archivo generado: EDA_plots.zip
