Imports Ej 4 práctica 2

In [None]:
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn import set_config
from pandas.plotting import scatter_matrix
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

Ejercicio 4 práctica 2, auto-mpg dataset

In [None]:
# Load the Auto MPG dataset and take a first look at it.
carsConsume = pd.read_csv(Path("Datasets/auto-mpg[1].csv"))
# NOTE: DataFrame.hist only draws numeric columns, so if "horsepower" was read
# as text (because of the "?" placeholders handled below) it will not appear
# in these histograms.
carsConsume.hist(bins=50, figsize=(12,8))
plt.show()
print(carsConsume.head())
print(carsConsume.columns)

# Replace the "?" placeholder with NaN and convert horsepower to float.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected Series: the in-place form acts on
# a temporary object and does not update the DataFrame when pandas
# Copy-on-Write is active, which would leave "?" in the column and make the
# float conversion fail.
carsConsume["horsepower"] = carsConsume["horsepower"].replace("?", np.nan).astype(float)

print(carsConsume.info())

# Target and feature matrix for the whole dataset ("car name" is dropped only
# if the column exists, because of errors="ignore").
y = carsConsume["mpg"]
X = carsConsume.drop(["mpg", "car name"], axis="columns", errors="ignore")

# 80/20 split with a fixed seed, stratified on the number of cylinders.
train_carsConsume, test_carsConsume = train_test_split(
    carsConsume,
    test_size=0.2,
    random_state=42,
    stratify=carsConsume["cylinders"],
)

# Separate the target from the features in each partition.
y_train, y_test = train_carsConsume["mpg"], test_carsConsume["mpg"]
X_train = train_carsConsume.drop(["mpg", "car name"], axis="columns", errors="ignore")
X_test = test_carsConsume.drop(["mpg", "car name"], axis="columns", errors="ignore")

# Scatter plot of horsepower against weight; the title is set on the axes
# returned by the plotting call.
carsConsume.plot.scatter(
    x="weight", y="horsepower", grid=True, figsize=(8, 6)
).set_title("Relación entre peso y potencia")
plt.show()

# Pairwise scatter matrix (histograms on the diagonal) for the main numeric
# attributes, with a figure-level title.
attributes = ["mpg", "horsepower", "weight", "displacement", "acceleration"]
scatter_matrix(carsConsume[attributes], alpha=0.7, diagonal='hist', figsize=(12, 8))
plt.gcf().suptitle("Matriz de dispersión — Dataset Auto MPG", fontsize=16)
plt.show()

# Engineered ratio features, added as new columns of carsConsume.
# Power-to-weight ratio (engine efficiency).
carsConsume["power_weight_ratio"] = carsConsume["horsepower"].div(carsConsume["weight"])
# Displacement-to-weight ratio (engine size relative to the size of the car).
carsConsume["displacement_per_weight"] = carsConsume["displacement"].div(carsConsume["weight"])
# Acceleration-to-horsepower ratio (how fast the car accelerates per horsepower).
carsConsume["acceleration_per_hp"] = carsConsume["acceleration"].div(carsConsume["horsepower"])

# Correlations between the numeric columns; show those against 'mpg',
# from highest to lowest.
corr_matrix = carsConsume.corr(numeric_only=True)
print(
    corr_matrix["mpg"].sort_values(ascending=False)
)

# Fill missing horsepower values with the column median. The filled Series is
# assigned back to the column rather than using fillna(..., inplace=True) on
# the selected Series, which operates on a temporary object and leaves the
# DataFrame untouched when pandas Copy-on-Write is active.
median_hp = carsConsume["horsepower"].median()
carsConsume["horsepower"] = carsConsume["horsepower"].fillna(median_hp)

# Columns routed to the numeric and to the categorical pipeline.
num_attribs = ["cylinders", "displacement", "horsepower", "weight", "acceleration", "model year"]
cat_attribs = ["origin"]

# Numeric columns: median imputation followed by standardization.
num_pipeline = make_pipeline(SimpleImputer(strategy="median"),StandardScaler())

# Categorical columns: most-frequent imputation followed by one-hot encoding
# (categories unseen during fit are ignored at transform time).
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"),OneHotEncoder(handle_unknown="ignore"))

preprocessing = ColumnTransformer([("num", num_pipeline, num_attribs),("cat", cat_pipeline, cat_attribs)])

set_config(display='diagram')  # Show the pipeline as a diagram
# Fit and apply the preprocessing on the complete dataset to inspect the shape
# of the transformed matrix.
carsConsume_prepared = preprocessing.fit_transform(carsConsume)

print("Shape del dataset procesado:", carsConsume_prepared.shape)

# Fit the preprocessing on the training split only, then apply the fitted
# transformation to the test split.
X_train_prepared = preprocessing.fit_transform(X_train)
X_test_prepared = preprocessing.transform(X_test)

lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, y_train)

# Predict the test split once and reuse the result; the original code ran the
# same prediction twice and computed the RMSE twice.
y_pred = lin_reg.predict(X_test_prepared)
carsConsume_predictions = y_pred

lin_mse = mean_squared_error(y_test, y_pred)
lin_rmse = np.sqrt(lin_mse)

print(f"R²: {r2_score(y_test, y_pred):.3f}")
print(f"RMSE: {lin_rmse:.2f}")

# Compare the first few predictions with the actual values.
print("Primeras 5 predicciones:", carsConsume_predictions[:5].round(1))
print("Valores reales:", y_test.iloc[:5].values)

print("Linear Regression RMSE:", lin_rmse)

Imports Ej 7 práctica 4

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix, classification_report, accuracy_score,
    ConfusionMatrixDisplay, f1_score, precision_score, recall_score
)
from sklearn.preprocessing import OneHotEncoder

Ej 7 Práctica 4

In [None]:
# Load the screen-time vs. mental-wellness dataset from the Datasets folder.
ST_vs_MW = pd.read_csv(
    filepath_or_buffer='Datasets/ScreenTime vs MentalWellness.csv',
)

def categorize_wellness(score):
    """Map a wellness score to one of the labels 'Bajo', 'Medio' or 'Alto'.

    Scores less than or equal to 33 are 'Bajo', scores less than or equal to
    66 are 'Medio', and every other value is 'Alto'.
    """
    # Walk the upper bounds in ascending order; the first one that the score
    # does not exceed decides the label.
    for upper_bound, label in ((33, 'Bajo'), (66, 'Medio')):
        if score <= upper_bound:
            return label
    return 'Alto'

# Derive the class label from the 0-100 wellness index, then drop the raw
# index and the user identifier.
ST_vs_MW['wellness_class'] = ST_vs_MW['mental_wellness_index_0_100'].map(categorize_wellness)
ST_vs_MW = ST_vs_MW.drop(['user_id', 'mental_wellness_index_0_100'], axis=1)

# Every remaining column that is neither categorical nor the label is treated
# as a numerical feature.
categorical_cols = ['gender', 'occupation', 'work_mode']
numerical_cols = [
    col
    for col in ST_vs_MW.columns
    if col != 'wellness_class' and col not in categorical_cols
]

# One-hot encode the categorical columns (first level of each one dropped)
# into a dense array and wrap it in a DataFrame with the generated names.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_cats = encoder.fit_transform(ST_vs_MW[categorical_cols])
encoded_df = pd.DataFrame(
    encoded_cats,
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Feature matrix: numerical columns (index reset so rows line up with
# encoded_df) next to the encoded columns. Labels are forced to strings.
X = pd.concat(
    [ST_vs_MW[numerical_cols].reset_index(drop=True), encoded_df],
    axis="columns",
)
y = ST_vs_MW['wellness_class'].astype(str).values

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Fit both classifiers on the training split (fit returns the estimator).
ST_vs_MW_DT = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_train, y_train)
ST_vs_MW_RF = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the test split with each model.
y_pred_dt, y_pred_rf = ST_vs_MW_DT.predict(X_test), ST_vs_MW_RF.predict(X_test)

# Side-by-side confusion matrices: Decision Tree on the left axes,
# Random Forest on the right axes.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Compute both matrices with the same fixed class order.
cm_dt = confusion_matrix(y_test, y_pred_dt, labels=['Bajo', 'Medio', 'Alto'])
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=['Bajo', 'Medio', 'Alto'])

# Wrap each matrix in a display object labelled with that class order.
disp_dt = ConfusionMatrixDisplay(cm_dt, display_labels=['Bajo', 'Medio', 'Alto'])
disp_rf = ConfusionMatrixDisplay(cm_rf, display_labels=['Bajo', 'Medio', 'Alto'])

# Draw each display on its own axes and then title it.
disp_dt.plot(ax=axes[0], cmap='Blues', colorbar=False)
disp_rf.plot(ax=axes[1], cmap='Greens', colorbar=False)
axes[0].set_title('Matriz de Confusión - Decision Tree')
axes[1].set_title('Matriz de Confusión - Random Forest')

plt.tight_layout()
plt.show()

# Print accuracy and the weighted F1, precision and recall of each model.
print("=== Métricas de Clasificación ===")
for name, y_pred in zip(("Decision Tree", "Random Forest"), (y_pred_dt, y_pred_rf)):
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    # One print call with a newline separator emits the same five lines.
    print(
        f"\n{name}:",
        f"  Accuracy: {acc:.4f}",
        f"  F1-score: {f1:.4f}",
        f"  Precision: {prec:.4f}",
        f"  Recall: {rec:.4f}",
        sep="\n",
    )

# Grouped bar chart comparing the two models on four metrics.
metrics = ['Accuracy', 'F1-score', 'Precision', 'Recall']

# Values in the same order as `metrics`: accuracy first, then the three
# weighted-average scores.
dt_vals = [accuracy_score(y_test, y_pred_dt)] + [
    scorer(y_test, y_pred_dt, average='weighted')
    for scorer in (f1_score, precision_score, recall_score)
]
rf_vals = [accuracy_score(y_test, y_pred_rf)] + [
    scorer(y_test, y_pred_rf, average='weighted')
    for scorer in (f1_score, precision_score, recall_score)
]

# One slot per metric; each model's bar is shifted half a bar width away from
# the slot centre.
width = 0.35
x = np.arange(len(metrics))

plt.figure(figsize=(10, 6))
plt.bar(x - width/2, dt_vals, width, label='Decision Tree', color='lightsteelblue', edgecolor='black')
plt.bar(x + width/2, rf_vals, width, label='Random Forest', color='lightgreen', edgecolor='black')
plt.title('Comparación de Métricas de Clasificación')
plt.ylabel('Puntuación')
plt.xticks(x, metrics)
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend()
plt.show()

# Feature importances (Random Forest only), largest first.
importances = ST_vs_MW_RF.feature_importances_
# Show at most 10 features, but never more than the model actually has:
# a fixed range(10) would not match the number of bars when there are fewer
# than 10 features.
top_n = min(10, len(importances))
indices = np.argsort(importances)[::-1][:top_n]

plt.figure(figsize=(8, 6))
plt.barh(range(top_n), importances[indices], color='seagreen', edgecolor='black')
plt.yticks(range(top_n), [X.columns[i] for i in indices])
# Invert the axis so the most important feature is drawn at the top.
plt.gca().invert_yaxis()
plt.title(f'Top {top_n}: Importancia de Características (Random Forest)')
plt.xlabel('Importancia')
plt.tight_layout()
plt.show()