# Regresión múltiple

Ejemplo de pipeline de preprocesado y regresión.

Objetivo: predecir price que significa precio de vivienda.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../../data/duke-forest-nulls.csv')
df.tail(1)

Unnamed: 0,address,price,bed,bath,area,type,year_built,heating,cooling,parking,lot,hoa,url
97,"2708 Circle Dr, Durham, NC 27705",674500,4,4.0,3766.0,Single Family,1955.0,"Forced air, Electric, Gas",other,0 spaces,0.73,,https://www.zillow.com/homedetails/2708-Circle...


## Valores faltantes con Scikit Learn

Técnicas de imputación de valores faltantes: sklearn.impute

### Ejemplo demo

In [3]:
from sklearn.experimental import enable_iterative_imputer
# CUIDADO: enable_iterative_imputer va antes de IterativeImputer para poder usarlo
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

df = pd.DataFrame({
    'edad': [25, np.nan, 40, 35, 60, np.nan],
    'genero': ['masculino', 'femenino', np.nan, 'masculino', 'masculino', 'femenino'],
    'ingresos': [30000, 70000, np.nan, 80000, np.nan, 40000]
})

# Opción 1 SimpleImputer por media o mediana:
# imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# df['edad'] = imp_mean.fit_transform(df[['edad']])

# Opción 2 KNNImputer: media pero de las filas más cercanas según el algoritmo de KNN
# imp_knn = KNNImputer(n_neighbors=2) # calcula la media de los n_neighbors más cercanos
# df['edad'] = imp_knn.fit_transform(df[['edad']])
# df['ingresos'] = imp_knn.fit_transform(df[['ingresos']])

# Opción 3 SimpleImputer constante: asignamos un valor fijo
# imp_constant = SimpleImputer(strategy='constant', fill_value='other')
# df['genero'] = imp_constant.fit_transform(df[['genero']])

# Opción 4 SimpleImputer con el valor más frecuente (moda) 
# imp_mode = SimpleImputer(strategy='most_frequent')
# df['genero'] = imp_mode.fit_transform(df[['genero']])

# Opción 5 IterativeImputer con una predicción utilizando una regresión
imp_iter = IterativeImputer(random_state=42)
df['ingresos'] = imp_iter.fit_transform(df[['ingresos']])

df.head()

Unnamed: 0,edad,genero,ingresos
0,25.0,masculino,30000.0
1,,femenino,70000.0
2,40.0,,55000.0
3,35.0,masculino,80000.0
4,60.0,masculino,55000.0


### Ejemplo duke forest

In [4]:
df = pd.read_csv('../../data/duke-forest-nulls.csv')

df.isnull().sum()

address        0
price          0
bed            0
bath           0
area           2
type           2
year_built     4
heating        1
cooling        1
parking        0
lot            1
hoa           97
url            0
dtype: int64

In [5]:
df.head(2)

Unnamed: 0,address,price,bed,bath,area,type,year_built,heating,cooling,parking,lot,hoa,url
0,"1 Learned Pl, Durham, NC 27705",1520000,3,4.0,6040.0,Single Family,1972.0,"Other, Gas",central,0 spaces,0.97,,https://www.zillow.com/homedetails/1-Learned-P...
1,"1616 Pinecrest Rd, Durham, NC 27705",1030000,5,4.0,4475.0,Single Family,1969.0,"Forced air, Gas",central,"Carport, Covered",1.38,,https://www.zillow.com/homedetails/1616-Pinecr...


In [6]:
df['type'].value_counts()

Single Family    96
Name: type, dtype: int64

In [7]:
df['parking'].value_counts()

0 spaces                                                     42
Carport, Covered                                             11
Garage - Attached, Covered                                   10
Garage - Attached                                            10
Covered                                                       4
Carport, Garage - Attached, Covered                           3
Off-street, Covered                                           2
Garage, Garage - Detached, Covered                            2
Garage - Attached, Garage - Detached, Covered                 2
Garage, Carport, Covered                                      2
Garage                                                        2
Garage - Detached, Off-street, Covered                        1
Garage, Garage - Detached, Off-street, Covered                1
Garage, Garage - Detached, Off-street, On-street, Covered     1
Off-street                                                    1
Garage, Garage - Detached, Off-street   

In [8]:
# parking se puede procesar con pandas e intentar sacar el número de plazas
df = df.drop(['address', 'hoa', 'url', 'type', 'parking'], axis=1)
df.head(2)

Unnamed: 0,price,bed,bath,area,year_built,heating,cooling,lot
0,1520000,3,4.0,6040.0,1972.0,"Other, Gas",central,0.97
1,1030000,5,4.0,4475.0,1969.0,"Forced air, Gas",central,1.38


In [9]:
df.isnull().sum()

price         0
bed           0
bath          0
area          2
year_built    4
heating       1
cooling       1
lot           1
dtype: int64

In [10]:
numeric_cols  = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(include='object').columns
print(categorical_cols)
print(numeric_cols)

# IterativeImputer a numéricas

df_numeric = imp_iter = IterativeImputer(random_state=42).fit_transform(df[numeric_cols])

# SimpleImputer most_frequent a categóricas
df_categorical = SimpleImputer(strategy='most_frequent').fit_transform(df[categorical_cols])

df.isnull().sum()



Index(['heating', 'cooling'], dtype='object')
Index(['price', 'bed', 'bath', 'area', 'year_built', 'lot'], dtype='object')


price         0
bed           0
bath          0
area          2
year_built    4
heating       1
cooling       1
lot           1
dtype: int64

In [11]:
df.isnull().sum()

price         0
bed           0
bath          0
area          2
year_built    4
heating       1
cooling       1
lot           1
dtype: int64

In [12]:
# categóricos: most_frequent, constant
imp_most_freq = SimpleImputer(missing_values=np.nan, strategy='most_frequent') # moda
df['type'] = imp_most_freq.fit_transform(df[['type']])

KeyError: "None of [Index(['type'], dtype='object')] are in the [columns]"

In [None]:
df.isnull().sum()

In [None]:
df.head()

EDA
pairplot
heatmap
scatterplot

In [None]:
sns.pairplot(df)

In [None]:
plt.figure(figsize=(10, 6))
df_corr = df.corr(numeric_only=True).round(2)
sns.heatmap(df_corr, cmap='viridis', annot=True);

In [None]:

sns.scatterplot(data=df, x='area', y='price', hue='cooling')

In [None]:
sns.scatterplot(data=df, x='area', y='price', hue='bath', palette='coolwarm')

## Codificacion categorico a numerico

In [15]:
from sklearn.preprocessing import OneHotEncoder

# equivalente a get_dummies de pandas

df['heating'].value_counts()

Forced air, Gas                         33
Forced air, Electric, Gas               14
Other                                   12
Other, Gas                               7
Other, Electric, Gas                     6
Heat pump, Gas                           3
Forced air, Heat pump, Gas               3
Forced air, Heat pump, Electric          3
Heat pump, Electric                      2
Forced air, Heat pump, Electric, Gas     2
Forced air, Electric                     2
No Data                                  2
Heat pump, Electric, Gas                 2
Heat pump, Other, Electric, Gas          1
Other, Radiant, Gas                      1
Forced air, Gas, Wood / Pellet           1
Forced air, Other                        1
Forced air                               1
Baseboard, Heat pump, Gas                1
Name: heating, dtype: int64

In [22]:
df['heating_int'] = df['heating'].str.split(',').apply(len)

TypeError: object of type 'float' has no len()

In [23]:
e   qns.scatterplot(data=df, x= 'area', y='price', hue='heatting_int', palette='coolwarm')

ValueError: Could not interpret value `heatting_int` for parameter `hue`

In [28]:
df['heating'].value_counts()

Forced air, Gas                         33
Forced air, Electric, Gas               14
Other                                   12
Other, Gas                               7
Other, Electric, Gas                     6
Heat pump, Gas                           3
Forced air, Heat pump, Gas               3
Forced air, Heat pump, Electric          3
Heat pump, Electric                      2
Forced air, Heat pump, Electric, Gas     2
Forced air, Electric                     2
No Data                                  2
Heat pump, Electric, Gas                 2
Heat pump, Other, Electric, Gas          1
Other, Radiant, Gas                      1
Forced air, Gas, Wood / Pellet           1
Forced air, Other                        1
Forced air                               1
Baseboard, Heat pump, Gas                1
Name: heating, dtype: int64

In [42]:

def classify_heating(heating):

 #  if 'No Data' in heating:
 #       return 'No Data'
 #   elif 'Other' == heating:
 #       return 'Other'
 #   elif ',' in heating:
 #       return 'Mixed'
 #   else:
 #       return 'Other'
    
    return heating.split( ',')[0]
    
df['heating_int'] = df['heating'].apply(classify_heating)
df['heating_int'].value_counts()

AttributeError: 'float' object has no attribute 'split'

In [45]:
sns.scatterplot(data=df, x ='area', y ='precio', hue='classify_heating')

ValueError: Could not interpret value `precio` for parameter `y`

In [46]:
heating_types = ['Force air', 'Gas', 'Electric', 'Other', ' Head Pump', 'No Data', 'Radiant', ' Wood/Pellet', 'Baseboard']


## Preparar Dataframe de resultados

In [None]:

dhuñie