00  -   INICIO - Limpieza y Seteado inicial

In [1]:
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np
# Gráficos
# ==============================================================================
import seaborn as sns
import matplotlib.pyplot as plt
# Gráfico de distribución para cada variable numérica
import matplotlib.ticker as ticker
# ==============================================================================
# Identificador de tipos de Distribucion
# ==============================================================================
from fitter import Fitter, get_common_distributions
# ==============================================================================
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this exact file location exists.
url = "C:/Users/and14/Desktop/Proyectos/Programacion/Python/2022/Practica_20220831_ML/00_Data_Sets/SaratogaHouses.csv"

# Load the CSV and, in a single chain:
#   1. rename the columns (by position) to more descriptive names,
#   2. replace every missing value with 0,
#   3. store the `chimenea` column as strings.
datos = (
    pd.read_csv(url, sep=",")
    .set_axis(
        [
            "precio", "metros_totales", "antiguedad", "precio_terreno",
            "metros_habitables", "universitarios", "dormitorios",
            "chimenea", "banyos", "habitaciones", "calefaccion",
            "consumo_calefacion", "desague", "vistas_lago",
            "nueva_construccion", "aire_acondicionado",
        ],
        axis="columns",
    )
    .fillna(0)
    .astype({"chimenea": "str"})
)

def tidy_corr_matrix(corr_mat):
    """
    Convert a pandas correlation matrix into tidy (long) format.

    Parameters
    ----------
    corr_mat : pandas.DataFrame
        Correlation matrix whose index and columns are variable names.

    Returns
    -------
    pandas.DataFrame
        One row per ordered pair of different variables, with columns
        ``variable_1``, ``variable_2``, ``r`` and ``abs_r`` (absolute value
        of ``r``), sorted by ``abs_r`` in descending order.
    """
    pares = corr_mat.stack().reset_index()
    pares.columns = ['variable_1', 'variable_2', 'r']

    # Discard the pairs formed by a variable with itself.
    distintos = pares['variable_1'] != pares['variable_2']
    tidy = pares[distintos].assign(abs_r=lambda tabla: np.abs(tabla['r']))

    return tidy.sort_values('abs_r', ascending=False)
    
# Pearson correlation between the float64 / integer columns of `datos`.
corr_matrix = (
    datos
    .select_dtypes(include=['float64', 'int'])
    .corr(method='pearson')
)

# Group the `chimenea` levels '2', '3' and '4' under a single "2_mas" label.
# Values without an entry in the mapping come back as NaN from `map`, so they
# are restored from the original column.
dic_replace = dict.fromkeys(['2', '3', '4'], "2_mas")
datos['chimenea'] = (
    datos['chimenea']
    .map(dic_replace)
    .fillna(datos['chimenea'])
)

01  -   Regresión Lineal

01.01       Llamada de librería
            En resumen:
            Regresión Lineal:
                Es un algoritmo de regresión, por lo que se utiliza para predecir un valor numérico.
            Regresión Logística:
                Es un algoritmo de clasificación, por lo que se utiliza para predecir una categoría (típicamente entre dos opciones).
            Como la variable objetivo de este cuaderno (precio) es numérica, el caso corresponde a un problema de regresión.

In [2]:
# NOTE(review): LogisticRegression is a classifier, while the target used in this
# notebook is `precio` (an int64 price column). Each distinct price is therefore
# handled as a separate class, which is consistent with the ~0.01 accuracy values
# shown in the score cells. The section is titled "Regresión Lineal"; confirm
# whether sklearn.linear_model.LinearRegression (evaluated with regression
# metrics instead of accuracy / confusion matrix) was the intended model.
from sklearn.linear_model import LogisticRegression

# Estimator created with scikit-learn's default hyper-parameters.
modelLR = LogisticRegression()



01.02       Seteado de Datos

In [3]:
# Encode the air-conditioning flag as 1 / 0. Any value without an entry in the
# mapping becomes NaN after `map` and is restored from the original column.
dic_replace = dict(Yes=1, No=0)
datos['aire_acondicionado'] = (
    datos['aire_acondicionado']
    .map(dic_replace)
    .fillna(datos['aire_acondicionado'])
)

# Merge the 'none' sewer category into 'septic'; other categories are kept.
dic_replace = dict(none='septic')
datos['desague'] = (
    datos['desague']
    .map(dic_replace)
    .fillna(datos['desague'])
)

# Numeric columns only (this still includes `precio`) and the target as a
# single-column DataFrame.
datos_xx = datos.select_dtypes(include=['float64', 'int'])
datos_yy = datos.loc[:, ['precio']]

In [4]:
# Frequency of each sewer category after the recoding.
datos['desague'].value_counts()

public/commercial    1213
septic                515
Name: desague, dtype: int64

In [5]:
# Print column names, non-null counts and dtypes of the numeric frame (which
# still contains `precio`) and of the single-column target frame.
datos_xx.info()
datos_yy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   precio              1728 non-null   int64  
 1   metros_totales      1728 non-null   float64
 2   antiguedad          1728 non-null   int64  
 3   precio_terreno      1728 non-null   int64  
 4   metros_habitables   1728 non-null   int64  
 5   universitarios      1728 non-null   int64  
 6   dormitorios         1728 non-null   int64  
 7   banyos              1728 non-null   float64
 8   habitaciones        1728 non-null   int64  
 9   aire_acondicionado  1728 non-null   int64  
dtypes: float64(2), int64(8)
memory usage: 135.1 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   precio  1728 non-null   int64
dtypes: int64(1)
memory usage: 13.6 KB


01.03       Reparto de data - test y train
            random_state    :
                Fija la semilla del generador de números aleatorios, lo que permite reproducir el reparto: los registros asignados a train y a test serán los mismos en cada ejecución.
            shuffle         : 
                (True por defecto) Especifica si los registros se desordenan antes de realizar el reparto.

In [18]:
# Split the data into train and test sets
# ==============================================================================
from sklearn.model_selection import train_test_split

seed = 1234        # fixed seed so the same rows land in train/test on every run
trainSize = 0.80   # fraction of the rows assigned to the training set

X_train, X_test, y_train, y_test = train_test_split(
    datos_xx.drop(columns='precio'),   # predictors: numeric columns minus the target
    datos_yy,
    train_size=trainSize,
    random_state=seed,
    shuffle=True,
)

In [7]:
# Inspect the predictor frames produced by the split (columns, row counts, dtypes).
X_train.info()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1382 entries, 1571 to 815
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   metros_totales      1382 non-null   float64
 1   antiguedad          1382 non-null   int64  
 2   precio_terreno      1382 non-null   int64  
 3   metros_habitables   1382 non-null   int64  
 4   universitarios      1382 non-null   int64  
 5   dormitorios         1382 non-null   int64  
 6   banyos              1382 non-null   float64
 7   habitaciones        1382 non-null   int64  
 8   aire_acondicionado  1382 non-null   int64  
dtypes: float64(2), int64(7)
memory usage: 108.0 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 346 entries, 903 to 1425
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   metros_totales      346 non-null    float64
 1   antiguedad          346 non-null    int64 

In [8]:
# Inspect the target frames produced by the split; each holds the single
# `precio` column.
y_train.info()
y_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1382 entries, 1571 to 815
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   precio  1382 non-null   int64
dtypes: int64(1)
memory usage: 21.6 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 346 entries, 903 to 1425
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   precio  346 non-null    int64
dtypes: int64(1)
memory usage: 5.4 KB


01.04       Entrenamiento del Modelo

In [19]:
# Fit the model on the training split. scikit-learn expects a 1-D target, and
# y_train is a single-column DataFrame, so it is flattened first; this removes
# the DataConversionWarning ("y = column_or_1d(y, warn=True)") without changing
# the fitted values.
modelLR.fit(X_train, y_train.values.ravel())

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


01.05       Predicción de los datos de testeo

In [20]:
# Predictions of the fitted model for the held-out test predictors.
targetPredic= modelLR.predict(X_test)

01.06       Muestra de resultados score modelos

In [12]:
# Score of the fitted model on the training split (for scikit-learn
# classifiers, `score` is the mean accuracy).
modelLR.score(X_train,y_train)

0.01447178002894356

In [16]:
# Score of the fitted model on the test split; the value shown equals the
# accuracy_score result printed in a later cell.
modelLR.score(X_test,y_test)

0.005780346820809248

In [24]:
from sklearn import model_selection

# 10-fold cross-validation of the model on the training split, scored by accuracy.
name = 'Logistic Regression'
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
cv_results = model_selection.cross_val_score(
    modelLR,
    X_train,
    # 1-D target: passing the single-column DataFrame triggers a
    # DataConversionWarning on every fold.
    y_train.values.ravel(),
    cv=kfold,
    scoring='accuracy',
)

# Summary line: "<name>: <mean accuracy> (<standard deviation>)".
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

In [25]:
# Show the cross-validation summary ("name: mean (std)") built in the previous cell.
print(msg)

Logistic Regression: 0.010135 (0.003559)


In [27]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out test split.
predictions = modelLR.predict(X_test)
# accuracy_score is declared as (y_true, y_pred): ground truth goes first.
# The printed value is unchanged because exact-match accuracy is symmetric.
print(accuracy_score(y_test, predictions))

0.005780346820809248


In [31]:
from sklearn.metrics import confusion_matrix

# confusion_matrix is declared as (y_true, y_pred): rows are the true labels and
# columns the predicted labels. Passing the predictions first yields the
# transposed matrix.
array = confusion_matrix(y_test, targetPredic)
df_cm = pd.DataFrame(array)