In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [5]:
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [7]:
df["MSZoning"] = le.fit_transform(df["MSZoning"])
df["Street"] = le.fit_transform(df["Street"])
df["Alley"] = le.fit_transform(df["Alley"])
df["LotShape"] = le.fit_transform(df["LotShape"])
df["LandContour"] = le.fit_transform(df["LandContour"])
df["Utilities"] = le.fit_transform(df["Utilities"])
df["LotConfig"] = le.fit_transform(df["LotConfig"])
df["LandSlope"] = df["LandSlope"].map({
    "Gtl": 1,
    "Mod": 2,
    "Sev": 3
})
df["Neighborhood"] = le.fit_transform(df["Neighborhood"])
df["Condition1"] = le.fit_transform(df["Condition1"])
df["Condition2"] = le.fit_transform(df["Condition2"])
df["BldgType"] = le.fit_transform(df["BldgType"])
df["HouseStyle"] = le.fit_transform(df["HouseStyle"])
df["RoofStyle"] = le.fit_transform(df["RoofStyle"])
df["RoofMatl"] = le.fit_transform(df["RoofMatl"])
df["Exterior1st"] = le.fit_transform(df["Exterior1st"])
df["Exterior2nd"] = le.fit_transform(df["Exterior2nd"])
df["MasVnrType"] = le.fit_transform(df["MasVnrType"])

condition_map = {
    "Ex": 5, 
    "Gd": 4, 
    "TA": 3, 
    "Fa": 2, 
    "Po": 1, 
    "NA": 0,
}
df["ExterQual"] = df["ExterQual"].map(condition_map)
df["ExterCond"] = df["ExterCond"].map(condition_map)
df["Foundation"] = le.fit_transform(df["Foundation"])
df["BsmtQual"] = df["BsmtQual"].map(condition_map)
df["BsmtCond"] = df["BsmtCond"].map(condition_map)
df["BsmtExposure"] = df["BsmtExposure"].map({
    "Gd": 4, 
    "Av": 3, 
    "Mn": 2, 
    "No": 1, 
    "NA": 0
})
df["BsmtFinType1"] = df["BsmtFinType1"].map({
    "GLQ": 6, 
    "ALQ": 5, 
    "BLQ": 4, 
    "Rec": 3, 
    "LwQ": 2, 
    "Unf": 1, 
    "NA": 0
})
df["BsmtFinType2"] = df["BsmtFinType2"].map({
    "GLQ": 6, 
    "ALQ": 5, 
    "BLQ": 4, 
    "Rec": 3, 
    "LwQ": 2, 
    "Unf": 1, 
    "NA": 0
})
df["Heating"] = le.fit_transform(df["Heating"])
df["HeatingQC"] = df["HeatingQC"].map(condition_map)
df["CentralAir"] = le.fit_transform(df["CentralAir"])
df["Electrical"] = df["Electrical"].map({
    "SBrkr": 5, 
    "FuseA": 4, 
    "FuseF": 3, 
    "FuseP": 2, 
    "Mix": 1, 
    "NA": 0
})
df["KitchenQual"] = df["KitchenQual"].map(condition_map)
df["Functional"] = le.fit_transform(df["Functional"])
df["FireplaceQu"] = df["FireplaceQu"].map(condition_map)
df["GarageType"] = le.fit_transform(df["GarageType"])
df["GarageFinish"] = df["GarageFinish"].map({
    "Fin": 3, 
    "RFn": 2, 
    "Unf": 1, 
    "NA": 0
})
df["GarageQual"] = df["GarageQual"].map(condition_map)
df["GarageCond"] = df["GarageCond"].map(condition_map)
df["PavedDrive"] = df["PavedDrive"].map({
    "Y": 3, 
    "P": 2, 
    "N": 1,
    "NA": 0
})
df["PoolQC"] = df["PoolQC"].map(condition_map)
df["Fence"] = df["Fence"].map({
    "GdPrv": 4, 
    "MnPrv": 3, 
    "GdWo": 2, 
    "MnWw": 1, 
    "NA": 0
})
df["MiscFeature"] = le.fit_transform(df["MiscFeature"])
df["SaleType"] = le.fit_transform(df["SaleType"])
df["SaleCondition"] = le.fit_transform(df["SaleCondition"])

for col in df.columns:
    if df[col].isna().sum() > 0:
        df[col] = df[col].fillna(0)


In [8]:
price_class_df = df.copy()
lower_limit = np.percentile(price_class_df['SalePrice'], 33)
upper_limit = np.percentile(price_class_df['SalePrice'], 66)

def classify_house(price):
    if price < lower_limit:
        return 'Economical'
    elif price < upper_limit:
        return 'Intermediate'
    else:
        return 'Expensive'

price_class_df['Classification'] = price_class_df['SalePrice'].apply(classify_house)

print(price_class_df[['SalePrice', 'Classification']].head())

   SalePrice Classification
0     208500      Expensive
1     181500   Intermediate
2     223500      Expensive
3     140000   Intermediate
4     250000      Expensive


In [9]:
price_class_df['IsExpensive'] = (price_class_df['Classification'] == 'Expensive').astype(int)
price_class_df['IsIntermediate'] = (price_class_df['Classification'] == 'Intermediate').astype(int)
price_class_df['IsEconomical'] = (price_class_df['Classification'] == 'Economical').astype(int)

In [10]:
print(price_class_df[['Classification', 'IsExpensive', 'IsIntermediate', 'IsEconomical']].head())

  Classification  IsExpensive  IsIntermediate  IsEconomical
0      Expensive            1               0             0
1   Intermediate            0               1             0
2      Expensive            1               0             0
3   Intermediate            0               1             0
4      Expensive            1               0             0


## 2

In [11]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import numpy as np

In [12]:
X = price_class_df.drop(columns=["Id", "SalePrice", "Classification", "IsExpensive", "IsIntermediate", "IsEconomical"])
y = price_class_df["IsExpensive"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
gnb = GaussianNB()

In [15]:
gnb.fit(X_train, y_train)

In [16]:
y_pred = gnb.predict(X_test)

In [17]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = gnb.score(X_test, y_test)

print(f"RMSE: {rmse}")
print(f"R2: {r2}")

RMSE: 0.40119683960568414
R2: 0.839041095890411


## 3

In [18]:
model = LogisticRegression(solver='liblinear', max_iter=1000)

In [19]:
model.fit(X_train, y_train)
scores = cross_val_score(model, X_train, y_train, cv=5)

In [20]:
print("Precisión de validación cruzada en cada pliegue: ", scores)
print("Precisión media de validación cruzada: ", scores.mean())

Precisión de validación cruzada en cada pliegue:  [0.91452991 0.88888889 0.94017094 0.90128755 0.93562232]
Precisión media de validación cruzada:  0.9160999229668757


# 4

In [22]:
# Calculo de VIF para cada variable
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm

# Suponiendo que X_train es tu conjunto de entrenamiento de variables independientes
X_train_const = add_constant(X_train)

vif_data = pd.DataFrame()
vif_data['feature'] = X_train_const.columns
vif_data['VIF'] = [variance_inflation_factor(X_train_const.values, i) for i in range(X_train_const.shape[1])]

print(vif_data)


  vif = 1. / (1. - r_squared_i)


          feature           VIF
0           const  2.599763e+06
1      MSSubClass  5.533222e+00
2        MSZoning  1.400416e+00
3     LotFrontage  1.402632e+00
4         LotArea  1.776529e+00
..            ...           ...
75        MiscVal  1.693587e+00
76         MoSold  1.096128e+00
77         YrSold  1.103613e+00
78       SaleType  1.164254e+00
79  SaleCondition  1.247673e+00

[80 rows x 2 columns]


In [23]:
#Ajuste del modelo de regresión logística y examinación de los p-values de las variables
logit_model = sm.Logit(y_train, X_train_const)
logit_result = logit_model.fit()

print(logit_result.summary())


         Current function value: 0.175716
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:            IsExpensive   No. Observations:                 1168
Model:                          Logit   Df Residuals:                     1088
Method:                           MLE   Df Model:                           79
Date:                Mon, 08 Apr 2024   Pseudo R-squ.:                  0.7271
Time:                        16:35:10   Log-Likelihood:                -205.24
converged:                      False   LL-Null:                       -751.92
Covariance Type:            nonrobust   LLR p-value:                3.161e-178
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const          -278.1675    219.475     -1.267      0.205    -708.330     151.995
MSSubClass       -0.0269      0.014     -1.882      0.06



In [25]:
#Evaluación el ajuste del modelo
from sklearn.metrics import accuracy_score, roc_auc_score

# Hacer predicciones con el conjunto de prueba
y_pred = logit_model.predict(X_train_const)

# Calcular la precisión y AUC
accuracy = accuracy_score(y_test, y_pred.round())
auc = roc_auc_score(y_test, y_pred)

print(f'Precisión: {accuracy}')
print(f'AUC: {auc}')


ValueError: shapes (1168,80) and (292,79) not aligned: 80 (dim 1) != 292 (dim 0)