### Extração dos Dados

In [None]:
!unzip "/content/house-prices-advanced-regression-techniques.zip" -d "files"

Archive:  /content/house-prices-advanced-regression-techniques.zip
  inflating: files/data_description.txt  
  inflating: files/sample_submission.csv  
  inflating: files/test.csv          
  inflating: files/train.csv         


### Bibliotecas Usadas

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

### AED

In [3]:
dataset_treino = pd.read_csv("files/train.csv")

In [4]:
dataset_treino = dataset_treino.drop("Id", axis=1)

total_valores_nulos = dataset_treino.isnull().sum()
porcentagem = (total_valores_nulos / dataset_treino.shape[0]) * 100

colunas_extrapolantes = dataset_treino.columns[porcentagem > 10]
dataset_treino = dataset_treino.drop(colunas_extrapolantes, axis=1)

In [5]:
dataset_teste = pd.read_csv("files/test.csv")

In [6]:
dataset_teste2 = dataset_teste.drop("Id", axis=1)

total_valores_nulos = dataset_teste2.isnull().sum()
porcentagem = (total_valores_nulos / dataset_teste2.shape[0]) * 100

colunas_extrapolantes = dataset_teste2.columns[porcentagem > 10]
dataset_teste2 = dataset_teste2.drop(colunas_extrapolantes, axis=1)

### Feature Engineering

In [7]:
# Dataset de Treino

colunas_categoricas = dataset_treino.columns[dataset_treino.dtypes == "object"]

label_encoder = LabelEncoder()

for coluna in colunas_categoricas:
    dataset_treino[coluna] = label_encoder.fit_transform(dataset_treino[coluna])

dataset_treino = dataset_treino.fillna(-1)

In [8]:
# Dataset de Testes

colunas_categoricas = dataset_teste2.columns[dataset_teste2.dtypes == "object"]

label_encoder = LabelEncoder()

for coluna in colunas_categoricas:
    dataset_teste2[coluna] = label_encoder.fit_transform(dataset_teste2[coluna])

dataset_teste2 = dataset_teste2.fillna(-1)

### Aprendizagem Supervisionada

In [9]:
# Treino do Modelo de Regressão Linear
X = dataset_treino.drop("SalePrice", axis=1)
y = dataset_treino["SalePrice"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

modelo_regressao_linear = LinearRegression().fit(X_train, y_train)

previsao = modelo_regressao_linear.predict(X_test)

print(f"MAE: {mean_absolute_error(y_test, previsao):.2f}")
print(f"MSE: {mean_squared_error(y_test, previsao):.2f}")

MAE: 21069.44
MSE: 1126507405.52


In [11]:
# Prevendo para o Dataset de Teste
previsao_teste = modelo_regressao_linear.predict(dataset_teste2)
dataset_teste["SalePrice"] = previsao_teste

### Aprendizagem Não Supervisionada

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

colunas = dataset_teste2.columns

data = pd.get_dummies(dataset_teste2, columns=colunas)

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(scaled_data)

data['Cluster'] = kmeans.labels_

score = silhouette_score(scaled_data, kmeans.labels_)
print(f'Silhouette Score: {score}')

In [16]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
pc_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
pc_df['Cluster'] = kmeans.labels_

In [17]:
from mlxtend.frequent_patterns import apriori, association_rules

binary_data = data.copy()
binary_data = binary_data.applymap(lambda x: 1 if x > 0 else 0)

frequent_itemsets = apriori(binary_data, min_support=0.8, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)

print(rules)



             antecedents                                        consequents  \
0        (LandContour_3)                                         (Street_1)   
1             (Street_1)                                    (LandContour_3)   
2          (Utilities_0)                                         (Street_1)   
3             (Street_1)                                      (Utilities_0)   
4          (LandSlope_0)                                         (Street_1)   
...                  ...                                                ...   
5724775   (GarageCond_4)  (MiscVal_0, Condition2_2, Heating_0, GarageQua...   
5724776     (PoolArea_0)  (MiscVal_0, Condition2_2, Heating_0, GarageQua...   
5724777     (RoofMatl_0)  (MiscVal_0, Condition2_2, Heating_0, GarageQua...   
5724778    (3SsnPorch_0)  (MiscVal_0, Condition2_2, Heating_0, GarageQua...   
5724779       (Street_1)  (MiscVal_0, Condition2_2, Heating_0, GarageQua...   

         antecedent support  consequent support   s

In [18]:
from sklearn.neighbors import LocalOutlierFactor

lof = LocalOutlierFactor(n_neighbors=20)
data['Outlier'] = lof.fit_predict(scaled_data)
outliers = data[data['Outlier'] == -1]
print(outliers)

  and should_run_async(code)


      MSSubClass_20  MSSubClass_30  MSSubClass_40  MSSubClass_45  \
1              True          False          False          False   
5             False          False          False          False   
15            False          False          False          False   
16             True          False          False          False   
17             True          False          False          False   
...             ...            ...            ...            ...   
1441           True          False          False          False   
1442           True          False          False          False   
1443           True          False          False          False   
1444           True          False          False          False   
1458          False          False          False          False   

      MSSubClass_50  MSSubClass_60  MSSubClass_70  MSSubClass_75  \
1             False          False          False          False   
5             False           True          Fal