In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import f_oneway
import statsmodels.api as sm
from scipy.stats import shapiro
from scipy.stats import levene
from sklearn.preprocessing import (OrdinalEncoder, OneHotEncoder, Normalizer, StandardScaler)

# Load the training set from `train.csv`, resolved against the working directory.
treino = pd.read_csv(filepath_or_buffer='train.csv')

# Last expression of the cell: displays the summary statistics of `treino`.
treino.describe()

In [None]:
# Print the structural summary of the training frame (columns, non-null counts, dtypes).
treino.info()

In [None]:
# Column names split by dtype: numeric columns vs. object (string-like) columns.
num_cols = list(treino.select_dtypes(include=[np.number]).columns)
cat_cols = list(treino.select_dtypes(include=[object]).columns)

# Missing entries per column, as an absolute count and as a percentage of all rows.
missing_values = treino.isna().sum()
missing_percentage = missing_values / len(treino) * 100

# Print both column lists with the same bullet layout.
for cabecalho, colunas in (
    ("Variáveis Numéricas:\n", num_cols),
    ("\nVariáveis Categóricas:\n", cat_cols),
):
    print(cabecalho)
    for nome in colunas:
        print(f"  - {nome}")

# Only columns that actually have missing entries are reported.
print("\nValores Faltantes:\n")
colunas_incompletas = missing_values[missing_values > 0]
for nome, qtd in colunas_incompletas.items():
    print(f"  - {nome}: {qtd} valores faltantes ({missing_percentage[nome]:.2f}%)")

In [None]:
# Histogram with a KDE overlay for every numeric column, four plots per row.
# The number of rows is derived from the number of columns so the grid never
# overflows, and the figure height keeps the original 3 inches per row.
hist_ncols = 4
hist_nrows = max(1, -(-len(num_cols) // hist_ncols))  # ceiling division

fig = plt.figure(figsize=(20, 3 * hist_nrows))
for i, col in enumerate(num_cols, start=1):
    ax = fig.add_subplot(hist_nrows, hist_ncols, i)
    sns.histplot(treino[col], kde=True, bins=30, color='blue', ax=ax)
    ax.set_title(col)

# Lay the grid out once, after every subplot exists, instead of on each iteration.
fig.tight_layout()
plt.show()

# Descriptive statistics per numeric column, keyed by column name.
summary_numeric = {col: treino[col].describe() for col in num_cols}

for col, stats in summary_numeric.items():
    print(f"\nEstatísticas para coluna '{col}':\n{stats}")

In [None]:
# Frequency plot for every categorical column, two plots per row.
num_cat_cols = len(cat_cols)
# Ceiling of num_cat_cols / 2; at least one row so the figure height is never zero.
num_rows = max(1, (num_cat_cols + 1) // 2)

fig = plt.figure(figsize=(20, num_rows * 6))
for i, col in enumerate(cat_cols, 1):
    ax = fig.add_subplot(num_rows, 2, i)
    sns.histplot(x=col, data=treino, color='blue', ax=ax)
    ax.set_title(f'Distribuição de {col}')
    # Category labels can be long; rotate them so they do not overlap.
    ax.tick_params(axis='x', labelrotation=45)

# Lay the grid out once, after every subplot exists, instead of on each iteration.
fig.tight_layout()
plt.show()

# Descriptive statistics per categorical column, keyed by column name.
summary_categorical = {col: treino[col].describe() for col in cat_cols}

for col, stats in summary_categorical.items():
    print(f"\nEstatísticas para coluna '{col}':\n{stats}")

In [None]:
# Correlation of every numeric column with SalePrice, strongest positive first.
# `correlations` still contains the SalePrice entry itself (always 1.0).
correlations = treino[num_cols].corr()['SalePrice'].sort_values(ascending=False)

# Rank only the actual features: drop the target's self-correlation and any
# undefined (NaN) correlation so both lists hold ten meaningful entries.
feature_correlations = correlations.drop('SalePrice').dropna()

top_correlations = feature_correlations.head(10)
print("Top 10 correlações com SalePrice:\n", top_correlations)

bottom_correlations = feature_correlations.tail(10)
print("\nBottom 10 correlações com SalePrice:\n", bottom_correlations)

In [None]:
# One-way ANOVA of SalePrice across the levels of each categorical column.
# Each result row is (column name, F statistic, p-value).
anova_results = []

for col in cat_cols:
    # Missing values are not a category. `unique()` would include NaN, and
    # `treino[col] == NaN` never matches, which used to give an empty group and
    # silently exclude every column that has missing entries.
    categories = treino[col].dropna().unique()
    groups = [
        treino.loc[treino[col] == category, 'SalePrice'].dropna()
        for category in categories
    ]

    # The test needs at least two levels; as before, a column is only tested
    # when every level has more than one observed price.
    if len(groups) < 2 or any(len(group) <= 1 for group in groups):
        continue

    try:
        anova = f_oneway(*groups)
        anova_results.append((col, anova.statistic, anova.pvalue))
    except Exception as e:
        # Best effort: report the failing column and continue with the rest.
        print(f"Erro ao calcular ANOVA para {col}: {e}")

# Smallest p-value first, i.e. strongest evidence of a difference between levels.
df_anova = pd.DataFrame(
    anova_results, columns=['Variável Categórica', 'F-statistic', 'p-value']
).sort_values(by='p-value', ascending=True)

# NOTE: the printed headings say "correlações", but the rows are ANOVA results.
top_10_correlations = df_anova.head(10)

print("As 10 maiores correlações:")
print(top_10_correlations)

bottom_10_correlations = df_anova.tail(10)

print("\nAs 10 menores correlações:")
print(bottom_10_correlations)

In [None]:
# Pairwise correlation between all numeric columns, drawn as an annotated heatmap.
correlation_matrix = treino[num_cols].corr()

fig, ax = plt.subplots(figsize=(22, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Matriz de Correlação')
plt.show()

In [None]:
# Colour map shared by the SalePrice-coloured scatter plots below.
s_palette = "viridis"

In [None]:
# Bar chart of SalePrice per neighbourhood (seaborn's default aggregate, no error bars).
fig, ax = plt.subplots(figsize=(20, 6))
sns.barplot(
    data=treino,
    x='Neighborhood',
    y='SalePrice',
    palette='coolwarm',
    errorbar=None,
    ax=ax,
)
ax.set(title='SalePrice por Neighbourhood', xlabel='Neighbourhood', ylabel='SalePrice')
plt.show()

In [None]:
# Bar chart of SalePrice for each OverallQual value (no error bars).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=treino,
    x='OverallQual',
    y='SalePrice',
    palette='YlOrBr',
    errorbar=None,
    ax=ax,
)
ax.set(title='SalePrice por OverallQual', xlabel='OverallQual', ylabel='SalePrice')
plt.show()

In [None]:
# Scatter of SalePrice against GrLivArea; points are coloured by SalePrice.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=treino,
    x='GrLivArea',
    y='SalePrice',
    hue='SalePrice',
    palette=s_palette,
    alpha=0.7,
    ax=ax,
)
ax.set(title='SalePrice por GrLivArea', xlabel='GrLivArea', ylabel='SalePrice')
# The colour simply repeats the y axis, so the legend is removed.
ax.legend().remove()
plt.show()

In [None]:
# Bar chart of SalePrice for each GarageCars value (no error bars).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=treino,
    x='GarageCars',
    y='SalePrice',
    palette='Blues',
    errorbar=None,
    ax=ax,
)
ax.set(title='SalePrice por GarageCars', xlabel='GarageCars', ylabel='SalePrice')
plt.show()

In [None]:
# Line plot of SalePrice against GarageArea.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=treino, x='GarageArea', y='SalePrice', color='blue', ax=ax)
ax.set(
    title='SalePrice por GarageArea (Line Plot)',
    xlabel='GarageArea',
    ylabel='SalePrice',
)
plt.show()

In [None]:
# Line plot of SalePrice against TotalBsmtSF.
# The title and x label now use the real column name; they previously read
# "TotalBsmtBF", which is not a column of the frame.
plt.figure(figsize=(10, 6))
sns.lineplot(x='TotalBsmtSF', y='SalePrice', data=treino, color='blue')
plt.title('SalePrice por TotalBsmtSF')
plt.xlabel('TotalBsmtSF')
plt.ylabel('SalePrice')
plt.show()

In [None]:
# Scatter of SalePrice against 1stFlrSF; points are coloured by SalePrice.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=treino,
    x='1stFlrSF',
    y='SalePrice',
    hue='SalePrice',
    palette=s_palette,
    alpha=0.7,
    ax=ax,
)
ax.set(title='SalePrice por 1stFlrSF', xlabel='1stFlrSF', ylabel='SalePrice')
# The colour simply repeats the y axis, so the legend is removed.
ax.legend().remove()
plt.show()

In [None]:
# Bar chart of SalePrice for each FullBath value (no error bars).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=treino,
    x='FullBath',
    y='SalePrice',
    palette='magma',
    errorbar=None,
    ax=ax,
)
ax.set(title='SalePrice por FullBath', xlabel='FullBath', ylabel='SalePrice')
plt.show()

In [None]:
# Bar chart of SalePrice for each TotRmsAbvGrd value (no error bars).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=treino,
    x='TotRmsAbvGrd',
    y='SalePrice',
    palette='Spectral',
    errorbar=None,
    ax=ax,
)
ax.set(title='SalePrice por TotRmsAbvGrd', xlabel='TotRmsAbvGrd', ylabel='SalePrice')
plt.show()

In [None]:
# Scatter of SalePrice against YearBuilt; points are coloured by SalePrice.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=treino,
    x='YearBuilt',
    y='SalePrice',
    hue='SalePrice',
    palette=s_palette,
    alpha=0.7,
    ax=ax,
)
ax.set(title='SalePrice por YearBuilt', xlabel='YearBuilt', ylabel='SalePrice')
# The colour simply repeats the y axis, so the legend is removed.
ax.legend().remove()
plt.show()

In [None]:
# Scatter of SalePrice against YearRemodAdd; points are coloured by SalePrice.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=treino,
    x='YearRemodAdd',
    y='SalePrice',
    hue='SalePrice',
    palette=s_palette,
    alpha=0.7,
    ax=ax,
)
ax.set(title='SalePrice por YearRemodAdd', xlabel='YearRemodAdd', ylabel='SalePrice')
# The colour simply repeats the y axis, so the legend is removed.
ax.legend().remove()
plt.show()

In [None]:
# Categorical columns selected for a closer look against SalePrice.
top_cat_cols = [
    'ExterQual', 'KitchenQual', 'Foundation',
    'SaleCondition', 'SaleType', 'MSZoning', 'HouseStyle',
    'LotShape', 'CentralAir'
]

filtered_data = treino[top_cat_cols + ['SalePrice']]

# The grid is derived from the list length: three plots per row and four inches
# of height per row. The previous fixed 3x4 grid left a lopsided last row for
# nine features and would fail for more than twelve.
grid_cols = 3
grid_rows = max(1, -(-len(top_cat_cols) // grid_cols))  # ceiling division

fig = plt.figure(figsize=(18, 4 * grid_rows))
for i, feature in enumerate(top_cat_cols, 1):
    ax = fig.add_subplot(grid_rows, grid_cols, i)
    sns.barplot(x=feature, y='SalePrice', data=filtered_data, palette='mako', errorbar=None, ax=ax)
    ax.set_title(f'SalePrice por {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('SalePrice')
    ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Mean SalePrice per neighbourhood, most expensive first.
neighborhood_prices = (
    treino.groupby('Neighborhood')['SalePrice']
    .mean()
    .sort_values(ascending=False)
)

# Names of the five neighbourhoods at the top of that ranking.
top_neighborhoods = list(neighborhood_prices.index[:5])
print("Bairros com maior preço médio:")
print(top_neighborhoods)

In [None]:
# Distribution of selected features, restricted to the rows whose neighbourhood
# is in `top_neighborhoods` (computed in an earlier cell).
filtered_data = treino[treino['Neighborhood'].isin(top_neighborhoods)]

variables_of_interest = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd' ,
    'ExterQual', 'KitchenQual', 'Foundation', 'HeatingQC', 'SaleCondition'
]

num_plots = len(variables_of_interest)
# Ceiling division: no spare empty row when the count is a multiple of four.
rows = max(1, -(-num_plots // 4))

fig = plt.figure(figsize=(18, 12))
for i, feature in enumerate(variables_of_interest, 1):
    ax = fig.add_subplot(rows, 4, i)
    valores = filtered_data[feature]
    if pd.api.types.is_numeric_dtype(valores):
        # Numeric feature: density histogram with a KDE overlay.
        sns.histplot(valores, bins=20, kde=True, stat='density', ax=ax)
        ax.set_ylabel('Densidade')
    else:
        # Categorical feature: plain counts per category. A KDE over category
        # positions is meaningless, and the axis is labelled as a count.
        sns.histplot(valores, stat='count', ax=ax)
        ax.set_ylabel('Contagem')
    ax.set_xlabel(feature)
    ax.set_title(f'{feature} - Bairros com Maior Preço Médio')

fig.tight_layout()
plt.show()

In [None]:
# Mean SalePrice per neighbourhood, cheapest first.
neighborhood_prices = (
    treino.groupby('Neighborhood')['SalePrice']
    .mean()
    .sort_values(ascending=True)
)

# Names of the five neighbourhoods at the bottom of the price ranking.
bottom_neighborhoods = list(neighborhood_prices.index[:5])
print("Bairros com menor preço médio:")
print(bottom_neighborhoods)

In [None]:
# Distribution of selected features, restricted to the rows whose neighbourhood
# is in `bottom_neighborhoods` (computed in an earlier cell).
filtered_data = treino[treino['Neighborhood'].isin(bottom_neighborhoods)]

variables_of_interest = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
    'ExterQual', 'KitchenQual', 'Foundation', 'HeatingQC', 'SaleCondition'
]

num_plots = len(variables_of_interest)
# Ceiling division: no spare empty row when the count is a multiple of four.
rows = max(1, -(-num_plots // 4))

fig = plt.figure(figsize=(18, 12))
for i, feature in enumerate(variables_of_interest, 1):
    ax = fig.add_subplot(rows, 4, i)
    valores = filtered_data[feature]
    if pd.api.types.is_numeric_dtype(valores):
        # Numeric feature: density histogram with a KDE overlay.
        sns.histplot(valores, bins=20, kde=True, stat='density', ax=ax)
        ax.set_ylabel('Densidade')
    else:
        # Categorical feature: plain counts per category. The axis shows a
        # count, not a mean SalePrice as the previous label claimed.
        sns.histplot(valores, stat='count', ax=ax)
        ax.set_ylabel('Contagem')
    ax.set_xlabel(feature)
    ax.set_title(f'{feature} - Bairros com Menor Preço Médio')

fig.tight_layout()
plt.show()

In [None]:
# Shapiro-Wilk test on every numeric column, with missing values dropped first.
# Results are keyed by column name.
shapiro_results = {}
for col in num_cols:
    estatistica, valor_p = shapiro(treino[col].dropna())
    shapiro_results[col] = {'Estatística': estatistica, 'Valor p': valor_p}

# One three-line report per column.
for col, result in shapiro_results.items():
    print(
        f"\nTeste de Shapiro-Wilk para '{col}':",
        f"  Estatística: {result['Estatística']}",
        f"  Valor p: {result['Valor p']}",
        sep="\n",
    )

In [None]:
# Levene test of each numeric column against SalePrice (missing values dropped).
# `num_cols` is iterated as-is, so SalePrice is also tested against itself if
# it is one of the numeric columns.
# NOTE(review): this compares the spread of different variables rather than of
# one variable across groups - confirm that this is the intended use.
sale_price_obs = treino['SalePrice'].dropna()

levene_results = {}
for col in num_cols:
    estatistica, valor_p = levene(treino[col].dropna(), sale_price_obs)
    levene_results[col] = {'Estatística': estatistica, 'Valor p': valor_p}

# One three-line report per column.
for col, result in levene_results.items():
    print(
        f"\nTeste de Levene para '{col}':",
        f"  Estatística: {result['Estatística']}",
        f"  Valor p: {result['Valor p']}",
        sep="\n",
    )

In [None]:
# 0/1 "is missing" indicator for every numeric column; the SalePrice column is
# then overwritten with the real prices so indicators can be correlated with it.
null_indicatores_num = treino[num_cols].isnull().astype(int)
null_indicatores_num['SalePrice'] = treino['SalePrice']

# A column with no missing values has a constant indicator, so its correlation
# is undefined (NaN). Those entries are dropped; otherwise they pad the top list
# and make up the whole bottom list.
null_correlacao = (
    null_indicatores_num.corr()['SalePrice']
    .dropna()
    .sort_values(ascending=False)
)

# The top list keeps SalePrice's self-correlation (1.0) as its first entry,
# because the plotting cell that follows skips that first entry.
maiores_corr_numer = null_correlacao.head(10)
# The bottom list holds indicator columns only.
menores_corr_numer = null_correlacao.drop('SalePrice', errors='ignore').tail(10)

print("Top 10 maiores correlações entre valores nulos e SalePrice:\n", maiores_corr_numer)
print("\nTop 10 menores correlações entre valores nulos e SalePrice:\n", menores_corr_numer)

In [None]:
# Bar chart of the strongest correlations between numeric missing-value
# indicators and SalePrice. SalePrice's own entry is removed by label instead of
# relying on it being first, and undefined (NaN) correlations are skipped.
corr_nulos_num = maiores_corr_numer.drop('SalePrice', errors='ignore').dropna()

if corr_nulos_num.empty:
    # Nothing to draw; an empty bar plot would raise.
    print("Nenhuma correlação definida entre valores nulos (numéricas) e SalePrice.")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    corr_nulos_num.plot(kind='bar', color='green', ax=ax)
    # The title reports how many bars are actually shown.
    ax.set_title(
        f'Top {len(corr_nulos_num)} maiores correlações entre valores nulos (numéricas) e SalePrice'
    )
    ax.set_ylabel('Correlação')
    ax.set_xlabel('Colunas')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()

In [None]:
# 0/1 "is missing" indicator for every categorical column, plus the real
# SalePrice so the indicators can be correlated with it.
null_indicatores_cat = treino[cat_cols].isnull().astype(int)
null_indicatores_cat['SalePrice'] = treino['SalePrice']

# A column with no missing values has a constant indicator, so its correlation
# is undefined (NaN). Those entries are dropped; otherwise they pad the top list
# and make up the whole bottom list.
null_correlacao_cat = (
    null_indicatores_cat.corr()['SalePrice']
    .dropna()
    .sort_values(ascending=False)
)

# The top list keeps SalePrice's self-correlation (1.0) as its first entry,
# because the plotting cell that follows skips that first entry.
maiores_corr_cat = null_correlacao_cat.head(20)
# The bottom list holds indicator columns only.
menores_corr_cat = null_correlacao_cat.drop('SalePrice', errors='ignore').tail(10)

print("Maiores correlações entre valores nulos (categóricas) e SalePrice:\n", maiores_corr_cat)
print("\nMenores correlações entre valores nulos (categóricas) e SalePrice:\n", menores_corr_cat)

In [None]:
# Bar chart of the strongest correlations between categorical missing-value
# indicators and SalePrice. SalePrice's own entry is removed by label instead of
# relying on it being first, and undefined (NaN) correlations are skipped.
corr_nulos_cat = maiores_corr_cat.drop('SalePrice', errors='ignore').dropna()

if corr_nulos_cat.empty:
    # Nothing to draw; an empty bar plot would raise.
    print("Nenhuma correlação definida entre valores nulos (categóricas) e SalePrice.")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    corr_nulos_cat.plot(kind='bar', color='blue', ax=ax)
    # The title reports how many bars are actually shown (the source list holds
    # up to 20 entries, not 10).
    ax.set_title(
        f'Top {len(corr_nulos_cat)} maiores correlações entre valores nulos (categóricas) e SalePrice'
    )
    ax.set_ylabel('Correlação')
    ax.set_xlabel('Colunas')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()