In [2]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.stats.diagnostic as diag
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import random
import sklearn.cluster as cluster
from sklearn.metrics import silhouette_samples, silhouette_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
import sklearn.preprocessing
import pyclustertend 
from sklearn.cluster import Birch
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning) 

In [None]:
# Global variables
SEED = 123
# `random.seed` only covers Python's own generator. Code that draws from
# NumPy's global generator needs its own seed to be reproducible.
random.seed(SEED)
np.random.seed(SEED)
number_clusters = 3

# Análisis Exploratorio

In [None]:
# Load the train/test sets and the variable catalogue, then split the
# variable names by their declared classification.
train_data, test_data, variables = (
    pd.read_csv(path, encoding="ISO-8859-1")
    for path in ('./data/train.csv', './data/test.csv', './data/variables.txt')
)
is_quantitative = variables['Clasification'] == 'Cuantitativa'
is_qualitative = variables['Clasification'] == 'Cualitativa'
quant_vars = variables.loc[is_quantitative, 'Variable'].tolist()
# The first qualitative entry is deliberately left out of the list.
quali_vars = variables.loc[is_qualitative, 'Variable'].tolist()[1:]

### Analizando las variables numéricas

In [None]:
# Summary statistics restricted to the quantitative columns
train_data.loc[:, quant_vars].describe()

In [None]:
# Distribution, kurtosis and skewness of every quantitative variable.
for var in quant_vars:
    values = train_data[var].dropna()

    # Histogram with KDE; the title identifies the variable on the figure itself.
    sns.displot(values, kde=True).set(title=var)
    # Render inside the loop so each figure is emitted together with its
    # statistics rather than after the loop has finished.
    plt.show()

    # Normality indicators
    print('\033[1m' + var + '\033[0m' + ': Kurtosis:', stats.kurtosis(values), 'Skewness:', stats.skew(values), '\n')

### Analizando las variables categóricas

In [None]:
# Frequency bar chart for every qualitative variable.
for var in quali_vars:
    # Explicit axes plus a title, so each chart states which variable it shows.
    fig, ax = plt.subplots(figsize=(20, 5))
    train_data[var].value_counts().plot(kind='bar', ax=ax, title=var)
    plt.show()

 

### Analizando la variable de interés

In [None]:
# Shape statistics of the target, followed by a percentile-rich summary
sale_price = train_data['SalePrice']
skewness = sale_price.skew()
kurtosis = sale_price.kurt()
print('Skewness: %f' % skewness)
print('Kurtosis: %f' % kurtosis)
print('\n---Describe---')
percentiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.65, 0.7, 0.8, 0.9, 0.95]
sale_price.describe(percentiles)

In [None]:
# Normality tests on the target variable (missing values removed first).
sale_price_obs = train_data["SalePrice"].dropna()

# `stats.shapiro` is the Shapiro-Wilk test; the output used to be labelled
# "Kolmogorov-Smirnov", which misattributed the p-value.
stat, p = stats.shapiro(sale_price_obs)
print('Shapiro-Wilk:\np=%f\n' % p)

# Lilliefors test on the same observations.
ks_statistic, p_value = diag.lilliefors(sale_price_obs)
print('Lilliefors:\nks=%f\np=%f' % (ks_statistic, p_value))

In [None]:
# Histogram with KDE of the target variable
sns.displot(data=train_data, x='SalePrice', kde=True)

In [None]:
# Box plot of the target variable to expose outliers
sns.boxplot(data=train_data, x="SalePrice")

In [None]:
k = 10  # number of variables shown in the heatmap

# Correlate numeric columns only: pandas >= 2.0 no longer drops text columns
# silently in DataFrame.corr() and raises instead.
corrmat = train_data.select_dtypes(include='number').corr()

# The k columns most correlated with the target (SalePrice itself included).
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index

# Named `top_corr` rather than `cm`, so the matrix cannot be mistaken for the
# `matplotlib.cm` module that a later cell imports under that same name.
top_corr = np.corrcoef(train_data[cols].values.T)

sns.set(font_scale=1.25)
hm = sns.heatmap(
    top_corr,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

In [None]:
# Pairwise scatter plots between the target and a hand-picked set of predictors
sns.set()
cols = [
    'SalePrice',
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'TotalBsmtSF',
    'FullBath',
    'YearBuilt',
]
sns.pairplot(train_data.loc[:, cols], height=3)
plt.show()

### Missing data

In [None]:
# Count and share of missing values per column, largest first
null_mask = train_data.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head(20)

# Análisis de grupos

In [None]:
# Check that clustering is feasible
# Qualitative side: encode the grouping variable as integer tags
groups = ['OverallQual', 'YearBuilt']
stamp = groups[0]
aux = train_data.groupby(by=stamp)
tags_list = list(np.array(train_data[[stamp]]))
tag_group = list(aux.groups.keys())

# Forward and reverse lookups between each group label and its position
tag_to_number = {tag: position for position, tag in enumerate(tag_group)}
number_to_taga = dict(enumerate(tag_group))

# One integer tag per row, in row order
tags = [tag_to_number[row[0]] for row in tags_list]

# Quantitative side: fill gaps with 0, keep the analysed columns, standardise
columns_analyze = ['SalePrice', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath']
filled_quant = train_data[quant_vars].fillna(0)
cluster_data = filled_quant[columns_analyze]
X_scale = sklearn.preprocessing.scale(cluster_data)

# Hopkins statistic; the second argument is presumably the sampling size
# (here every row) - confirm against the pyclustertend documentation.
n_samples = len(X_scale)
pyclustertend.hopkins(X_scale, n_samples)

In [None]:
numeroClusters = range(1,10)
wcss = []
# Evaluate k = 1..9 (nine candidate cluster counts)
for i in numeroClusters:
    # Fixed seed and an explicit number of restarts, so the curve does not
    # change between runs or between scikit-learn versions with different
    # `n_init` defaults.
    kmeans = cluster.KMeans(n_clusters=i, n_init=10, random_state=123)
    kmeans.fit(X_scale)
    # Within-cluster sum of squares for this k
    wcss.append(kmeans.inertia_)

# Elbow plot
plt.plot(numeroClusters, wcss)
plt.xlabel("Cantidad de clusters")
plt.ylabel("WCSS")
plt.title("Gráfico de Codo")
plt.show()

In [None]:
# Initialise and fit the model
birch_model = Birch(threshold=1.5, n_clusters=number_clusters)
birch_model.fit(X_scale)

# Cluster label assigned to every sample
birch_result = birch_model.predict(X_scale)

# Plot every cluster that was produced instead of a hard-coded three, so the
# figure stays correct when `number_clusters` changes. The first three
# clusters keep their fixed colours; further ones use matplotlib's defaults.
birch_colors = ['pink', 'purple', 'skyblue']
for cluster_id in np.unique(birch_result):
    members = birch_result == cluster_id
    color = birch_colors[cluster_id] if cluster_id < len(birch_colors) else None
    plt.scatter(
        X_scale[members, 0],
        X_scale[members, 1],
        s=100,
        c=color,
        label="Cluster %d" % (cluster_id + 1),
    )
plt.title("Metodo de BIRCH")
plt.xlabel(columns_analyze[0])
plt.ylabel(columns_analyze[1])
plt.legend()
plt.show()

In [None]:
import matplotlib.cm as cm
def make_silhouette(clusterer, n_clusters, label, data=None):
    """Draw a silhouette plot for `clusterer` and print its average score.

    Parameters
    ----------
    clusterer : estimator exposing `fit_predict`
        Refitted on `data` to obtain the cluster labels.
    n_clusters : int
        Number of clusters to draw; labels `0 .. n_clusters - 1` are plotted.
    label : str
        Name shown in the printed message and used as the plot title.
    data : array-like, optional
        Samples to cluster and score. Defaults to the notebook-level
        `X_scale`, which keeps existing three-argument calls working.

    The figure is created but not shown; the caller decides when to render it.
    """
    if data is None:
        data = X_scale

    fig, ax = plt.subplots(figsize=(18, 7))

    ax.set_xlim([-0.1, 1])
    # Leave a 10-unit gap between consecutive cluster bands.
    ax.set_ylim([0, len(data) + (n_clusters + 1) * 10])

    cluster_labels = clusterer.fit_predict(data)

    silhouette_avg = silhouette_score(data, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score of",
        label,
        'is:',
        silhouette_avg,
    )

    # Per-sample values must come from the same data as the average above;
    # scoring the unscaled frame here made the bands disagree with the red line.
    sample_silhouette_values = silhouette_samples(data, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i]
        )

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # Colormap reached through `plt.cm`, so the function does not depend
        # on the notebook-level name `cm`, which another cell rebinds.
        color = plt.cm.nipy_spectral(float(i) / n_clusters)
        ax.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Cluster number drawn at the vertical middle of its band.
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        y_lower = y_upper + 10  # gap before the next band

    ax.set_title(label)
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")

    # Vertical line marking the average silhouette score.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax.set_yticks([])
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

# Silhouette analysis of the fitted BIRCH model
make_silhouette(clusterer=birch_model, n_clusters=number_clusters, label='BIRCH')
plt.show()

In [None]:
# Cross-tabulate cluster labels (rows) against the encoded tags (columns),
# keeping only the rows that correspond to cluster ids.
full_confusion = confusion_matrix(y_true=birch_result, y_pred=tags)
confusion_birch = full_confusion[:number_clusters]

def get_category(confusion_array, label='', tag_names=None):
    """Print, for every cluster row, its counts per tag and the dominant tag.

    Parameters
    ----------
    confusion_array : sequence of rows
        One row per cluster; each row holds the count for every tag.
    label : str
        Text appended to the report header.
    tag_names : sequence, optional
        Tag shown for each column. Defaults to the keys of the notebook-level
        `tag_to_number` mapping, which keeps existing calls working.

    Rows are taken from `confusion_array` itself rather than from the global
    `number_clusters`, so the report always matches the matrix it is given.
    """
    print('\nCONFUSION DE:', label)
    keys = list(tag_to_number.keys()) if tag_names is None else list(tag_names)
    for i, row in enumerate(confusion_array):
        print('\nCLUSTER #', i+1)
        result = list(row)
        # First position holding the largest count.
        index = result.index(max(result))
        for count, key in zip(result, keys):
            print('--> %d pertenece a %s' %(count, key))
        print('Podemos asegurar que es el grupo de: %s con %d' %(keys[index], result[index]))
  
# Report the dominant tag of every BIRCH cluster
get_category(confusion_birch, label='BIRCH - ' + stamp)

# Creación de modelo

<h3>Inciso 4</h3>

In [None]:
# Price tiers by SalePrice: Barato (below the lower cut), Moderado (between
# the cuts), Caro (at or above the upper cut).
# NOTE(review): the cuts look like the 25th and 75th percentiles of SalePrice;
# confirm against the describe() output above.
LOWER_PRICE_CUT = 129975
UPPER_PRICE_CUT = 214000

# Work on a copy: a plain assignment only aliases `cluster_data`, so adding
# the 'Category' column would also alter the frame used for clustering.
modeling_data = cluster_data.copy()
modeling_data['Category'] = 'Caro'
modeling_data.loc[modeling_data['SalePrice'] < UPPER_PRICE_CUT, 'Category'] = 'Moderado'
modeling_data.loc[modeling_data['SalePrice'] < LOWER_PRICE_CUT, 'Category'] = 'Barato'
modeling_data.head()

In [None]:
# Number of houses in each price category
category_groups = modeling_data.groupby(by=["Category"])
category_groups.size()

In [None]:
# Target and predictors for the classifier
y = modeling_data.pop('Category').astype('category')

# 'Category' is computed directly from 'SalePrice', so keeping the price among
# the predictors would hand the answer to the model. `drop` returns a new
# frame, so any other frame that still needs 'SalePrice' is left untouched.
modeling_data = modeling_data.drop(columns=['SalePrice'])
x = modeling_data

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, train_size = 0.7, random_state=13)

# Verificación seed.
print(xtest)

<h3>Inciso 6</h3>

In [None]:
# Decision tree classifier
tree_created = DecisionTreeClassifier(max_depth = 3, random_state = 42)
tree_created = tree_created.fit(xtrain, ytrain)

# Feature names come from the columns the tree was fitted on, and class names
# from the fitted estimator, so the drawing shows the real category labels
# instead of the placeholders '0', '1', '2'.
plt.figure(figsize=(20, 10))
tree.plot_tree(
    tree_created,
    feature_names=list(xtrain.columns),
    class_names=[str(name) for name in tree_created.classes_],
    filled=True,
)
plt.show()

<h3>Inciso 7</h3>

In [None]:
# Target and predictors for the price regression.
# Built without mutating `cluster_data`: the previous alias plus `pop` removed
# 'SalePrice' from the shared frame, so this cell failed when run a second time.
yr = cluster_data['SalePrice']
# 'Category' is dropped as well when present, since it is derived from the target.
xr = cluster_data.drop(columns=['SalePrice', 'Category'], errors='ignore')
regretion_data = xr

In [None]:
# 70/30 hold-out split for the regression data, with its own fixed seed
regression_split_options = dict(test_size=0.3, train_size=0.7, random_state=612)
xtrainr, xtestr, ytrainr, ytestr = train_test_split(xr, yr, **regression_split_options)

# Seed check: the same rows should appear on every run.
print(xtestr)

In [None]:
# Decision tree regression
# SalePrice is continuous, so a regressor is required: a classifier treats
# every distinct price as a separate class and can only return prices it has
# already seen during training.
tree_regretion_created = tree.DecisionTreeRegressor(max_depth = 10, random_state = 45)
tree_regretion_created = tree_regretion_created.fit(xtrainr, ytrainr)

In [None]:
# Predict on the test set, with missing values filled with 0
columns_test_analyze = ['GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath']
test_features = test_data.loc[:, columns_test_analyze]
test_pred = test_features.fillna(0)
price_pred = tree_regretion_created.predict(test_pred)
print(price_pred)