In [None]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.stats.diagnostic as diag
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score ,precision_score,recall_score,f1_score
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler

import warnings

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# `np.warnings` was an accidental alias removed in NumPy 1.24, so the standard
# library `warnings` module is used directly. VisibleDeprecationWarning lives in
# `np.exceptions` on newer NumPy and in the top-level namespace on older
# releases; look it up in whichever place exists.
_visible_deprecation = getattr(
    getattr(np, 'exceptions', np), 'VisibleDeprecationWarning', DeprecationWarning
)
warnings.filterwarnings('ignore', category=_visible_deprecation)

# Análisis Exploratorio

In [None]:
# Load the train/test sets and the variable catalogue, then split the
# variable names by their declared classification.
train_data = pd.read_csv('./data/train.csv', encoding="ISO-8859-1")
test_data = pd.read_csv('./data/test.csv', encoding="ISO-8859-1")
variables = pd.read_csv('./data/variables.txt', encoding="ISO-8859-1")

is_quantitative = variables['Clasification'] == 'Cuantitativa'
is_qualitative = variables['Clasification'] == 'Cualitativa'

quant_vars = variables.loc[is_quantitative, 'Variable'].tolist()
# The first qualitative entry is skipped; the reason is not visible in this
# notebook -- TODO confirm which variable it is and why it is excluded.
quali_vars = variables.loc[is_qualitative, 'Variable'].tolist()[1:]

### Analizando las variables numéricas

In [None]:
# Summary statistics for the quantitative columns.
train_data.loc[:, quant_vars].describe()

In [None]:
for var in quant_vars:
    # Drop missing values so the moment statistics below are not NaN.
    # (Named `values` rather than `data` so it does not clash with the feature
    # matrix of the same name built later in the notebook.)
    values = train_data[var].dropna()

    # Histogram with KDE, titled so each figure can be matched to its variable.
    grid = sns.displot(values, kde=True)
    grid.set(title=var)
    # Render now so the figure appears next to its statistics instead of all
    # figures being emitted together at the end of the cell.
    plt.show()

    # Normality indicators: excess kurtosis and skewness.
    print('\033[1m' + var + '\033[0m' + ': Kurtosis:', stats.kurtosis(values), 'Skewness:', stats.skew(values), '\n')

### Analizando las variables categóricas

In [None]:
for var in quali_vars:
    # One wide bar chart per categorical variable showing the count of each level.
    fig, ax = plt.subplots(figsize=(20, 5))
    train_data[var].value_counts().plot(kind='bar', ax=ax)
    # Label the figure so it is identifiable when the output is skimmed.
    ax.set(title=var, ylabel='Count')
    plt.show()

### Analizando la variable de interés

In [None]:
# Skewness and kurtosis of the target, followed by a percentile summary.
sale_price = train_data['SalePrice']
print('Skewness: %f' % sale_price.skew())
print('Kurtosis: %f' % sale_price.kurt())
print('\n---Describe---')
percentiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.65, 0.7, 0.8, 0.9, 0.95]
sale_price.describe(percentiles)

In [None]:
# Normality tests on the observed (non-missing) sale prices.
# A 1-D Series is passed because both tests expect a one-dimensional sample.
sale_price_obs = train_data["SalePrice"].dropna()

# Shapiro-Wilk test. The label is "Shapiro-Wilk" because `stats.shapiro` is
# not a Kolmogorov-Smirnov test.
stat, p = stats.shapiro(sale_price_obs)
print('Shapiro-Wilk:\nW=%f\np=%f\n' % (stat, p))

# Lilliefors test (Kolmogorov-Smirnov with estimated parameters).
ks_statistic, p_value = diag.lilliefors(sale_price_obs)
print('Lilliefors:\nks=%f\np=%f' % (ks_statistic, p_value))

In [None]:
# Histogram with KDE overlay of the target variable.
sns.displot(data=train_data, x='SalePrice', kde=True)

### Correlación

In [None]:
k = 10  # number of variables for heatmap

# Restrict to numeric/boolean columns before correlating: DataFrame.corr()
# raises on string columns in pandas >= 2.0 (older versions dropped them silently).
corrmat = train_data.select_dtypes(include=['number', 'bool']).corr()

# The k columns most positively correlated with SalePrice (SalePrice itself included).
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index

# Reuse the pairwise-complete correlations already computed instead of
# recomputing with np.corrcoef, which yields NaN if any selected column has gaps.
cm = corrmat.loc[cols, cols].values

sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

### Obteniendo la relación entre las variables más significativas

In [None]:
# Pairwise relationships among the variables most correlated with SalePrice.
# SalePrice is continuous, so it is plotted as a regular variable instead of
# being passed as `hue`, which would create one hue level per distinct price.
sns.pairplot(train_data[cols])
plt.show()

# Analizando los datos

In [None]:
def categorize(row, low_max=179280, medium_max=326100):
    """Map a row's ``SalePrice`` to a price category.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a numeric value under the key ``'SalePrice'``.
    low_max : number, optional
        Inclusive upper bound of the ``'Low'`` category.
    medium_max : number, optional
        Exclusive upper bound of the ``'Medium'`` category.

    Returns
    -------
    str or None
        ``'Low'`` for ``0 < price <= low_max``, ``'Medium'`` for
        ``low_max < price < medium_max``, ``'Expensive'`` for
        ``price >= medium_max``, and ``None`` when the price is missing (NaN)
        or not positive, so invalid prices are not labelled ``'Expensive'``.
    """
    price = row['SalePrice']
    # `not price > 0` is True for zero, negatives and NaN (every comparison
    # with NaN is False), so all invalid prices are rejected by one guard.
    if not price > 0:
        return None
    if price <= low_max:
        return 'Low'
    if price < medium_max:
        return 'Medium'
    return 'Expensive'
# Label every house with its price category (adds a column to train_data).
train_data['SalesCategories'] = train_data.apply(categorize, axis=1)

# Global variables
# `random.seed()` returns None, so the seed value is stored first and then
# applied. NumPy's global generator is seeded as well.
seed = 123
random.seed(seed)
np.random.seed(seed)

# Preprocesamiento

In [None]:
# Work on a zero-imputed copy so train_data itself is left untouched.
copied_train_data = train_data.copy().fillna(0)

# Separate the categorical label from the frame, then keep only the
# quantitative columns as model features.
target = copied_train_data.pop('SalesCategories')
data = copied_train_data.loc[:, quant_vars]

In [None]:
# Qualitative columns of the imputed copy.
categoricas = copied_train_data.loc[:, quali_vars]

70% de entrenamiento y 30% de prueba

In [None]:
# Hold out 30% of the rows for testing; the split is fixed by random_state.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    train_size=0.7,
    random_state=42,
)

## Creando el modelo

In [None]:
# Standardize the features. The scaler is fit on the training split only, so
# no statistics from the held-out rows leak into the model; the test split is
# transformed with the training mean and variance.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Two hidden layers (10 and 8 units) with the default activation.
# random_state pins the weight initialization and shuffling so the reported
# metrics are reproducible across runs.
mlp = MLPClassifier(hidden_layer_sizes=(10, 8), max_iter=1000, random_state=1)
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

#### Resultados esperados

In [None]:
# Evaluate the classifier on the held-out split.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
# Macro averaging weights every class equally. With micro averaging on a
# single-label multiclass problem, precision, recall and F1 all equal
# accuracy and add no information.
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print('Matriz de confusión para categorizar las casas\n', cm)
print('Accuracy: ', accuracy)
# Report the remaining metrics too; they were computed but never shown.
print('Precision (macro): ', precision)
print('Recall (macro): ', recall)
print('F1 (macro): ', f1)

In [None]:
# Smaller network (8 and 4 units) with logistic activation.
# random_state pins the weight initialization and shuffling so the reported
# metrics are reproducible across runs.
mlp = MLPClassifier(
    hidden_layer_sizes=(8, 4),
    max_iter=1000,
    activation='logistic',
    random_state=1,
)
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

In [None]:
# Evaluate the classifier on the held-out split.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
# Macro averaging weights every class equally. With micro averaging on a
# single-label multiclass problem, precision, recall and F1 all equal
# accuracy and add no information.
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print('Matriz de confusión para categorizar las casas\n', cm)
print('Accuracy: ', accuracy)
# Report the remaining metrics too; they were computed but never shown.
print('Precision (macro): ', precision)
print('Recall (macro): ', recall)
print('F1 (macro): ', f1)

# Preprocesamiento SalePrice

In [None]:
# Start again from a fresh zero-imputed copy of the training data.
copied_train_data = train_data.copy().fillna(0)

# The regression target is the raw sale price, removed from the frame.
target = copied_train_data.pop('SalePrice')

In [None]:
# Quantitative columns are the model features; qualitative ones are kept aside.
data = copied_train_data.loc[:, quant_vars]
categoricas = copied_train_data.loc[:, quali_vars]

70% de entrenamiento y 30% de prueba

In [None]:
# Hold out 30% of the rows for testing; the split is fixed by random_state.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    train_size=0.7,
    random_state=42,
)

# MLPs are sensitive to feature scale, so standardize the features here as the
# classification section does. The scaler is fit on the training split only
# to avoid leaking test-set statistics.
regression_scaler = StandardScaler()
X_train = regression_scaler.fit_transform(X_train)
X_test = regression_scaler.transform(X_test)

## Creando el modelo

In [None]:
# Default-architecture MLP regressor on the sale price.
mlp = MLPRegressor(random_state=1, max_iter=500)
mlp.fit(X_train, y_train)
# Keep the predictions instead of discarding them; assigning the result also
# suppresses the cell output without needing a dummy print.
y_pred = mlp.predict(X_test)

#### Resultados esperados

In [None]:
# Score of the fitted regressor on the held-out split.
mlp.score(X=X_test, y=y_test)

In [None]:
# Small network (3 and 4 units) with identity activation.
mlp = MLPRegressor(
    random_state=1,
    max_iter=500,
    hidden_layer_sizes=(3, 4),
    activation='identity',
)
mlp.fit(X_train, y_train)
# Keep the predictions instead of discarding them; assigning the result also
# suppresses the cell output without needing a dummy print.
y_pred = mlp.predict(X_test)

In [None]:
# Score of the fitted regressor on the held-out split.
mlp.score(X=X_test, y=y_test)