In [4]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.stats.diagnostic as diag
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold, KFold
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
from scipy.stats import normaltest
from yellowbrick.regressor import ResidualsPlot
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score ,precision_score,recall_score,f1_score

np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)

In [5]:
# Global Variables
seed = random.seed(123)
number_clusters = 3

# Analisis Exploratorio

In [6]:
# Variable classification
train_data = pd.read_csv('./data/train.csv', encoding = "ISO-8859-1")
test_data = pd.read_csv('./data/test.csv', encoding = "ISO-8859-1")
variables = pd.read_csv('./data/variables.txt', encoding = "ISO-8859-1")
quant_vars = list(variables.loc[(variables['Clasification'] == 'Cuantitativa')]['Variable'].values)
quali_vars = list(variables.loc[(variables['Clasification'] == 'Cualitativa')]['Variable'].values)[1:]

### Analizando las variables numericas

In [None]:
train_data[quant_vars].describe()

In [None]:
for var in quant_vars:
    data = train_data[var].dropna(how='all', axis=0)
    
    # Gráfico
    sns.displot(data, kde=True)

    # Mostrando normalidad
    print('\033[1m' + var + '\033[0m' + ': Kurtosis:', stats.kurtosis(data), 'Skewness:', stats.skew(data), '\n')

### Analizando las variables categoricas

In [None]:
for var in quali_vars:
  plt.figure(figsize=(20,5))
  train_data[var].value_counts().plot(kind='bar')
  plt.show()

### Analizando la variable de interes

In [None]:
#skewness and kurtosis
print('Skewness: %f' % train_data['SalePrice'].skew())
print('Kurtosis: %f' % train_data['SalePrice'].kurt())
print('\n---Describe---')
train_data['SalePrice'].describe([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.65, 0.7, 0.8, 0.9, 0.95])

In [None]:
stat,p = stats.shapiro(train_data[["SalePrice"]].dropna())
print('Kolmogorov-Smirnov:\np=%f\n'% p)
ks_statistic, p_value = diag.lilliefors(train_data[["SalePrice"]].dropna())
print('Lilliefors:\nks=%f\np=%f'%(ks_statistic,p_value))

In [None]:
sns.displot(train_data['SalePrice'], kde=True)

### Correlacion

In [None]:
k = 10 #number of variables for heatmap
corrmat = train_data.corr()
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(train_data[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

### Obteniendo la relacion entre las variables mas significativas

In [None]:
sns.pairplot(train_data[cols],hue="SalePrice")
plt.show()
# quant_vars

# Analizando data

In [17]:
def categorize(row, option): 
    if (option == 0):
        if row['SalePrice'] > 0 and row['SalePrice'] <= 179280:
            return 1
        return 0
    elif (option == 1):
        if (row['SalePrice'] > 179280 and row['SalePrice'] < 326100):
            return 1
        return 0
    elif (option == 2):
        if (row['SalePrice'] >= 326100):
            return 1
        return 0

train_data['isMedium'] = train_data.apply(lambda row: categorize(row, 1), axis=1)
train_data['isExpensive'] = train_data.apply(lambda row: categorize(row, 2), axis=1)

train_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,isMedium,isExpensive
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,0,2,2008,WD,Normal,208500,1,0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,0,5,2007,WD,Normal,181500,1,0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,0,9,2008,WD,Normal,223500,1,0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,0,2,2006,WD,Abnorml,140000,0,0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,0,12,2008,WD,Normal,250000,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,,,0,8,2007,WD,Normal,175000,0,0
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,MnPrv,,0,2,2010,WD,Normal,210000,1,0
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,GdPrv,Shed,2500,5,2010,WD,Normal,266500,1,0
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,,,0,4,2010,WD,Normal,142125,0,0


In [18]:
# isLow isMedium isExpensive
result = 'isMedium'

In [19]:
copied_train_data = train_data.copy()
copied_train_data = copied_train_data.fillna(0)


y = copied_train_data.pop(result) #La variable respuesta
X = copied_train_data[quant_vars] #El resto de los datos

70% de entrenamiento y 30% prueba

In [20]:
X_train, X_test,y_train, y_test = train_test_split(X, y,test_size=0.3,train_size=0.7, random_state=42)

## Creando el modelo

In [23]:
logReg = LogisticRegression(solver='liblinear')
logReg.fit(X_train,y_train)
y_pred = logReg.predict(X_test)
cm = confusion_matrix(y_test,y_pred)

#### Resultados esperados

In [24]:
accuracy=accuracy_score(y_test,y_pred)
precision =precision_score(y_test, y_pred,average='micro')
recall =  recall_score(y_test, y_pred,average='micro')
f1 = f1_score(y_test,y_pred,average='micro')
print('Matriz de confusión para detectar virginica\n',cm)
print('Accuracy: ',accuracy)

Matriz de confusión para detectar virginica
 [[253  40]
 [ 70  75]]
Accuracy:  0.7488584474885844
