In [4]:
import numpy as np
import pandas as pd

In [5]:
Data = pd.read_csv('House_Price_Regression.csv')

In [6]:
Data.head()

Unnamed: 0,neighborhood_recode,latitude,longitude,price,no_of_bedrooms,no_of_bathrooms,quality_recode,maid_room_recode,unfurnished_recode,balcony_recode,...,private_garden_recode,private_gym_recode,private_jacuzzi_recode,private_pool_recode,security_recode,shared_gym_recode,shared_pool_recode,shared_spa_recode,view_of_water_recode,size_in_m_2
0,46.0,25.113208,55.138932,2700000,1,2,2.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,100.242337
1,46.0,25.106809,55.151201,2850000,2,2,2.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,146.972546
2,36.0,25.063302,55.137728,1150000,3,5,2.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,181.253753
3,11.0,25.227295,55.341761,2850000,2,3,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187.66406
4,46.0,25.114275,55.139764,1729200,0,1,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,47.101821


## Intervalo de confianza para la proporción

### Intervalo para la variable Balcony y para Quality

In [7]:
import scipy

In [8]:
def CI_Proportion(Variable , alpha=0.05):

    z_alpha_medios = scipy.stats.norm.ppf( 1 - alpha/2 , loc=0, scale=1)

    X_mean = Variable.mean()

    n = len(Variable)

    L1 = X_mean - z_alpha_medios * np.sqrt(X_mean*(1-X_mean)/n)

    L2 = X_mean + z_alpha_medios * np.sqrt(X_mean*(1-X_mean)/n)

    interval = [L1 , L2]

    return interval , X_mean

In [9]:
Balcony = Data.balcony_recode # 1 = true , 0 = false

In [10]:
intervalo , proporcion = CI_Proportion(Variable=Balcony , alpha=0.05)

In [11]:
intervalo

[0.6995155465832295, 0.7398545321569281]

In [12]:
proporcion

0.7196850393700788

In [13]:
Quality_0 = pd.get_dummies(Data.quality_recode , drop_first=False).iloc[:,0] # 1 = quality low , 0 = not low
Quality_1 = pd.get_dummies(Data.quality_recode , drop_first=False).iloc[:,1] # 1 = quality medium , 0 = not medium
Quality_2 = pd.get_dummies(Data.quality_recode , drop_first=False).iloc[:,2] # 1 = quality high , 0 = not high
Quality_3 = pd.get_dummies(Data.quality_recode , drop_first=False).iloc[:,3] # 1 = quality ultra , 0 = not ultra

In [14]:
intervalo , proporcion = CI_Proportion(Variable=Quality_0 , alpha=0.05)

In [15]:
intervalo

[0.05885789175110899, 0.08182452294705374]

In [16]:
proporcion

0.07034120734908136

In [17]:
intervalo , proporcion = CI_Proportion(Variable=Quality_1 , alpha=0.05)

In [18]:
intervalo

[0.26528121372661256, 0.30584739519727194]

In [19]:
proporcion

0.28556430446194225

In [20]:
intervalo , proporcion = CI_Proportion(Variable=Quality_2 , alpha=0.05)

In [21]:
intervalo

[0.5795901905178868, 0.6235594157813258]

In [22]:
proporcion

0.6015748031496063

In [23]:
intervalo , proporcion = CI_Proportion(Variable=Quality_3 , alpha=0.05)

In [24]:
intervalo

[0.03345901015224513, 0.051580359926495026]

In [25]:
proporcion

0.04251968503937008

### Intervalo de confianza para la media con varianza desconocida

In [26]:
def CI_Mean(Variable , alpha=0.05):

    n = len(Variable)

    t_alpha_medios = scipy.stats.t.ppf( 1 - alpha/2 , df=n-1)

    X_mean = Variable.mean()

    X_cuasi_var = Variable.std()**2 
    
    # std() esta definida por defecto como la cuasi-desviacion-tipica

    L1 = X_mean - t_alpha_medios * np.sqrt(X_cuasi_var/n)

    L2 = X_mean + t_alpha_medios * np.sqrt(X_cuasi_var/n)

    interval = [L1 , L2]

    return interval , X_mean

In [27]:
Price = Data.price

In [28]:
intervalo , media = CI_Mean(Variable=Price , alpha=0.05)

In [29]:
intervalo

[1954927.6388481215, 2216732.108133506]

In [30]:
media

2085829.8734908137

### Intervalo de confianza para la varianza

In [47]:
def CI_Variance(Variable , alpha=0.05):

    n = len(Variable)

    chi_alpha_medios = scipy.stats.chi.ppf( 1 - alpha/2 , df=n-1)

    chi_1_alpha_medios = scipy.stats.chi.ppf(alpha/2 , df=n-1)

    X_cuasi_var = Variable.std()**2 

    X_var = ( (n-1)/n )*X_cuasi_var
    
    # std() esta definida por defecto como la cuasi-desviacion-tipica

    L1 =  (n*X_var) / chi_alpha_medios

    L2 = (n*X_var) / chi_1_alpha_medios

    interval = [L1 , L2]

    return interval , X_var 

In [48]:
Price.std()**2

8486734015707.785

In [49]:
intervalo , varianza = CI_Variance(Variable=Price , alpha=0.05)

In [50]:
intervalo

[358920965766270.5, 382466627578853.0]

In [51]:
varianza

8482279037221.85

El intervalo no contiene a la varianza. Esto se debe a que n es demasiado grande. 
Este resultado intervalo es util para n no muy grande.