In [1]:
import numpy as np
import pandas as pd

In [2]:
Data = pd.read_csv('House_Price_Regression.csv')

In [3]:
Data.head()

Unnamed: 0,neighborhood_recode,latitude,longitude,price,no_of_bedrooms,no_of_bathrooms,quality_recode,maid_room_recode,unfurnished_recode,balcony_recode,...,private_garden_recode,private_gym_recode,private_jacuzzi_recode,private_pool_recode,security_recode,shared_gym_recode,shared_pool_recode,shared_spa_recode,view_of_water_recode,size_in_m_2
0,46.0,25.113208,55.138932,2700000,1,2,2.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,100.242337
1,46.0,25.106809,55.151201,2850000,2,2,2.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,146.972546
2,36.0,25.063302,55.137728,1150000,3,5,2.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,181.253753
3,11.0,25.227295,55.341761,2850000,2,3,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187.66406
4,46.0,25.114275,55.139764,1729200,0,1,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,47.101821


## Intervalo de confianza para la proporción

### Intervalo para la variable Balcony y para Quality

In [4]:
import scipy

In [5]:
def CI_Proportion(Variable , alpha=0.05):

    z_alpha_medios = scipy.stats.norm.ppf( 1 - alpha/2 , loc=0, scale=1)

    X_mean = Variable.mean()

    n = len(Variable)

    L1 = X_mean - z_alpha_medios * np.sqrt(X_mean*(1-X_mean)/n)

    L2 = X_mean + z_alpha_medios * np.sqrt(X_mean*(1-X_mean)/n)

    interval = [L1 , L2]

    return interval , X_mean

In [6]:
Balcony = Data.balcony_recode # 1 = true , 0 = false

In [7]:
intervalo , proporcion = CI_Proportion(Variable=Balcony , alpha=0.05)

In [8]:
intervalo

[0.6995155465832295, 0.7398545321569281]

In [9]:
proporcion

0.7196850393700788

In [10]:
Quality_0 = pd.get_dummies(Data.quality_recode , drop_first=False).iloc[:,0] # 1 = quality low , 0 = not low
Quality_1 = pd.get_dummies(Data.quality_recode , drop_first=False).iloc[:,1] # 1 = quality medium , 0 = not medium
Quality_2 = pd.get_dummies(Data.quality_recode , drop_first=False).iloc[:,2] # 1 = quality high , 0 = not high
Quality_3 = pd.get_dummies(Data.quality_recode , drop_first=False).iloc[:,3] # 1 = quality ultra , 0 = not ultra

In [11]:
intervalo , proporcion = CI_Proportion(Variable=Quality_0 , alpha=0.05)

In [12]:
intervalo

[0.05885789175110899, 0.08182452294705374]

In [13]:
proporcion

0.07034120734908136

In [14]:
intervalo , proporcion = CI_Proportion(Variable=Quality_1 , alpha=0.05)

In [15]:
intervalo

[0.26528121372661256, 0.30584739519727194]

In [16]:
proporcion

0.28556430446194225

In [17]:
intervalo , proporcion = CI_Proportion(Variable=Quality_2 , alpha=0.05)

In [18]:
intervalo

[0.5795901905178868, 0.6235594157813258]

In [19]:
proporcion

0.6015748031496063

In [20]:
intervalo , proporcion = CI_Proportion(Variable=Quality_3 , alpha=0.05)

In [21]:
intervalo

[0.03345901015224513, 0.051580359926495026]

In [22]:
proporcion

0.04251968503937008

### Intervalo de confianza para la media con varianza desconocida

In [23]:
def CI_Mean(Variable , alpha=0.05):

    n = len(Variable)

    t_alpha_medios = scipy.stats.t.ppf( 1 - alpha/2 , df=n-1)

    X_mean = Variable.mean()

    X_cuasi_var = Variable.std()**2 
    
    # std() esta definida por defecto como la cuasi-desviacion-tipica

    L1 = X_mean - t_alpha_medios * np.sqrt(X_cuasi_var/n)

    L2 = X_mean + t_alpha_medios * np.sqrt(X_cuasi_var/n)

    interval = [L1 , L2]

    return interval , X_mean

In [24]:
Price = Data.price

In [25]:
intervalo , media = CI_Mean(Variable=Price , alpha=0.05)

In [26]:
intervalo

[1954927.6388481215, 2216732.108133506]

In [27]:
media

2085829.8734908137

### Intervalo de confianza para la varianza

In [28]:
def CI_Variance(Variable , alpha=0.05):

    n = len(Variable)

    chi_alpha_medios = scipy.stats.chi.ppf( 1 - alpha/2 , df=n-1)

    chi_1_alpha_medios = scipy.stats.chi.ppf(alpha/2 , df=n-1)

    X_cuasi_var = Variable.std()**2 

    X_var = ( (n-1)/n )*X_cuasi_var
    
    # std() esta definida por defecto como la cuasi-desviacion-tipica

    L1 =  (n*X_var) / chi_alpha_medios

    L2 = (n*X_var) / chi_1_alpha_medios

    interval = [L1 , L2]

    return interval , X_var 

In [29]:
Price.std()**2

8486734015707.785

In [30]:
intervalo , varianza = CI_Variance(Variable=Price , alpha=0.05)

In [31]:
intervalo

[358920965766270.5, 382466627578853.0]

In [32]:
varianza

8482279037221.85

El intervalo no contiene a la varianza. Esto se debe a que n es demasiado grande. 
Este resultado intervalo es util para n no muy grande.

### Intervalo de confianza para la diferencia de medias 

In [39]:
def CI_Mean_Diference(Variable1 , Variable2 , alpha=0.05):

    X1 = Variable1
    X2 = Variable2

    n1 = len(X1) 
    n2 = len(X2)   

    X1_mean = X1.mean()
    X2_mean = X2.mean()

    X1_cuasi_var = X1.std()**2 
    X2_cuasi_var = X2.std()**2 

    X1_var = ( (n1-1)/n1 )*X1_cuasi_var
    X2_var = ( (n2-1)/n2 )*X2_cuasi_var

    X_var_p = ((n1-1)*X1_var + (n2-1)*X2_var )/(n1+n2-2)

    v = ( X1_var/n1 + X2_var/n2 )**2 / ( (X1_var/n1)**2 / (n1-1)  + (X2_var/n2)**2 / (n2-1)  )

    t_alpha_medios = scipy.stats.chi.ppf( 1 - alpha/2 , df=v)
  

    L1 =  (X1_mean - X2_mean) - t_alpha_medios * np.sqrt(X_var_p * (1/n1 + 1/n2))

    L2 =  (X1_mean - X2_mean) + t_alpha_medios * np.sqrt(X_var_p * (1/n1 + 1/n2))

    interval = [L1 , L2]

    return interval , (X1_mean - X2_mean) 

In [37]:
Price_Qualiti_0 = Data.loc[ Data.quality_recode == 0 , 'price']

Price_Qualiti_3 = Data.loc[ Data.quality_recode == 3 , 'price']

In [41]:
intervalo , diferencia_medias = CI_Mean_Diference(Price_Qualiti_0 , Price_Qualiti_3 , alpha=0.05)

In [42]:
intervalo

[-5255025.492552606, 8793827.719749954]

In [43]:
diferencia_medias

1769401.1135986734

### Intervalo confianza para la diferencia de medias con muestras pareadas

In [49]:
def CI_Mean_Diference_Paired(Variable1 , Variable2 , alpha=0.05):

    X1 = Variable1
    X2 = Variable2

    D = X1-X2

    n = len(D) 

    D_mean = D.mean()
 
    D_cuasi_var = D.std()**2 
 
    D_var = ( (n-1)/n )*D_cuasi_var
 
    t_alpha_medios = scipy.stats.chi.ppf( 1 - alpha/2 , df=n-1)

    L1 = D_mean - t_alpha_medios * np.sqrt((n/(n-1))*D_var/n)

    L2 = D_mean + t_alpha_medios * np.sqrt((n/(n-1))*D_var/n)

    interval = [L1 , L2]

    return interval , D_mean

In [51]:
Nota_Examen_1 = np.array([4,6,6,7.5,6,3,5,6,8,5])

Nota_Examen_2 = np.array([5,7,6,7,8,4,5,6,7,9])

In [52]:
CI_Mean_Diference_Paired(Nota_Examen_1 , Nota_Examen_2 , alpha=0.05)

([-2.632283321526545, 1.1322833215265453], -0.75)

### Intervalo de confianza para el cociente de varianzas

In [55]:
def CI_Variance_Quotient(Variable1, Variable2, alpha=0.05):

    X1 = Variable1
    X2 = Variable2

    n1 = len(X1)
    n2 = len(X2)

    X1_cuasi_var = X1.std()**2 
    X2_cuasi_var = X2.std()**2 

    X1_var = ( (n1-1)/n1 )*X1_cuasi_var
    X2_var = ( (n2-1)/n2 )*X2_cuasi_var

    F_alpha_medios = scipy.stats.f.ppf( 1 - alpha/2 , dfn=n1, dfd=n2 )

    F_1_alpha_medios = scipy.stats.f.ppf( alpha/2 ,  dfn=n1, dfd=n2 )

    L1 =  (X1_var/X2_var)*F_1_alpha_medios

    L2 = (X1_var/X2_var)*F_alpha_medios

    interval = [L1 , L2]

    return interval , (X1_var/X2_var) 

In [57]:
intervalo , cociente_varianzas = CI_Variance_Quotient(Price_Qualiti_0 , Price_Qualiti_3, alpha=0.05)

In [58]:
intervalo

[103.38012135091866, 226.6148066572665]

In [59]:
cociente_varianzas

151.59794710463066

### Intervalo de confianza para la diferencia de proporciones

In [60]:
def CI_Proportion_Diference(Variable1, Variable2, alpha=0.05):

    X1 = Variable1
    X2 = Variable2

    X1_mean = X1.mean()
    X2_mean = X2.mean()

    n1 = len(X1)
    n2 = len(X2)

    z_alpha_medios = scipy.stats.norm.ppf( 1 - alpha/2, loc=0, scale=1)


    L1 =  (X1_mean-X2_mean) - z_alpha_medios*np.sqrt(X1_mean*(1-X1_mean)/n1 + X2_mean*(1-X2_mean)/n2)

    L2 =  (X1_mean-X2_mean) + z_alpha_medios*np.sqrt(X1_mean*(1-X1_mean)/n1 + X2_mean*(1-X2_mean)/n2)

    interval = [L1 , L2]

    return interval , (X1_mean-X2_mean)

In [70]:
intervalo , diferencia_proporciones = CI_Proportion_Diference(Quality_0, Quality_2, alpha=0.05)

In [71]:
intervalo

[-0.5560366143605591, -0.5064305772404909]

In [72]:
diferencia_proporciones

-0.531233595800525