# Factor de inflación de la varianza (VIF)

In [1]:
# %pylab populates the interactive namespace from numpy and matplotlib
# (as the cell output reports); %matplotlib inline selects the inline backend.
%pylab
%matplotlib inline

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


## Importación de variables

In [2]:
import pandas as pd

# Read the semicolon-separated red-wine dataset and display summary
# statistics for each of its columns.
wine = pd.read_csv('winequality-red.csv', sep=';')
wine.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


## Obtención del valor del VIF

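Vamos a calcular el valor del VIF para todas las variables menos la objetivo. Para esto se realiza una regresión lineal de cada una de las variables frente al resto y se aplica la fórmula del VIF, donde $R_i^2$ es el coeficiente de determinación de la regresión de la variable $i$ frente a las demás: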
$$
    VIF_i = \frac{1}{1 - R_i^2}
$$

In [3]:
from sklearn.linear_model import LinearRegression

# Drop the target variable: the VIF is only computed for the predictors.
features = list(wine.columns)
features.remove('quality')
wine_vif = wine[features].copy()

# Regress each predictor on all the others and apply VIF = 1 / (1 - R^2).
for var in features:
    others = [name for name in features if name != var]

    x = wine_vif[others]
    y = wine_vif[var]

    model = LinearRegression()
    model.fit(x, y)

    vif = 1 / (1 - model.score(x, y))

    # A single pre-formatted string is valid whether `print` is a statement
    # (Python 2) or a function (Python 3).
    print('El valor del VIF para la variable {} es: {}'.format(var, vif))

El valor del VIF para la variable fixed acidity es: 7.76751156571
El valor del VIF para la variable volatile acidity es: 1.78938968223
El valor del VIF para la variable citric acid es: 3.12802212436
El valor del VIF para la variable residual sugar es: 1.70258790815
El valor del VIF para la variable chlorides es: 1.4819323298
El valor del VIF para la variable free sulfur dioxide es: 1.96301914148
El valor del VIF para la variable total sulfur dioxide es: 2.18681278859
El valor del VIF para la variable density es: 6.34376018586
El valor del VIF para la variable pH es: 3.32973162358
El valor del VIF para la variable sulphates es: 1.42943374939
El valor del VIF para la variable alcohol es: 3.03115981988


En los resultados se puede observar que hay dos variables (`fixed acidity` y `density`) que tienen valores por encima de 5. Se ha de eliminar la que tiene el valor más alto y volver a ejecutar el proceso.

In [4]:
# Drop the predictor with the highest VIF in the previous run.
# Filtering (instead of list.remove) keeps this cell safe to re-run after
# `wine_vif` has already lost the column.
features = [name for name in wine_vif.columns if name != 'fixed acidity']
wine_vif = wine_vif[features]

# Recompute the VIF of every remaining predictor against the others.
for var in features:
    others = [name for name in features if name != var]

    x = wine_vif[others]
    y = wine_vif[var]

    model = LinearRegression()
    model.fit(x, y)

    vif = 1 / (1 - model.score(x, y))

    # A single pre-formatted string is valid whether `print` is a statement
    # (Python 2) or a function (Python 3).
    print('El valor del VIF para la variable {} es: {}'.format(var, vif))

El valor del VIF para la variable volatile acidity es: 1.78496308191
El valor del VIF para la variable citric acid es: 2.78055659725
El valor del VIF para la variable residual sugar es: 1.38637548021
El valor del VIF para la variable chlorides es: 1.40123158238
El valor del VIF para la variable free sulfur dioxide es: 1.93920892272
El valor del VIF para la variable total sulfur dioxide es: 2.06939644439
El valor del VIF para la variable density es: 2.43009645365
El valor del VIF para la variable pH es: 1.61077513136
El valor del VIF para la variable sulphates es: 1.39638176518
El valor del VIF para la variable alcohol es: 2.13606729055


Una vez eliminada la variable que presenta el VIF más alto, el valor de este para el resto de las variables cambia. Ahora podemos ver que ninguna de las variables restantes tiene un valor del VIF superior a cinco, por lo que se han eliminado las variables colineales de este conjunto de datos.

## Automatización del proceso de selección

In [5]:
def calculateVIF(data):
    """Compute the variance inflation factor (VIF) of every column of ``data``.

    Each column is regressed on all the remaining columns with
    ``LinearRegression`` and its VIF is ``1 / (1 - R^2)``, where ``R^2`` is
    the score of that regression.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are the candidate predictors.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``'VIF'`` with one column per column of ``data``,
        in the same order.
    """
    features = list(data.columns)
    vif_by_feature = {}

    for y_feature in features:
        x_features = [name for name in features if name != y_feature]

        if not x_features:
            # No other predictor to regress on: nothing can inflate the
            # variance, so the VIF is 1 (equivalent to R^2 = 0).
            vif_by_feature[y_feature] = 1.0
            continue

        x = data[x_features]
        y = data[y_feature]

        model = LinearRegression()
        model.fit(x, y)
        r_squared = model.score(x, y)

        if r_squared >= 1:
            # Perfect collinearity: report an infinite VIF instead of
            # dividing by zero.
            vif_by_feature[y_feature] = float('inf')
        else:
            vif_by_feature[y_feature] = 1 / (1 - r_squared)

    # `columns` pins the original column order regardless of dict ordering.
    return pd.DataFrame(vif_by_feature, index = ['VIF'], columns = features)

# Rebuild the predictor-only frame (every column except the target) and
# display the VIF of each predictor.
features = wine.columns.tolist()
features.remove('quality')
wine_vif = wine[features].copy(deep = True)

calculateVIF(wine_vif)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
VIF,7.767512,1.78939,3.128022,1.702588,1.481932,1.963019,2.186813,6.34376,3.329732,1.429434,3.03116


In [6]:
def selectDataUsingVIF(data, max_VIF = 5):
    """Iteratively drop the column with the highest VIF until none exceeds ``max_VIF``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are the candidate predictors. It is not modified;
        the function works on a deep copy.
    max_VIF : number, optional
        Threshold above which the column with the largest VIF is removed
        (default 5).

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` restricted to the columns that were kept.
    """
    result = data.copy(deep = True)

    # With fewer than two columns there is nothing to regress against, so
    # the elimination stops instead of calling calculateVIF.
    while result.shape[1] > 1:
        # `.values` replaces DataFrame.as_matrix(), which recent pandas
        # versions no longer provide; row 0 is the single 'VIF' row.
        vif_values = calculateVIF(result).values[0]
        worst = vif_values.argmax()

        # Written as `not (... > ...)` so a NaN maximum also ends the loop.
        if not vif_values[worst] > max_VIF:
            break

        # Drop by position so duplicated column labels cannot remove more
        # than the intended column.
        keep = [pos for pos in range(result.shape[1]) if pos != worst]
        result = result.iloc[:, keep]

    return result

# Keep only the predictors that survive the VIF filter, then show their VIFs.
wine_selected = selectDataUsingVIF(wine_vif)
calculateVIF(wine_selected)

Unnamed: 0,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
VIF,1.784963,2.780557,1.386375,1.401232,1.939209,2.069396,2.430096,1.610775,1.396382,2.136067
