In [23]:
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

# Preprocesado y modelado
# ==============================================================================
import scipy.stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
from scipy import stats
from sklearn.linear_model import LinearRegression 
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import StandardScaler


# Matplotlib configuration
# ==============================================================================
plt.rcParams['image.cmap'] = "bwr"
plt.rcParams['figure.dpi'] = "100"
plt.rcParams['savefig.bbox'] = "tight"
# Apply the ggplot style sheet exactly once, after the rcParams above.
plt.style.use('ggplot')

# Warnings configuration
# ==============================================================================
import warnings
# Blanket filter: every warning category is ignored from this point on.
warnings.filterwarnings('ignore')

In [32]:
# Restore `df_var_clean` from IPython's %store persistence.
# NOTE(review): presumably saved by a `%store df_var_clean` in another notebook/session — confirm.
%store -r df_var_clean

In [28]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [29]:

def calculate_vif(df):
    """Tabulate the variance inflation factor (VIF) of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are the variables to analyse; its ``.values``
        matrix is handed to ``variance_inflation_factor``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``"Ratios"`` (the column labels of ``df``) and ``"VIF"``
        (the factor computed for the column at the same position).

    Notes
    -----
    NOTE(review): the matrix is passed as-is, without adding a constant
    column. Confirm against the statsmodels documentation whether the
    auxiliary regressions then lack an intercept, and whether that is intended.
    """
    design = df.values
    factors = []
    for position in range(df.shape[1]):
        factors.append(variance_inflation_factor(design, position))
    return pd.DataFrame({"Ratios": df.columns, "VIF": factors})


In [31]:
# Repeatedly discard the column with the largest VIF until no VIF exceeds 7.
# NOTE(review): `df_var` is not created in the cells visible here (only
# `df_var_clean` is restored) — confirm which cell defines it.
vif = calculate_vif(df_var)
while (vif['VIF'] > 7).any():
    # One-element Series holding the label of the column with the highest VIF.
    remove = vif.sort_values(by='VIF', ascending=False)['Ratios'].iloc[:1]
    # In-place drop: `df_var` itself loses the column on every pass.
    df_var.drop(columns=remove, inplace=True)
    vif = calculate_vif(df_var)
vif

Unnamed: 0,Ratios,VIF
0,IPC(%anual),1.378127
1,Inversion Es(millones€),2.065478
2,IED(millones€),3.684067
3,Saldo Balanza Pagos(millones€),2.543653


In [None]:
# Preview only: the frame without 'Inversion Es(millones€)' is displayed, not stored.
df_var_clean.drop('Inversion Es(millones€)', axis=1)

In [37]:
# Every column of df_var_clean except 'Inversion Es(millones€)'.
# NOTE(review): presumably that column is the response variable — confirm.
X = df_var_clean.drop(columns=['Inversion Es(millones€)'])

In [36]:
from sklearn.linear_model import LinearRegression


def calculateVIF(var_predictoras_df):
    """Compute the variance inflation factor (VIF) of every column.

    Each column is regressed on all the remaining columns with a
    ``LinearRegression`` and its VIF is ``1 / (1 - R^2)``, where ``R^2`` is
    the score of that regression evaluated on the same data.

    Parameters
    ----------
    var_predictoras_df : pandas.DataFrame
        Frame whose columns are the predictors to analyse.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``'VIF'`` with one column per input column,
        in the input column order.

    Notes
    -----
    * A frame with a single column has nothing to be collinear with, so its
      VIF is reported as ``1.0`` without fitting a model (a regression on
      zero features cannot be fitted).
    * When a column is reproduced exactly by the others (``R^2 >= 1``) the
      VIF is reported as ``inf`` instead of dividing by zero.
    """
    var_pred_labels = list(var_predictoras_df.columns)
    vif_values = []

    for y_feature in var_pred_labels:
        # Regressors: every label except (the first occurrence of) the target.
        x_features = var_pred_labels[:]
        x_features.remove(y_feature)

        if not x_features:
            # Lone predictor: R^2 against "no other predictors" is 0, so VIF = 1.
            vif_values.append(1.0)
            continue

        x = var_predictoras_df[x_features]
        y = var_predictoras_df[y_feature]

        r_squared = LinearRegression().fit(x, y).score(x, y)

        if r_squared >= 1:
            # Perfect collinearity: the factor is unbounded.
            vif_values.append(float("inf"))
        else:
            vif_values.append(1 / (1 - r_squared))

    return pd.DataFrame([vif_values], index=['VIF'], columns=var_pred_labels)

In [39]:
# VIF of every predictor in X, transposed so each predictor is a row.
calculateVIF(X).transpose()

Unnamed: 0,VIF
IPC(%anual),8.118633
Desempleo (%),3.800923
IED(millones€),3.849045
Total de reservas(millones€),127.21826
Deuda Externa(millones€),55.552659
Tasa de interés(%),19.903107
Saldo Balanza Pagos(millones€),7.024821
Importaciones(millones€),1487.712844
Exportaciones(millones€),1926.146284
PIB(millones€),36.30871


In [40]:
def selectDataUsingVIF(var_predictoras_df, max_VIF = 5):
    """Drop columns one at a time until no VIF exceeds ``max_VIF``.

    On every pass the VIFs of the remaining columns are recomputed with
    ``calculateVIF`` and the first column holding the largest value is
    removed. The input frame is never modified; the work is done on a deep
    copy.

    Parameters
    ----------
    var_predictoras_df : pandas.DataFrame
        Candidate predictor columns.
    max_VIF : float, default 5
        Largest VIF a retained column may have.

    Returns
    -------
    pandas.DataFrame
        Copy of the input restricted to the retained columns.

    Notes
    -----
    Elimination stops once fewer than two columns remain: a VIF needs at
    least one other column to regress on, so frames with zero or one column
    are returned as-is without calling ``calculateVIF``.
    """
    result = var_predictoras_df.copy(deep = True)

    while result.shape[1] > 1:
        VIF = calculateVIF(result)
        worst = VIF.values.max()

        # Written as a negated `>` so a NaN maximum also ends the loop.
        if not worst > max_VIF:
            break

        # Position of the first column whose VIF equals the maximum.
        col_max = np.where(VIF.values == worst)[1][0]
        features = list(result.columns)
        del features[col_max]
        result = result[features]

    return result

In [41]:
# VIF of the predictors kept by selectDataUsingVIF (default max_VIF), one row per predictor.
calculateVIF(selectDataUsingVIF(X)).transpose()

Unnamed: 0,VIF
IPC(%anual),1.550145
Desempleo (%),1.225013
IED(millones€),2.337653
Deuda Externa(millones€),2.059763
Saldo Balanza Pagos(millones€),1.254622
