In [None]:
import pandas as pd
import numpy as np
import os as os
# Las sigs librerias son para el GDA
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

 ## 1. Ingestión de datos 
- ¿Cuántas variables tenemos?
- ¿Cuántas observaciones tenemos?
- ¿Cuántas observaciones únicas tenemos por variable?
- ¿Cuántas variables numéricas tenemos?
- ¿Cuántas variables de fecha tenemos?
- ¿Cuántas variables categóricas tenemos?
- ¿Cuántas variables de texto tenemos?

In [None]:
# Directory that holds the data file. It can be overridden with the
# CONSUMO_AGUA_DATA_DIR environment variable so the notebook also runs on
# machines other than the author's; the original location stays as fallback.
path_to_datafile = os.environ.get(
    'CONSUMO_AGUA_DATA_DIR',
    '/home/bj/Documents/IntroDataScience/Tareas/eda_hw/Data')
# Name of the data file inside that directory
datafile_name = 'consumo-agua.csv'

In [None]:
def load_df(name_data, path_to_data = None, separator = ','):
    """
    Load a delimited text file into a dataframe.

    param:  name_data is a string with the name of the file we want to load
            path_to_data is a string with the directory where name_data is.
                Default (None) is the working directory at call time
            separator is a string with the field separator. Default is ','
    return: data_frame with the information in the file name_data
    """
    # Resolve the default here and not in the signature: a default of
    # os.getcwd() is evaluated only once, when the function is defined, so it
    # would ignore any later change of working directory.
    if path_to_data is None:
        path_to_data = os.getcwd()

    # os.path.join copes with a trailing separator in path_to_data
    return pd.read_csv(os.path.join(path_to_data, name_data), sep = separator)

In [None]:
consumo_agua = load_df(datafile_name, path_to_datafile, ';')

In [None]:
def df_variables_info(df):
    """
    Print a short overview of a dataframe: how many variables (columns) and
    observations (rows) it has and how many columns there are of each dtype.

    params: df is the dataframe to summarise
    return: None, the information is only printed
    """
    n_observations, n_variables = df.shape
    columns_per_dtype = df.dtypes.value_counts()

    print('Number of variables:', n_variables)
    print('Number of observations:', n_observations)
    print('Type of variables in the dataframe: \n', columns_per_dtype)


In [None]:
df_variables_info(consumo_agua)

In [None]:
df = consumo_agua

# Group the column names by the dtype family pandas assigned to each column
numeric_variables = df.select_dtypes('number').columns.values
categorical_variables = df.select_dtypes('category').columns.values
dates_variables = df.select_dtypes('datetime').columns.values
string_variables = df.select_dtypes('object').columns.values

# Report every group
print('Numeric variables:', numeric_variables)
print('Categorical variables:', categorical_variables)
print('Date/Time variables:', dates_variables)
print('String variables:', string_variables)

In [None]:
# A cell only renders its last expression, so the preview has to be displayed
# explicitly; otherwise the head() result is computed and thrown away.
display(consumo_agua.head())
consumo_agua.columns

## 2. Estandarización de nombres de variables.

- Transformar los nombres de las variables a formato estándar: minúsculas, sin espacios en blanco, sin signos de puntuación.

Transformar los nombres de las variables a formato estándar: minúsculas, sin espacios en blanco, sin signos de puntuación.

In [None]:
def clean_variables_names(data_frame):
    """
    Normalise the column labels of data_frame: surrounding blanks are trimmed,
    letters are lowercased, inner spaces become underscores and the accented
    vowels and the 'ñ' are replaced by their plain letters.

    param: data_frame whose column names are cleaned (it is modified in place)
    return: the same data_frame object, now with clean column names
    """
    # Every substitution swaps one character for one character, so a single
    # translation table applies all of them in one pass.
    char_map = str.maketrans({' ': '_', 'á': 'a', 'é': 'e', 'í': 'i',
                              'ó': 'o', 'ú': 'u', 'ñ': 'n'})
    trimmed_lowercase = data_frame.columns.str.strip().str.lower()
    data_frame.columns = trimmed_lowercase.str.translate(char_map)
    return data_frame

In [None]:
clean_consumo_agua = clean_variables_names(consumo_agua)

In [None]:
clean_consumo_agua.columns

In [None]:
clean_consumo_agua.head()

## 3. Transform data
Tipos, estandarización de texto, imputaciones, eliminación de observaciones o variables
- Agregar la variable latitud y longitud.
- Pasar la variable latitud y longitud a numérica -si no la tomó como numérica-.
- Eliminar la columna geo_point -una vez que creaste la variable latitud y longitud.
- Eliminar la columna geo_shape.
- Cambiar a minúsculas las columnas alcaldía, colonia e indice_des.
- Volver a correr el proceso de identificación de variables numéricas, categóricas, texto y fechas.

Agregar la variable latitud y longitud.
Pasar la variable latitud y longitud a numérica -si no la tomó como numérica-.

In [None]:
def split_string_column_into_numeric_columns(df, column_name, separator, new_cols_name):
    """
    Split a string column on a separator and store every piece as a new
    numeric column of the same dataframe.

    params: df is the dataframe that contains the column we want to split
            column_name is a string with the name of the column to split
            separator is the string on which every value is split
            new_cols_name is a list of strings with the names of the new columns

    returns: the same df (modified in place), which keeps the original column
             and gains the numeric columns named as in new_cols_name
    """
    # expand=True yields one column per piece; they are stored under the new names
    text_pieces = df[column_name].str.split(separator, expand = True)
    df[new_cols_name] = text_pieces

    # The pieces are still text at this point, turn each one into numbers
    for label in new_cols_name:
        df[label] = pd.to_numeric(df[label])

    return df

In [None]:
clean_consumo_agua = split_string_column_into_numeric_columns(clean_consumo_agua, 'geo_point', ',',
                                                              ['latitud', 'longitud'])

In [None]:
clean_consumo_agua.head()

In [None]:
df_variables_info(clean_consumo_agua)

Eliminar la columna geo_point -una vez que creaste la variable latitud y longitud.
Eliminar la columna geo_shape.

In [None]:
# Drop the raw geometry columns. Rebinding the name instead of using
# inplace=True, and ignoring columns that are already gone, lets this cell be
# re-run without raising a KeyError.
clean_consumo_agua = clean_consumo_agua.drop(columns = ['geo_point','geo_shape'], errors = 'ignore')

In [None]:
df_variables_info(clean_consumo_agua)

Cambiar a minúsculas las columnas 'alcaldia', 'colonia' e 'indice_des'.

In [None]:
def clean_variables_values(df, columns_names):
    """
    Clean the text values of the selected columns: trim surrounding blanks,
    lowercase, remove '.', ',' and ';', turn spaces into underscores and
    replace the accented vowels and the 'ñ' by their plain letters.

    param: df is a dataframe where the variables to clean are
           columns_names is a list with the names of the variables to clean
    return: the same df (modified in place) with the selected columns cleaned
    """
    # A translation table treats every character literally. The previous chain
    # of str.replace calls relied on the default of the `regex` argument, which
    # differs between pandas versions, and '.' is a regex metacharacter.
    # Third argument of maketrans: characters to delete.
    char_map = str.maketrans(' áéíóúñ', '_aeioun', '.,;')

    for name in columns_names:
        df[name] = df[name].str.strip().str.lower().str.translate(char_map)

    return df

In [None]:
final_consumo_agua = clean_variables_values(clean_consumo_agua, ['nomgeo','alcaldia', 'colonia', 'indice_des'])

In [None]:
final_consumo_agua.head()

Volver a correr el proceso de identificación de variables numéricas, categóricas, texto y fechas.

In [None]:
df_variables_info(final_consumo_agua)

In [None]:
df = final_consumo_agua

# Group the column names of the transformed frame by dtype family
numeric_variables = df.select_dtypes('number').columns.values
categorical_variables = df.select_dtypes('category').columns.values
dates_variables = df.select_dtypes('datetime').columns.values
string_variables = df.select_dtypes('object').columns.values

# Report every group
print('Numeric variables:', numeric_variables)
print('Categorical variables:', categorical_variables)
print('Date/Time variables:', dates_variables)
print('String variables:', string_variables)

## 4. Analizar data (EDA, GDA)

### Data profiling

#### Para **variables numéricas**:

- Tipo de dato: float, integer
- Número de observaciones
- Mean
- Desviación estándar
- Cuartiles: 25%, 50%, 75%
- Valor máximo
- Valor mínimo
- Número de observaciones únicos
- Top 5 observaciones repetidas
- Número de observaciones con valores faltantes
- ¿Hay redondeos?

In [None]:
def data_profiling_num_vars(df, col_name):
    """
    This function computes basic statistics for the numerical column col_name in the dataframe df

    params: df a dataframe that contains the column col_name
            col_name a string with the name of the numerical variable from which we want to compute the statistics
    returns: dic a dictionary with the name and values of the basic statistics
    """
    column = df[col_name]

    # Three most frequent values. The list is padded with '-' up to three
    # entries so the Top1..Top3 keys always exist, also when the column has
    # fewer than three distinct values or no non-null value at all.
    top = column.value_counts().head(3).index.tolist()
    top.extend(['-'] * (3 - len(top)))

    dic = {'n_observations': column.count(),
           'max': column.max(),
           'min': column.min(),
           'mean': column.mean(),
           'std_dev': column.std(),
           '25%': column.quantile(0.25),
           'median': column.median(),
           '75%': column.quantile(0.75),
           'kurtosis': column.kurtosis(),
           'skewness': column.skew(),
           'n_unique_values': column.nunique(),
           '%_missings': column.isna().mean()*100,
           'Top1_most_common': top[0],
           'Top2_most_common': top[1],
           'Top3_most_common': top[2]
          }

    return dic

In [None]:
def descriptive_stats_for_numeric_vars(df, num_vars):
    """
    Build a table with the basic descriptive statistics of several numerical
    variables: one column per variable, one row per statistic.

    param: df is a dataframe with numerical variables
           num_vars is a list with the names of the numerical variables in the df

    return: stats_table is a dataframe with the statistics for each numerical variable in df
    """
    # One profile (dictionary of statistics) per variable, labelled by variable
    profiles = [data_profiling_num_vars(df, var_name) for var_name in num_vars]
    per_variable = pd.DataFrame(profiles, index = num_vars)

    # Flip so variables are the columns and statistics the rows
    return per_variable.transpose()


In [None]:
descriptive_stats_for_numeric_vars(final_consumo_agua, numeric_variables)

In [None]:
def descriptive_stats_for_numeric_vars2(df, num_vars):
    """
    Show basic descriptive statistics for the numerical variables in the dataframe df

    param: df is a dataframe with numerical variables
           num_vars is a list with the names of the numerical variables in the df

    return: stats_table is a dataframe with the statistics for each numerical
            variable in df (one column per variable, one row per statistic)
    """
    numeric = df[num_vars]
    summary = numeric.describe().reindex(['count','max', 'min', 'mean', 'std', '25%', '50%', '75%'])
    shape_stats = numeric.agg(['kurtosis', 'skew', 'nunique'], axis = 0)

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows in
    # every version. These rows are rounded here, while they are still purely
    # numeric: the final round() skips any column that turns into object dtype
    # once a '-' placeholder is added below.
    stats_table = pd.concat([summary, shape_stats])
    stats_table = stats_table.rename(index = {'count': 'n_observations','nunique': 'unique_values', '50%': 'median'})
    stats_table = stats_table.round(5)

    extra_rows = ['%_missings','Top1_most_common','Top2_most_common','Top3_most_common']
    t = pd.DataFrame(columns = num_vars, index = extra_rows)

    for var in num_vars:
        # Percentage of missing values
        prop_missings = round(df[var].isna().mean()*100, 5)
        # Top 3 most common values, padded with '-' up to three entries so the
        # three rows can always be filled (also for 0 or 1 distinct values)
        top = df[var].value_counts().head(3).index.tolist()
        top.extend(['-'] * (3 - len(top)))
        t[var] = [prop_missings] + top

    stats_table = pd.concat([stats_table, t])

    return stats_table.round(5)

### Análisis gráfico para variables numéricas

In [None]:
def plot_missing_values(df):
    """
    Draw a heatmap of the missing values of every variable in df: one column
    per variable and one row per observation, built from the boolean mask
    returned by df.isnull().

    params: df is the dataframe to inspect
    returns: None, the heatmap is drawn on the current figure
    """
    missing_mask = df.isnull()
    sns.heatmap(missing_mask, cbar=False)
    

In [None]:
#plot_missing_values(final_consumo_agua)

Las variables numéricas *consumo_total_mixto* y *consumo_prom_mixto* tienen aproximadamente un 12% de datos faltantes y de la gráfica anterior podemos observar que estos datos faltantes son similares en ambas variables. Lo mismo pasa para las columnas *consumo_total_dom* y *consumo_prom_dom* que tienen un 7% de datos faltantes y dichos datos faltantes se encuentran en las mismas observaciones.

In [None]:
def plot_histogram(df, num_vars):
    """
    Plot, show and save one histogram per variable in num_vars. Every figure
    is written to "Histograma <variable>.png" in the working directory.

    params: df a dataframe that has numerical variables
            num_vars the list of the numerical variables in df
    returns: None, the histograms are shown and saved
    """
    for var in num_vars:
        fig, ax = plt.subplots(figsize=(8,5))
        # Missing values are left out; kde=False keeps only the bars
        # NOTE(review): distplot is deprecated in recent seaborn releases
        sns.distplot(df[var].dropna(), kde = False, ax = ax)
        titulo_graf = "Histograma " + var
        ax.set_title(titulo_graf)
        ax.set_ylabel("Frecuencia")
        fig.savefig(titulo_graf + ".png")
        plt.show()

In [None]:
#plot_histogram(final_consumo_agua, numeric_variables)

De los histogramas podemos ver que las variables numéricas tienen muchos valores atípicos. También notamos que la variable *bimestre* no es una variable numérica sino categórica, así como la variable *gid*.

Por otro lado, las variables longitud y latitud están dentro de un rango razonable para las coordenadas de la Ciudad de México. 

Coordenadas geográficas de Ciudad de México, [link][1] en grados decimales:

        Longitud: -99.1276600
        Latitud: 19.4284700

[1]: https://embamex.sre.gob.mx/eua/index.php/es/enterate/391-acerca-de-mexico "SRE"

In [None]:
# Remove the columns that are numeric only by dtype: anio, gid and bimestre.
# Filtering by name (instead of deleting positions 1, 8 and 10) does not depend
# on the column order and gives the same result when the cell is run again.
not_really_numeric = ['anio', 'gid', 'bimestre']
numeric_variables = np.array([name for name in numeric_variables
                              if name not in not_really_numeric], dtype = object)
print(numeric_variables)

In [None]:
def boxplot_all(df, num_vars):
    """
    Draw the boxplots of all the variables in num_vars in a single graph, one
    horizontal box per variable. Only the rows with no missing value in any of
    the selected variables are used.

    params: df a dataframe that has numerical variables
            num_vars the list of the numerical variables in df
    returns: None, the graph with the boxplots is shown
    """
    sns.set(style="whitegrid", palette="Blues")

    # Long format: one row per (variable, value) pair
    complete_rows = df[num_vars].dropna()
    long_format = pd.melt(complete_rows)

    plt.figure(figsize=(10,8))
    sns.boxplot(data=long_format, x="value", y="variable", palette = 'Blues')

    plt.show()
     

In [None]:
boxplot_all( final_consumo_agua, ['consumo_total_mixto', 'consumo_total_dom', 'consumo_total_no_dom'])

In [None]:
#boxplot_all(final_consumo_agua, ['consumo_prom_mixto', 'consumo_prom_no_dom', 'consumo_prom_dom'])

In [None]:
def plot_by_pairs_grid(df, num_vars):
    """
    Plot a grid of small subplots in which each row and each column is assigned
    to one variable of num_vars: histograms on the diagonal and scatter plots
    of every pair of variables elsewhere. Rows with a missing value in any of
    the selected variables are left out.

    params: df a dataframe that has numerical variables
            num_vars the list of the numerical variables in df
    returns: None, the grid is drawn
    """
    complete_rows = df[num_vars].dropna()
    grid = sns.PairGrid(complete_rows)
    grid.map_diag(plt.hist)
    grid.map_offdiag(plt.scatter)
    

In [None]:
#plot_by_pairs_grid(final_consumo_agua, ['consumo_total_mixto', 'consumo_total_dom', 'consumo_total_no_dom'])

#plot_by_pairs_grid(final_consumo_agua, ['consumo_prom_dom', 'consumo_prom_mixto'])
#g = sns.pairplot(final_consumo_agua[['consumo_prom_dom', 'consumo_prom_mixto']].dropna())
#g.map(plt.scatter)

In [None]:
def plot_heatmap_corr(df, num_vars):
    """
    This function plots a heatmap of the correlation matrix of the numerical
    variables in num_vars. Categorical variables have to be converted to
    numeric ones before they can be included.

    params: df a dataframe that has numerical variables
            num_vars the list of the numerical variables in df
    returns: None, the heatmap is drawn on the current figure
    """
    corr_matrix = df[num_vars].corr()

    # The matrix is symmetric, so the cells above the diagonal are hidden.
    # NOTE(review): the original referenced an undefined `mask_matrix`; hiding
    # the redundant upper triangle is assumed to be the intent - confirm.
    mask_matrix = np.triu(np.ones(corr_matrix.shape, dtype = bool), k = 1)

    sns.heatmap(corr_matrix, annot = True, vmin=-1, vmax=1, center= 0, cmap= 'coolwarm', mask = mask_matrix)


In [None]:
#plot_heatmap_corr(final_consumo_agua, numeric_variables)

In [None]:
# filling a null values using fillna()  
#data["Gender"].fillna("No Gender", inplace = True)  

#### Para variables categoricas

- Número de categorías
- Valor de las categorías
- Moda
- Histograma
- Valores faltantes
- Número de observaciones con valores faltantes
- Proporción de observaciones por categoría
- Faltas de ortografía ?

In [None]:
def data_profiling_categorical_vars(df, col_name):
    """
    Compute basic descriptive statistics for the categorical variable col_name in the dataframe df

    param: df is a dataframe with the column col_name
           col_name is a string with the name of the categorical variable in the df

    return: dic a dictionary with name and value of the statistics computed
    """
    column = df[col_name]
    # unique() keeps the missing value as one more entry, so 'num_categories'
    # can be one larger than 'n_unique_values', which ignores missing values
    categories = column.unique()

    # Three most frequent values, padded with '-' up to three entries so the
    # Top1..Top3 keys always exist (also for 0 or 1 distinct values)
    top = column.value_counts().head(3).index.tolist()
    top.extend(['-'] * (3 - len(top)))

    # mode() is empty when the column has no non-null value
    modes = column.mode()
    mode_value = modes.iloc[0] if len(modes) > 0 else '-'

    dic = {'n_observations': column.count(),
           'mode': mode_value,
           'num_categories': len(categories),
           'categories': categories,
           'n_unique_values': column.nunique(),
           'n_missings': column.isna().sum(),
           '%_missings': column.isna().mean()*100,
           'Top1_most_common': top[0],
           'Top2_most_common': top[1],
           'Top3_most_common': top[2]
          }
    return dic
    

In [None]:
def descriptive_stats_for_categorical_vars(df, cat_vars):
    """
    Build a table with the basic descriptive statistics of several categorical
    variables: one column per variable, one row per statistic.

    param: df is a dataframe with categorical variables
           cat_vars is a list of the categorical variables in the df

    return: stats_table is a dataframe with the statistics for each categorical variable in df
    """
    # One profile (dictionary of statistics) per variable, labelled by variable
    profiles = [data_profiling_categorical_vars(df, var_name) for var_name in cat_vars]
    per_variable = pd.DataFrame(profiles, index = cat_vars)

    # Flip so variables are the columns and statistics the rows
    return per_variable.transpose()

In [None]:
descriptive_stats_for_categorical_vars(final_consumo_agua, string_variables)
#final_consumo_agua['alcaldia'].mode()

In [None]:
def descriptive_stats_for_categorical_vars2(df, cat_vars):
    """
    Show basic descriptive statistics for the categorical variables in the dataframe df

    param: df is a dataframe with categorical variables
           cat_vars is a list of the categorical variables in the df

    return: stats_table is a dataframe with the statistics for each categorical
            variable in df (one column per variable, one row per statistic)
    """
    stats_table = df[cat_vars].describe().reindex(['count', 'top'])
    stats_table = stats_table.rename(index = {'count': 'n_observations','top': 'mode'})

    extra_rows = ['n_categories', 'categories','n_missings','Top1_most_common','Top2_most_common','Top3_most_common']
    extra_columns = {}
    for var in cat_vars:
        # Number of missing values
        n_missings = df[var].isna().sum()
        # Categories found in the column (a missing value counts as one more)
        cat_list = df[var].unique()
        n_cat = len(cat_list)
        # Top 3 most common values, padded with '-' up to three entries so the
        # three rows can always be filled (also for 0 or 1 distinct values)
        top = df[var].value_counts().head(3).index.tolist()
        top.extend(['-'] * (3 - len(top)))

        values = [n_cat, cat_list, n_missings] + top
        # The cells are filled one by one into an object array so that the
        # array of categories is stored whole inside a single cell
        cells = np.empty(len(values), dtype = object)
        for position, value in enumerate(values):
            cells[position] = value
        extra_columns[var] = pd.Series(cells, index = extra_rows)

    t = pd.DataFrame(extra_columns, index = extra_rows, columns = cat_vars)

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows in
    # every version
    stats_table = pd.concat([stats_table, t])

    return stats_table
    

Coordenadas geoespaciales:
    
- Primero se pone la latitud y luego la longitud
- Un grado completo: ~111 km
- Primer decimal: ~11.1 km
- Segundo decimal: ~1.1 km
- Tercer decimal: ~110 m
- Cuarto decimal: ~11 m
- Quinto decimal: ~1.1 m
- Sexto decimal: ~0.11 m
- Valores que están cercanos al 100 representan la longitud
- El símbolo en cada coordenada representa si estamos al norte (positivo) o sur (negativo) -en la latitud-, al este (positivo) o al oeste (negativo) -en la longitud-.

identificar errores en los datos
conocer los datos
establecer hipotesis
entender el negocio


**Notas aclaratorias**
Tipo de emisión:
- consumo medido: consumo medido con medidor
- consumo_prom: Consumo promedio considerado como
    - A) A falta de aparato medidor, corresponde a la colonia catastral en la que se encuentra el inmueble (siempre que en dicha colonia se mida el 70% de las tomas de agua)
    - B) Por descompostura del aparato, se tomará el consumo promedio de los últimos 6 bimestres, quedando fuera el bimestre más alto

- indice_des: indice de desarrollo construido estadisticamente con datos socioeconomicos

### Análisis gráfico para variables categóricas

In [None]:
def graphics_for_categorical_vars(df, cat_vars):
    """
    Draw, save and show a horizontal bar chart with the frequency of every
    category, one chart per variable in cat_vars. Each chart is written to
    "Grafica frecuencia <variable>.png" in the working directory.

    param: df is a dataframe with categorical variables
           cat_vars is a list of the categorical variables in the df

    return: None
    """
    for column_name in cat_vars:
        # Frequency of each category as horizontal bars
        category_counts = df[column_name].value_counts()
        category_counts.plot(kind = 'barh', figsize = (8, 6))

        plt.xlabel("Conteo", labelpad = 14)
        plt.ylabel(column_name, labelpad = 14)

        titulo_graf = "Grafica frecuencia " + column_name
        plt.title(titulo_graf, y = 1.02)
        plt.savefig(titulo_graf + ".png")
        plt.show()



In [None]:
#graphics_for_categorical_vars(final_consumo_agua, string_variables)

# Algunas preguntas:

- ¿En qué bimestre(s) disminuye el consumo de agua?
- ¿Cómo está relacionado el consumo de agua con el indice_des?
- ¿Cómo está relacionado el consumo de agua con el indice_des y la colonia/alcaldía?
- ¿Cuál es el consumo de agua por alcaldía/colonia?
- ¿Quiénes consumen más agua: domicilios, negocios u otros?

In [None]:
def plot_bivariate_hist(df, var1, var2):
    """
    Overlay the distributions of two numerical variables on the same axes so
    they can be compared.

    params: df a dataframe that contains the columns var1 and var2
            var1 a string with the name of the first numerical variable
            var2 a string with the name of the second numerical variable
    returns: None, both distributions are drawn on the current figure
    """
    # Each layer is labelled with its own column name (the labels used to be
    # the unrelated "Sepal Length" / "Sepal Width") and the legend is drawn so
    # the labels are actually visible.
    sns.distplot(df[var1].dropna(), color="skyblue", label=var1)
    sns.distplot(df[var2].dropna(), color="red", label=var2)
    plt.legend()

In [None]:
plot_bivariate_hist(final_consumo_agua, 'consumo_total', 'consumo_prom')

In [None]:
def barplot_by_category(df, var, cat, top_n = 20):
    """
    This function shows a barplot with the mean of the numerical variable var
    for each category in cat, sorted from the largest to the smallest mean.

    params: df a dataframe that contains the columns var and cat
            var a string with the name of the numerical variable
            cat the name of the categorical variable used to group
            top_n maximum number of categories drawn. Default is 20
    returns: None, the barplot is shown
    """
    sns.set(style="whitegrid", palette = 'deep')

    plt.figure(figsize=(10,8))

    x_data = df.groupby(cat)[var].mean().sort_values(ascending = False)
    titulo_graf = "Promedio de " + str(var) + " por " + str(cat)

    if len(x_data) > top_n:
        # head(top_n) keeps exactly top_n rows; the former slice [0:19] kept
        # only 19 although the title announces a top 20
        x_data = x_data.head(top_n)
        titulo_graf = titulo_graf + " (Top " + str(top_n) + ")"

    # x and y are passed as vectors, so no `data` frame is needed
    x_plot = sns.barplot(x=x_data.values, y=x_data.index)
    x_plot.set_title(titulo_graf)
    plt.show()

In [None]:
final_consumo_agua.groupby('alcaldia')['consumo_total'].mean().sort_values(ascending = False)

#barplot_by_category(final_consumo_agua, 'consumo_total', ['alcaldia', 'colonia'])


In [None]:
barplot_by_category(final_consumo_agua, 'consumo_total', 'indice_des')

# 5. Backward selection

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer

In [None]:
print(numeric_variables, string_variables)


In [None]:
# Target of the regression
y_output = final_consumo_agua['consumo_total']
# Regressors: an independent frame with every column except the target
x_regressors = final_consumo_agua.drop(columns = 'consumo_total')
#x_regressors.head()

In [None]:
# Numeric regressors that will be imputed with the column mean.
# The list is written out by hand; the commented line below is the automatic
# alternative, which would select every numeric column of x_regressors.
#numeric_variables = x_regressors.select_dtypes(include = 'number').columns.values
# 
numeric_variables = ['consumo_total_mixto', 'consumo_prom_dom',
       'consumo_total_dom', 'consumo_prom_mixto', 'consumo_prom',
       'consumo_prom_no_dom', 'consumo_total_no_dom', 'latitud', 'longitud']

# Disabled assignment kept for reference (vars_imputed is not defined in this notebook)
#x_regressors.loc[:, numeric_variables] = vars_imputed 


In [None]:
def mean_imputation_for_numeric_vars(df, num_vars):
    """
    Replace the missing values of the selected numeric variables by the mean
    of their column. The dataframe itself is not modified.

    param: df dataframe containing the numeric variables
           num_vars a list with the names of numeric variables

    return: vars_imputed an array with one row per observation and one column
            per variable in num_vars, with the missing values filled in
    """
    # Values of the selected columns as a (rows, variables) matrix
    matrix = df[num_vars].to_numpy().reshape(len(df), len(num_vars))

    # Learn the column means and fill the gaps in a single step
    return SimpleImputer(strategy="mean").fit_transform(matrix)

In [None]:
imputed_vars = mean_imputation_for_numeric_vars(x_regressors, numeric_variables)
# Check I have no missings
#descriptive_stats_for_numeric_vars(x_regressors, numeric_variables)

In [None]:
def standardization_for_numeric_vars(num_vars):
    """
    Standardize numeric data with a StandardScaler created with its default
    settings.

    param: num_vars the numeric data to standardize; the scaler is fitted on
           it and then applied to it (the caller in this notebook passes the
           array of imputed values)

    return: vars_standardized the standardized values
    """
    # Fit the scaler and apply it to the same data in a single step
    scaler = StandardScaler()
    return scaler.fit_transform(num_vars)


In [None]:
# Scale the imputed data
standardized_vars = standardization_for_numeric_vars(imputed_vars)

# Write the scaled values back into the regressors dataframe. The name used
# here has to be the one assigned above; `vars_standardized` only exists
# inside the function, so using it raised a NameError.
x_regressors.loc[:, numeric_variables] = standardized_vars

In [None]:
x_regressors.head()
#descriptive_stats_for_numeric_vars(x_regressors, numeric_variables)

In [None]:
# One-hot encoding
def transforma_cat_dummies(df, cat_vars):
    """
    Build the dummy (one-hot) columns of the variables in cat_vars, leaving
    out the first category of each variable.

    param: df is a dataframe that contains the variables in cat_vars
           cat_vars is a list with the names of the variables to encode; they
           are encoded even when their dtype is numeric

    return: a new dataframe that holds only the dummy columns (df is not modified)
    """
    # The dtype is pinned to an integer type: from pandas 2.0 on the default
    # is bool, and bool columns mixed with floats end up as an object array
    # when the design matrix is later handed to the regression.
    return pd.get_dummies(df[cat_vars], columns = cat_vars, drop_first = True, dtype = 'uint8')

In [None]:
string_variables = x_regressors.select_dtypes(include = 'object').columns.values
print(string_variables)

In [None]:
#Hot encoding for categorical variables
dummies = transforma_cat_dummies(x_regressors, ['alcaldia', 'indice_des','bimestre'])

In [None]:
# Remove the text columns and the numeric-coded categorical/identifier columns,
# then append the dummy columns in their place
columns_to_remove = [*string_variables, 'bimestre', 'anio', 'gid']
x_regressors = pd.concat([x_regressors.drop(columns = columns_to_remove), dummies], axis = 1)

In [None]:
x_regressors.head()

In [None]:
import statsmodels.api as sm

In [None]:
x_regressors = sm.add_constant(x_regressors)
model = sm.OLS(y_output, x_regressors)

In [None]:
results = model.fit()
print(results.summary())

In [None]:
P_VALUE_THRESHOLD = 0.05

# p-value of every regressor in the fitted model
pVals = results.pvalues

# Regressors that are not significant at the chosen threshold
no_significativa = pVals[pVals > P_VALUE_THRESHOLD]

# Drop the least significant one. When every p-value is already below the
# threshold the selection is empty and idxmax() would raise, so nothing is
# dropped in that case.
index = None
if not no_significativa.empty:
    index = no_significativa.idxmax()
    x_regressors = x_regressors.drop(index, axis = 1)

In [None]:
def backward_selection(y, x, pVal_threshold):
    """
    Backward elimination with ordinary least squares: fit the model, remove
    the regressor with the largest p-value while that p-value is above the
    threshold, and fit again with the remaining regressors.

    params: y the response variable
            x a dataframe with the candidate regressors
            pVal_threshold the largest p-value a regressor may have to be kept
    returns: x the dataframe with the regressors that were kept, including the
             constant column added by sm.add_constant unless it was removed

    NOTE(review): the constant column competes like any other regressor, so it
    can be dropped too - confirm this is intended.
    """
    x = sm.add_constant(x)

    while True:
        p_vals = sm.OLS(y, x).fit().pvalues
        drop_index = p_vals.idxmax()

        # Stop as soon as the worst regressor is not above the threshold
        if not (p_vals[drop_index] > pVal_threshold):
            return x

        x = x.drop(drop_index, axis = 1)

In [None]:
backward_selection(y_output, x_regressors, P_VALUE_THRESHOLD)
