1. **Entendimiento de los datos**

In [None]:
# librerías
seed = 161
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Composicion de pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.model_selection import train_test_split

# Regresion lineal
from sklearn.linear_model import LinearRegression

# Importar/ Exportar modelos
from joblib import dump, load

# Metricas
from sklearn.metrics import mean_squared_error as mse

# q-q plots
import scipy.stats as stats


In [None]:
# The notebook was developed in Google Colab; mounting Google Drive lets the data file
# be read directly from Drive instead of having to be re-uploaded every session.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training data from the mounted Drive; the first CSV column becomes the index.
data = pd.read_csv(
    '/content/drive/MyDrive/202220_Laboratorio_3_data_university_admission_train.csv',
    index_col=0,
)
# Work on a copy so the frame that was read from disk stays untouched.
data_t = data.copy()


In [None]:
# Count the missing values in every column.
data_t.isna().sum()

Serial No.            0
GRE Score             0
TOEFL Score           0
University Rating     0
SOP                   0
LOR                   0
CGPA                  0
Research              0
Admission Points     65
dtype: int64

In [None]:
# Drop only the rows whose label ('Admission Points') is missing.
# A bare dropna() would also discard rows with a missing feature, which the
# imputer inside the modelling pipeline is meant to handle instead.
data_t = data_t.dropna(subset=['Admission Points'])

In [None]:
# Separate the label from the features, then split both into training and validation sets.
y = data_t[['Admission Points']]
X = data_t.drop('Admission Points', axis=1)
# Fixed random_state keeps the split reproducible; the split proportions are the library defaults.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [None]:
# Summary statistics of the training label.
y_train.describe()

Unnamed: 0,Admission Points
count,1128.0
mean,66.386472
std,19.281796
min,34.0
25%,47.78
50%,67.0
75%,84.0
max,145.5


In [None]:
# Display the training features (the rendered output is abbreviated for long frames).
X_train

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
1251,44,332,117,4,0.24,4.08,9.10,0
846,395,329,82,4,2.12,4.00,9.23,1
1287,190,324,112,5,5.00,5.00,8.55,1
1549,88,317,107,2,1.00,3.00,8.28,0
1461,170,311,99,2,2.50,3.00,7.98,0
...,...,...,...,...,...,...,...,...
1176,407,301,77,3,2.46,3.00,7.45,0
1349,167,302,82,3,2.32,4.00,8.33,0
897,339,311,108,5,4.00,4.00,8.74,1
1522,79,296,95,2,3.00,2.00,7.54,1


In [None]:
# Helper that removes outliers from a column with the IQR rule; it was written as the
# basis for a custom transformer.
def outliers_handler(col, data=None):
    """Return the values of column ``col`` that lie strictly inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, computed on the
    column itself. Values on or outside a fence, and missing values, are left out.

    Parameters
    ----------
    col : hashable
        Label of the column to filter.
    data : pandas.DataFrame, optional
        Frame that holds the column. When omitted, the notebook-level
        ``data_t`` frame is used.

    Returns
    -------
    pandas.Series
        The retained values of ``data[col]``, keeping their original index.

    Notes
    -----
    The result can have fewer rows than the input, so the function is not a
    drop-in row-preserving transformer.
    """
    if data is None:
        # Fall back to the global working frame defined earlier in the notebook.
        data = data_t

    values = data[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    upper_limit = q3 + 1.5 * iqr
    lower_limit = q1 - 1.5 * iqr

    # Index with the ``col`` argument; the previous code used the literal
    # string 'col', which raised KeyError or filtered an unrelated column.
    inside_fences = (values > lower_limit) & (values < upper_limit)
    return data.loc[inside_fences, col]

In [None]:
# Columns used as features for the linear regression.
selected_cols = ['University Rating','CGPA']

In [None]:
# Abandoned attempt at a custom outlier transformer. It produced errors, so it is kept
# only as an inert string literal (the cell just echoes the text).
# NOTE(review): presumably it failed because outliers_handler expects a column name while
# FunctionTransformer passes the data itself — TODO confirm.
'''
# Crear preprocesador para la columna Univeresity Rating y CGPA
data_preprocessor = ColumnTransformer(transformers = [
    ('selector', 'passthrough', selected_cols),
    ('scaler_transformer', MinMaxScaler(), ['University Rating', 'CGPA']), 
    ('nulls_transformer', SimpleImputer(missing_values=np.nan, strategy='mean'), ['University Rating', 'CGPA']), 
    ('outliers_uni_transformer', FunctionTransformer(outliers_handler, validate = False), ['University Rating']), 
    ('outliers_cgpa_transformer', FunctionTransformer(outliers_handler, validate = False), ['CGPA'])
])
'''


"\n# Crear preprocesador para la columna Univeresity Rating y CGPA\ndata_preprocessor = ColumnTransformer(transformers = [\n    ('selector', 'passthrough', selected_cols),\n    ('scaler_transformer', MinMaxScaler(), ['University Rating', 'CGPA']), \n    ('nulls_transformer', SimpleImputer(missing_values=np.nan, strategy='mean'), ['University Rating', 'CGPA']), \n    ('outliers_uni_transformer', FunctionTransformer(outliers_handler, validate = False), ['University Rating']), \n    ('outliers_cgpa_transformer', FunctionTransformer(outliers_handler, validate = False), ['CGPA'])\n])\n"

In [None]:

# Preprocessor for the selected feature columns (University Rating and CGPA).
# ColumnTransformer applies each listed transformer independently and concatenates the
# results, so listing the scaler, the imputer and a passthrough side by side produced
# three copies of the same columns, with missing values still present in the scaled and
# passthrough copies. The steps are therefore chained in a nested Pipeline: missing
# values are filled with the column mean first, then the columns are scaled with
# MinMaxScaler. All other columns are dropped (ColumnTransformer's default remainder).
data_preprocessor = ColumnTransformer(transformers = [
    ('numeric_transformer', Pipeline(steps=[
        ('nulls_transformer', SimpleImputer(missing_values=np.nan, strategy='mean')),
        ('scaler_transformer', MinMaxScaler()),
    ]), selected_cols),
])

In [None]:
# Assemble the full pipeline: preprocessing first, then the linear regression model.
pipe = Pipeline(
    steps=[
        ('preprocessing', data_preprocessor),
        ('model', LinearRegression()),
    ]
)

In [None]:
# Fit the pipeline on the training split; the value returned by fit is kept in `pl`.
pl=pipe.fit(X_train, y_train)

# Predict on the validation split.
yhat = pipe.predict(X_val)

***Exportar el modelo***

In [None]:
# File name under which the pipeline is serialized.
filename = 'lab4.joblib'
# Save the pipeline so the application can load and use it.
dump(pipe, filename) 

['lab4.joblib']