In [1]:
from google.colab import drive
drive.mount ("/content/drive")

Mounted at /content/drive


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [9]:
filename = "/content/drive/MyDrive/Data CodingDojo/abalones/abalone.data"

df = pd.read_csv(filename)


df.head()

Unnamed: 0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [11]:
df.columns = ["Sex","Length","Diameter","Height","Whole_weight","Shucked_weight","Viscera_weight","Shell_weight","Rings"]

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4176 entries, 0 to 4175
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4176 non-null   object 
 1   Length          4176 non-null   float64
 2   Diameter        4176 non-null   float64
 3   Height          4176 non-null   float64
 4   Whole_weight    4176 non-null   float64
 5   Shucked_weight  4176 non-null   float64
 6   Viscera_weight  4176 non-null   float64
 7   Shell_weight    4176 non-null   float64
 8   Rings           4176 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


# Matriz de caracteristicas y vector objetivo

In [15]:
X = df.drop(columns="Rings")
y = df["Rings"]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

# Selectores de columnas

In [17]:
cat_selector = make_column_selector(dtype_include = "object")
num_selector = make_column_selector(dtype_include = "number")

# Instanciar transformadores

In [18]:
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown="ignore")

# Emparejar transformador con las columnas

In [19]:
num_tuple = (scaler, num_selector)
cat_tuple = (ohe, cat_selector)

# Instanciar ColumnTransformer

In [22]:
col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder="passthrough")

# Encajar el transformador

In [23]:
col_transformer.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('standardscaler', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f7150b61d30>),
                                ('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f7150b612b0>)])

# nombres de columnas para facilitar visualizacion de datos

In [33]:
columns = col_transformer.get_feature_names_out()


'standardscaler__Length'

# Transformacion

In [24]:
X_train_processed = col_transformer.transform(X_train)
X_test_processed = col_transformer.transform(X_test)

In [26]:
X_train_processed

array([[-1.54642176, -1.5561698 , -1.05355826, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.79572536,  0.52191671,  0.7068688 , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.25201264,  0.31917656,  0.35478338, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.04075575,  0.21780648,  0.23742158, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.41930886,  0.52191671,  0.23742158, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.58660508,  0.57260174,  0.00269797, ...,  1.        ,
         0.        ,  0.        ]])

# Visualizacion de transformaciones en DataFrame

In [35]:
X_train_df = pd.DataFrame(X_test_processed, columns=columns)
X_train_df

Unnamed: 0,standardscaler__Length,standardscaler__Diameter,standardscaler__Height,standardscaler__Whole_weight,standardscaler__Shucked_weight,standardscaler__Viscera_weight,standardscaler__Shell_weight,onehotencoder__Sex_F,onehotencoder__Sex_I,onehotencoder__Sex_M
0,0.753901,0.927397,0.824231,1.115145,0.895581,1.354440,0.380418,1.0,0.0,0.0
1,0.544781,0.572602,0.237422,0.654196,1.141683,0.526742,0.089847,1.0,0.0,0.0
2,0.084716,0.116436,0.120060,0.195286,0.170822,0.140180,0.079085,0.0,1.0,0.0
3,0.963022,0.978082,0.589507,0.802066,0.728502,0.804157,0.868291,1.0,0.0,0.0
4,-0.208052,-0.289044,0.354783,-0.357445,-0.540390,-0.346434,-0.243771,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1039,0.210189,0.065751,-0.114664,-0.112693,-0.262679,-0.018993,0.150831,1.0,0.0,0.0
1040,-0.751765,-0.745209,-0.701473,-0.918334,-0.861000,-0.864882,-0.925358,1.0,0.0,0.0
1041,0.461133,0.724657,0.941592,0.855096,0.606580,0.808705,0.947211,1.0,0.0,0.0
1042,0.963022,1.079452,1.528401,1.434342,1.369722,1.395370,1.173211,1.0,0.0,0.0
