<a href="https://colab.research.google.com/github/FelipeVillegasH/Track3/blob/main/KNN_with_Pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## KNN with Pipelines
# By Felipe Villegas

In [254]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [255]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [256]:
df=pd.read_csv('/content/drive/MyDrive/CodingDojo/Track 2/Semana 5/Tareas Core/abalone.data')
df.head()

Unnamed: 0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [257]:
df.isna().sum()
# the data frame is complete

M         0
0.455     0
0.365     0
0.095     0
0.514     0
0.2245    0
0.101     0
0.15      0
15        0
dtype: int64

In [258]:
# Naming the columns: 
df.columns = ['Sex', 'Length(mm)', 'Diameter(mm)', 'Height(mm)', 'Whole Weight(gr)', 'Shucked Weight(gr)', 'Viscera Weight(gr)', 'Shell Weight(gr)', 'Rings']
df.head()
# Aditional Data about the features of the Df by column
# Name / Data Type / Measurement Unit / Description

# Sex / nominal / -- / M, F, and I (infant)
# Length / continuous / mm / Longest shell measurement
#Diameter / continuous / mm / perpendicular to length
# Height / continuous / mm / with meat in shell
# Whole weight / continuous / grams / whole abalone
# Shucked weight / continuous / grams / weight of meat
# Viscera weight / continuous / grams / gut weight (after bleeding)
# Shell weight / continuous / grams / after being dried
# Rings / integer / -- / +1.5 gives the age in years

# Here We realize that we need to calculate and generate another feature column of the age by the Rings

Unnamed: 0,Sex,Length(mm),Diameter(mm),Height(mm),Whole Weight(gr),Shucked Weight(gr),Viscera Weight(gr),Shell Weight(gr),Rings
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [259]:
# Afterwards we set the target variable in the df by creating the new column of the feature 'Age'
df = df.assign(Age = lambda x: (x['Rings']+1.5))
df.head()
# We could also have used the method .apply which is usefull when operating with multiple rows and columns conditions 
# df.apply(lambda row: row.Rings + 1.5, axis=1)
# and then by naming the new feature-variable-column 
# df['Age'] = df.apply(lambda row: row.Rings + 1.5, axis=1)

Unnamed: 0,Sex,Length(mm),Diameter(mm),Height(mm),Whole Weight(gr),Shucked Weight(gr),Viscera Weight(gr),Shell Weight(gr),Rings,Age
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,8.5
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,10.5
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,11.5
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,8.5
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8,9.5


In [260]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4176 entries, 0 to 4175
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Sex                 4176 non-null   object 
 1   Length(mm)          4176 non-null   float64
 2   Diameter(mm)        4176 non-null   float64
 3   Height(mm)          4176 non-null   float64
 4   Whole Weight(gr)    4176 non-null   float64
 5   Shucked Weight(gr)  4176 non-null   float64
 6   Viscera Weight(gr)  4176 non-null   float64
 7   Shell Weight(gr)    4176 non-null   float64
 8   Rings               4176 non-null   int64  
 9   Age                 4176 non-null   float64
dtypes: float64(8), int64(1), object(1)
memory usage: 326.4+ KB


# Age Prediction

In [261]:
# Selection of the training data base, its important to only select the cuantitavie variables
AgeFeatures= ['Length(mm)',
               'Diameter(mm)',
               'Height(mm)',
               'Whole Weight(gr)',
               'Shucked Weight(gr)',
               'Viscera Weight(gr)',
               'Shell Weight(gr)']
XAge = df.loc[:, AgeFeatures].to_numpy()

In [262]:
XAge.shape

(4176, 7)

In [263]:
# Assigning of the target vector y: 
# Do not mistake to add  extra [] to name the column of the target
yAge = df.loc[:, 'Age'].to_numpy()
yAge.shape

(4176,)

In [264]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XAge, yAge, random_state=3)

In [265]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=2))
pipe.fit(XAge, yAge)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsregressor', KNeighborsRegressor(n_neighbors=2))])

In [266]:
print('Training R2:', pipe.score(XAge, yAge))
print('Testing R2:', pipe.score(X_test, y_test))
# model is not overfitted and predicts 79.80% well in testing data 

Training R2: 0.7913274404861848
Testing R2: 0.7980753761135618


# Sex Prediction

In [267]:
# we select the features to create the XSex training data set
features = ['Length(mm)',
               'Diameter(mm)',
               'Height(mm)',
               'Whole Weight(gr)',
               'Shucked Weight(gr)',
               'Viscera Weight(gr)',
               'Shell Weight(gr)',
               'Rings',
               'Age']

In [268]:
# Again we Loc the variables to do the training data set and ye transform it to numpy
XSex = df.loc[:, features].to_numpy()

In [269]:
# Then we set the target
ySex = df.loc[:, 'Sex'].values

In [270]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XSex, ySex, random_state=3)

In [271]:
from sklearn.pipeline import make_pipeline

In [272]:
pipeKnnClassifier = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=2))
pipeKnnClassifier.fit(XSex, ySex)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsclassifier', KNeighborsClassifier(n_neighbors=2))])

In [273]:
print('Training R2:', pipeKnnClassifier.score(XSex, ySex))
print('Testing R2:', pipeKnnClassifier.score(X_test, y_test))
# model is not overfitted and predicts 74.61% well in testing data 

Training R2: 0.7610153256704981
Testing R2: 0.7461685823754789


# ¿Por qué querrías ocupar una tubería para KNN?
## Para reducir los pasos de escalado, instanciamiento y modelaje. Y por lo tanto, reducir la probabilidad de errores y aumentar la replicabilidad de nuestros modelos.
# ¿Para qué otros modelos o tareas sería útil usar una tubería?
## Para todos aquellos modelos que necesiten ser escalados. O cualquier modelo que necesite que su data sea pre procesada antes de ejecutar el modelo. 