<a href="https://colab.research.google.com/github/D-Soto/Coco/blob/main/Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem so files stored there can be
# read by path from the cells below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline

In [None]:
# Read the abalone data with header=None so the first record is kept as data
# rather than being consumed as column names; columns come back numbered.
abalone_path = '/content/drive/My Drive/DS/data/abalone.data'
df = pd.read_csv(abalone_path, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [None]:
# Replace the positional integer column labels with descriptive names.
column_names = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole_weight',
    'Shucked_weight',
    'Viscera_weight',
    'Shell_weight',
    'Rings',
]
df.columns = column_names

In [None]:
# Show how many rows fall into each category of the 'Sex' column.
sex_column = df['Sex']
sex_column.value_counts()

M    1528
I    1342
F    1307
Name: Sex, dtype: int64

In [None]:
# Regression setup: the seven numeric measurement columns form the features
# matrix and 'Rings' is the target vector ('Sex' is left out here).
regression_features = [
    'Length',
    'Diameter',
    'Height',
    'Whole_weight',
    'Shucked_weight',
    'Viscera_weight',
    'Shell_weight',
]
X = df[regression_features]
y = df['Rings']

In [None]:
# Hold out a test set for the regression task; the fixed random_state makes
# the shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
# Chain scaling and KNN regression into one estimator so the scaler is fit
# only on whatever data the pipeline itself is fit on.
regression_scaler = StandardScaler()
knn_regressor = KNeighborsRegressor()
regression_pipe = make_pipeline(regression_scaler, knn_regressor)

In [None]:
# Fit the scaler and the KNN regressor on the training split only.
regression_pipe.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('kneighborsregressor',
                 KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                     metric='minkowski', metric_params=None,
                                     n_jobs=None, n_neighbors=5, p=2,
                                     weights='uniform'))],
         verbose=False)

In [None]:
# Score the fitted regression pipeline on both splits to compare fit on seen
# versus unseen data.
train_r2 = regression_pipe.score(X_train, y_train)
test_r2 = regression_pipe.score(X_test, y_test)
print("Training R2:", train_r2)
print("Testing R2:", test_r2)

Training R2: 0.6539079854456187
Testing R2: 0.49888197203213314


In [None]:
# Classification setup: predict 'Sex' from every other column, now including
# 'Rings' as a feature.
# NOTE: this rebinds X and y, replacing the regression features/target that
# were assigned earlier in the notebook.
classification_features = [
    'Length',
    'Diameter',
    'Height',
    'Whole_weight',
    'Shucked_weight',
    'Viscera_weight',
    'Shell_weight',
    'Rings',
]
X = df[classification_features]
y = df['Sex']

In [None]:
# Re-split for the classification task with the same fixed seed.
# NOTE: this overwrites the X_train/X_test/y_train/y_test produced for the
# regression task, so the regression scoring cell must not be re-run after it.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
# Same scale-then-KNN recipe as the regression pipeline, with the classifier
# variant of KNN; build it and fit it on the training split.
classification_scaler = StandardScaler()
knn_classifier = KNeighborsClassifier()
classification_pipe = make_pipeline(classification_scaler, knn_classifier)
classification_pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

In [None]:
# Score the fitted classification pipeline on both splits to compare
# performance on seen versus unseen data.
train_accuracy = classification_pipe.score(X_train, y_train)
test_accuracy = classification_pipe.score(X_test, y_test)
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

Training Accuracy: 0.6749680715197957
Testing Accuracy: 0.5196172248803828




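1. Using a pipeline to chain scaling and KNN together keeps the workflow simple and reduces the chance of errors, because the same preprocessing steps are applied automatically every time the model is fit or scored.
2. Any model that requires feature scaling is a good candidate for a pipeline.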