<a href="https://colab.research.google.com/github/FelipeVillegasH/Tareas_Coding_Dojo_Track2/blob/main/KNN_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## KNN Predictions Supervised Machine Learning
# By Felipe Villegas

In [487]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [488]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [489]:
df=pd.read_csv('/content/drive/MyDrive/CodingDojo/Track 2/Semana 5/Tareas Core/abalone.data')
df.head()

Unnamed: 0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [490]:
# nombres= pd.read_csv('/content/drive/MyDrive/CodingDojo/Track 2/Semana 5/Tareas Core/abalone.names', on_bad_lines='skip')
# nombres.head()

In [491]:
# nombres.to_numpy()
# the names are actually explicit in the webside so we will ignore this and the previous code

In [492]:
df.isna().sum()
# the data frame is complete

M         0
0.455     0
0.365     0
0.095     0
0.514     0
0.2245    0
0.101     0
0.15      0
15        0
dtype: int64

In [493]:
# Naming the columns: 
df.columns = ['Sex', 'Length(mm)', 'Diameter(mm)', 'Height(mm)', 'Whole Weight(gr)', 'Shucked Weight(gr)', 'Viscera Weight(gr)', 'Shell Weight(gr)', 'Rings']
df.head()
# Aditional Data about the features of the Df by column
# Name / Data Type / Measurement Unit / Description

# Sex / nominal / -- / M, F, and I (infant)
# Length / continuous / mm / Longest shell measurement
#Diameter / continuous / mm / perpendicular to length
# Height / continuous / mm / with meat in shell
# Whole weight / continuous / grams / whole abalone
# Shucked weight / continuous / grams / weight of meat
# Viscera weight / continuous / grams / gut weight (after bleeding)
# Shell weight / continuous / grams / after being dried
# Rings / integer / -- / +1.5 gives the age in years

# Here We realize that we need to calculate and generate another feature column of the age by the Rings

Unnamed: 0,Sex,Length(mm),Diameter(mm),Height(mm),Whole Weight(gr),Shucked Weight(gr),Viscera Weight(gr),Shell Weight(gr),Rings
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [494]:
# Afterwards we set the target variable in the df by creating the new column of the feature 'Age'
df = df.assign(Age = lambda x: (x['Rings']+1.5))
df.head()
# We could also have used the method .apply which is usefull when operating with multiple rows and columns conditions 
# df.apply(lambda row: row.Rings + 1.5, axis=1)
# and then by naming the new feature-variable-column 
# df['Age'] = df.apply(lambda row: row.Rings + 1.5, axis=1)

Unnamed: 0,Sex,Length(mm),Diameter(mm),Height(mm),Whole Weight(gr),Shucked Weight(gr),Viscera Weight(gr),Shell Weight(gr),Rings,Age
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,8.5
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,10.5
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,11.5
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,8.5
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8,9.5


In [495]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4176 entries, 0 to 4175
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Sex                 4176 non-null   object 
 1   Length(mm)          4176 non-null   float64
 2   Diameter(mm)        4176 non-null   float64
 3   Height(mm)          4176 non-null   float64
 4   Whole Weight(gr)    4176 non-null   float64
 5   Shucked Weight(gr)  4176 non-null   float64
 6   Viscera Weight(gr)  4176 non-null   float64
 7   Shell Weight(gr)    4176 non-null   float64
 8   Rings               4176 non-null   int64  
 9   Age                 4176 non-null   float64
dtypes: float64(8), int64(1), object(1)
memory usage: 326.4+ KB


# Age Prediction

In [496]:
# Selection of the training data base, its important to only select the cuantitavie variables
AgeFeatures= ['Length(mm)',
               'Diameter(mm)',
               'Height(mm)',
               'Whole Weight(gr)',
               'Shucked Weight(gr)',
               'Viscera Weight(gr)',
               'Shell Weight(gr)']
XAge = df.loc[:, AgeFeatures].to_numpy()

In [497]:
XAge.shape

(4176, 7)

In [498]:
# Assigning of the target vector y: 
# Do not mistake to add  extra [] to name the column of the target
yAge = df.loc[:, 'Age'].to_numpy()
yAge.shape

(4176,)

In [499]:
# we Scalate XAge
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(XAge)
# Transform XAge
XAge = scaler.transform(XAge);

In [500]:
# We assign the KNN model to a variable
KnnAge = KNeighborsRegressor(n_neighbors=5)
print(KnnAge)

KNeighborsRegressor()


In [501]:
# Training the prediction model
KnnAge.fit(XAge, yAge)

KNeighborsRegressor()

In [502]:
# Then we execute the prediction with: knn.predict(XAge) but to actually see the predictions we need to assign the results to a variable
PredictedAges= KnnAge.predict(XAge)
PredictedAges

array([ 8.5, 11.1, 10.9, ..., 12.5, 11.5, 12.7])

In [503]:
score = KnnAge.score(XAge, yAge)
score
# the 65.8% of the Variation in the Age is due to the Features selected in the XAge training matrix 

0.6582492859136977

# Sex Prediction

In [504]:
# we select the features to create the XSex training data set
features = ['Length(mm)',
               'Diameter(mm)',
               'Height(mm)',
               'Whole Weight(gr)',
               'Shucked Weight(gr)',
               'Viscera Weight(gr)',
               'Shell Weight(gr)',
               'Rings',
               'Age']

In [505]:
# Again we Loc the variables to do the training data set and ye transform it to numpy
XSex = df.loc[:, features].to_numpy()

In [506]:
# Then we set the target
ySex = df.loc[:, 'Sex'].values

In [507]:
# then we format the data
scaler.fit(XSex)
XSex = scaler.transform(XSex)

In [508]:
# And Create the variable to the KNN
KnnSex = KNeighborsClassifier(n_neighbors=5)
print(KnnSex)

KNeighborsClassifier()


In [509]:
# then we let the KNN Study the training data set 
KnnSex.fit(XSex, ySex)

KNeighborsClassifier()

In [510]:
# And do the predictions considering the correlation of all the features of the training data set
prediccion_sexo = KnnSex.predict(XSex)
prediccion_sexo

array(['I', 'F', 'M', ..., 'F', 'F', 'M'], dtype=object)

In [511]:
scoreSex = KnnSex.score(XSex, ySex)
scoreSex
# the 67.8% of the Variation in the Sex is due to the Features selected in the XSex training matrix 

0.6784003831417624

###¿Podrías haber utilizado la regresión lineal y la regresión KNN para resolver el problema de regresión?
# Sí, los algoritmos de regresion lineal son útiles para predecir tanto como los de Knn, no obstante, se debe analizar el accuracy de esta prueba a partir del R2 ya que los datos podrían no presentar una tendencia lineal y por lo tanto, no sería una predicción adecuada. En dicho caso se aconseja usar KNN. Todo depende de la distribución y la naturaleza de los datos definida por su varianza y sesgo.
# Cabe recalcar que la regresión lineal, es un algoritmo de análisis preliminar y facil de entender pero no siempre es el mas adecuado para el análisis de datos 

###¿Podrías haber utilizado la regresión lineal para el problema de clasificación?
# Los algoritmos de regresión lineal no son capaces de clasificar, por lo tanto, no se podría haber utilizado para el problema de clasificación. 