# Imports

In [404]:
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, recall_score, confusion_matrix, f1_score

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Obtención de Datasets

In [405]:
!kaggle datasets download eswarchandt/phishing-website-detector -p . --unzip
!kaggle datasets download yasserh/wine-quality-dataset -p . --unzip
!kaggle datasets download utkarshx27/breast-cancer-wisconsin-diagnostic-dataset -p . --unzip
!kaggle datasets download joebeachcapital/30000-spotify-songs -p . --unzip

Dataset URL: https://www.kaggle.com/datasets/eswarchandt/phishing-website-detector
License(s): unknown
Downloading phishing-website-detector.zip to .




  0%|          | 0.00/197k [00:00<?, ?B/s]
100%|██████████| 197k/197k [00:00<00:00, 609kB/s]
100%|██████████| 197k/197k [00:00<00:00, 606kB/s]


Dataset URL: https://www.kaggle.com/datasets/yasserh/wine-quality-dataset
License(s): CC0-1.0
Downloading wine-quality-dataset.zip to .




  0%|          | 0.00/21.5k [00:00<?, ?B/s]
100%|██████████| 21.5k/21.5k [00:00<00:00, 1.34MB/s]


Dataset URL: https://www.kaggle.com/datasets/utkarshx27/breast-cancer-wisconsin-diagnostic-dataset
License(s): CC0-1.0
Downloading breast-cancer-wisconsin-diagnostic-dataset.zip to .




  0%|          | 0.00/47.7k [00:00<?, ?B/s]
100%|██████████| 47.7k/47.7k [00:00<00:00, 543kB/s]


Dataset URL: https://www.kaggle.com/datasets/joebeachcapital/30000-spotify-songs
License(s): DbCL-1.0
Downloading 30000-spotify-songs.zip to .




  0%|          | 0.00/3.01M [00:00<?, ?B/s]
 33%|███▎      | 1.00M/3.01M [00:00<00:01, 2.01MB/s]
100%|█████████▉| 3.00M/3.01M [00:00<00:00, 5.16MB/s]
100%|██████████| 3.01M/3.01M [00:00<00:00, 4.47MB/s]


In [406]:
!ls

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [407]:
pd.set_option('display.max_columns', None)
phishing = pd.read_csv("phishing.csv")
wine = pd.read_csv("WineQT.csv")
cancer = pd.read_csv("brca.csv")
spotify = pd.read_csv("spotify_songs.csv")

# Funciones generales

### Mostrar nulos

In [408]:
def mostrar_nulos(df):
    nulos = df.isnull().sum()[df.isnull().sum() > 0]
    if nulos.empty:
        print("No hay nulos en el DataFrame")
    else:
        print("Columnas con valores nulos:")
        print(nulos)

# Funcion para entrenar modelos

In [409]:
modelos = {
    "SVM": SVC(),
    "Regresion Logistica": LogisticRegression(),
    "Arbol de Decision": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "kNN": KNeighborsClassifier(n_neighbors=5)
}

def evaluar_modelo(X, y, modelo_deseado):
    nombre = modelo_deseado[0]
    modelo = modelo_deseado[1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{nombre}: {acc:.4f}")
    print(classification_report(y_test, y_pred))

# 1. Analisis de Datasets

## 1.1. Phishing

### Primera vista

In [410]:
phishing.head(5)

Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,Favicon,NonStdPort,HTTPSDomainURL,RequestURL,AnchorURL,LinksInScriptTags,ServerFormHandler,InfoEmail,AbnormalURL,WebsiteForwarding,StatusBarCust,DisableRightClick,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,1,1,-1,1,0,-1,-1,1,1,0,1,1,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,1,1,-1,1,0,-1,-1,-1,-1,0,1,1,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,1,1,-1,-1,0,0,-1,1,1,0,1,1,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,1,1,1,1,0,0,-1,1,1,0,-1,1,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,1,1,-1,1,0,0,-1,-1,-1,0,1,1,1,1,1,1,1,-1,1,-1,-1,1


### Eliminamos el identificador

In [411]:
phishing = phishing.drop(columns=["Index"])

In [412]:
X_phishing = phishing.drop(columns=["class"])
y_phishing = phishing["class"]

print(X_phishing.dtypes.value_counts())

int64    30
Name: count, dtype: int64


### Comprobamos nulos

In [413]:
mostrar_nulos(X_phishing)

No hay nulos en el DataFrame


### Describe

In [414]:
X_phishing.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
UsingIP,11054.0,0.313914,0.949495,-1.0,-1.0,1.0,1.0,1.0
LongURL,11054.0,-0.633345,0.765973,-1.0,-1.0,-1.0,-1.0,1.0
ShortURL,11054.0,0.738737,0.674024,-1.0,1.0,1.0,1.0,1.0
Symbol@,11054.0,0.700561,0.713625,-1.0,1.0,1.0,1.0,1.0
Redirecting//,11054.0,0.741632,0.670837,-1.0,1.0,1.0,1.0,1.0
PrefixSuffix-,11054.0,-0.734938,0.678165,-1.0,-1.0,-1.0,-1.0,1.0
SubDomains,11054.0,0.064049,0.817492,-1.0,-1.0,0.0,1.0,1.0
HTTPS,11054.0,0.25104,0.911856,-1.0,-1.0,1.0,1.0,1.0
DomainRegLen,11054.0,-0.336711,0.941651,-1.0,-1.0,-1.0,1.0,1.0
Favicon,11054.0,0.628551,0.777804,-1.0,1.0,1.0,1.0,1.0


## 1.2. Cancer Breast

### Primera vista

In [415]:
cancer.head(5)

Unnamed: 0.1,Unnamed: 0,x.radius_mean,x.texture_mean,x.perimeter_mean,x.area_mean,x.smoothness_mean,x.compactness_mean,x.concavity_mean,x.concave_pts_mean,x.symmetry_mean,x.fractal_dim_mean,x.radius_se,x.texture_se,x.perimeter_se,x.area_se,x.smoothness_se,x.compactness_se,x.concavity_se,x.concave_pts_se,x.symmetry_se,x.fractal_dim_se,x.radius_worst,x.texture_worst,x.perimeter_worst,x.area_worst,x.smoothness_worst,x.compactness_worst,x.concavity_worst,x.concave_pts_worst,x.symmetry_worst,x.fractal_dim_worst,y
0,1,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,0.05766,0.2699,0.7886,2.058,23.56,0.008462,0.0146,0.02387,0.01315,0.0198,0.0023,15.11,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259,B
1,2,13.08,15.71,85.63,520.0,0.1075,0.127,0.04568,0.0311,0.1967,0.06811,0.1852,0.7477,1.383,14.67,0.004097,0.01898,0.01698,0.00649,0.01678,0.002425,14.5,20.49,96.09,630.5,0.1312,0.2776,0.189,0.07283,0.3184,0.08183,B
2,3,9.504,12.44,60.34,273.9,0.1024,0.06492,0.02956,0.02076,0.1815,0.06905,0.2773,0.9768,1.909,15.7,0.009606,0.01432,0.01985,0.01421,0.02027,0.002968,10.23,15.66,65.13,314.9,0.1324,0.1148,0.08867,0.06227,0.245,0.07773,B
3,4,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,0.1467,0.05863,0.1839,2.342,1.17,14.16,0.004352,0.004899,0.01343,0.01164,0.02671,0.001777,13.3,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169,B
4,5,8.196,16.84,51.71,201.9,0.086,0.05943,0.01588,0.005917,0.1769,0.06503,0.1563,0.9567,1.094,8.205,0.008968,0.01646,0.01588,0.005917,0.02574,0.002582,8.964,21.96,57.26,242.2,0.1297,0.1357,0.0688,0.02564,0.3105,0.07409,B


### Eliminamos el identificador

In [416]:
cancer = cancer.drop(columns=["Unnamed: 0"])

### Separamos los datos y comprobamos que las columnas sean numericas

In [417]:
X_cancer = cancer.drop(columns=["y"])
y_cancer = cancer["y"]

print(X_cancer.dtypes.value_counts())

float64    30
Name: count, dtype: int64


### Comprobamos nulos

In [418]:
mostrar_nulos(X_cancer)

No hay nulos en el DataFrame


### Transformamos la variable objetivo a numerico, donde -1 no tendria cancer y 1 si

In [419]:
y_cancer.unique()

array(['B', 'M'], dtype=object)

In [420]:
y_cancer = y_cancer.map({"B": -1, "M": 1})

### Describe

In [421]:
X_cancer.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
x.radius_mean,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.11
x.texture_mean,569.0,19.289649,4.301036,9.71,16.17,18.84,21.8,39.28
x.perimeter_mean,569.0,91.969033,24.298981,43.79,75.17,86.24,104.1,188.5
x.area_mean,569.0,654.889104,351.914129,143.5,420.3,551.1,782.7,2501.0
x.smoothness_mean,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634
x.compactness_mean,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454
x.concavity_mean,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268
x.concave_pts_mean,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012
x.symmetry_mean,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304
x.fractal_dim_mean,569.0,0.062798,0.00706,0.04996,0.0577,0.06154,0.06612,0.09744


## 1.3. Wine

### Primera vista

In [422]:
wine.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4


### Eliminamos columna identificativa

In [423]:
wine = wine.drop(columns=['Id'])

### Separamos los datos y comprobamos que las columnas sean numericas

In [424]:
X_wine = wine.drop(columns=["quality"])
y_wine = wine["quality"]

print(X_wine.dtypes.value_counts())

float64    11
Name: count, dtype: int64


### Comprobamos nulos

In [425]:
mostrar_nulos(X_wine)

No hay nulos en el DataFrame


### Describe

In [426]:
X_wine.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1143.0,8.311111,1.747595,4.6,7.1,7.9,9.1,15.9
volatile acidity,1143.0,0.531339,0.179633,0.12,0.3925,0.52,0.64,1.58
citric acid,1143.0,0.268364,0.196686,0.0,0.09,0.25,0.42,1.0
residual sugar,1143.0,2.532152,1.355917,0.9,1.9,2.2,2.6,15.5
chlorides,1143.0,0.086933,0.047267,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1143.0,15.615486,10.250486,1.0,7.0,13.0,21.0,68.0
total sulfur dioxide,1143.0,45.914698,32.78213,6.0,21.0,37.0,61.0,289.0
density,1143.0,0.99673,0.001925,0.99007,0.99557,0.99668,0.997845,1.00369
pH,1143.0,3.311015,0.156664,2.74,3.205,3.31,3.4,4.01
sulphates,1143.0,0.657708,0.170399,0.33,0.55,0.62,0.73,2.0


## 1.4. Spotify

### Primera vista

In [427]:
spotify.head(5)

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


### Eliminamos columnas identificativas

In [428]:
spotify = spotify.drop(columns=["track_id", "track_name", "track_album_id", "playlist_id"])

In [429]:
len(spotify["track_album_name"].unique())

19744

### Vamos a reemplazar algunas columnas por la media de su popularidad (Target Encoding)


[Ejercicio donde vi este metodo (mio)](https://github.com/Doradux/IABD-Course/blob/main/MIA/encoding_dataset/tecnicas-de-encoding.ipynb)

In [430]:
categorical_columns = ['track_album_name', 'track_artist', 'playlist_genre', 'playlist_subgenre', 'playlist_name']

# Aplicar Target Encoding a cada columna categórica
for col in categorical_columns:
    mean_income_per_category = spotify.groupby(col)['track_popularity'].mean()
    spotify[col] = spotify[col].map(mean_income_per_category)

spotify.head()

Unnamed: 0,track_artist,track_popularity,track_album_name,track_album_release_date,playlist_name,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,74.231884,66,66.0,2019-06-14,59.628571,47.74487,52.079353,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,53.042254,67,67.0,2019-12-13,59.628571,47.74487,52.079353,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,57.178571,70,70.0,2019-07-05,59.628571,47.74487,52.079353,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,57.699187,60,59.0,2019-07-19,59.628571,47.74487,52.079353,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,83.714286,69,69.0,2019-03-05,59.628571,47.74487,52.079353,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


### Vamos a reemplazar las fechas de salida por los anios desde que salio el albun

In [431]:
def parse_date(date):
    if len(date) == 4:
        return date + "-01-01"
    return date

spotify["track_album_release_date"] = spotify["track_album_release_date"].astype(str).apply(parse_date)
spotify["track_album_release_date"] = pd.to_datetime(spotify["track_album_release_date"], format="%Y-%m-%d", errors="coerce")

spotify["album_year"] = spotify["track_album_release_date"].dt.year
spotify["album_month"] = spotify["track_album_release_date"].dt.month
spotify["album_day"] = spotify["track_album_release_date"].dt.day

spotify["album_age"] = datetime.now().year - spotify["album_year"]

spotify = spotify.drop(columns=["track_album_release_date", "album_year", "album_month", "album_day"])

In [432]:
spotify.head(5)

Unnamed: 0,track_artist,track_popularity,track_album_name,playlist_name,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,album_age
0,74.231884,66,66.0,59.628571,47.74487,52.079353,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754,6.0
1,53.042254,67,67.0,59.628571,47.74487,52.079353,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600,6.0
2,57.178571,70,70.0,59.628571,47.74487,52.079353,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616,6.0
3,57.699187,60,59.0,59.628571,47.74487,52.079353,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093,6.0
4,83.714286,69,69.0,59.628571,47.74487,52.079353,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052,6.0


### Mostrar nulos

In [433]:
mostrar_nulos(spotify)

Columnas con valores nulos:
track_artist         5
track_album_name     5
album_age           31
dtype: int64


Al ser tan poquitos vamos a optar por eliminarlos

In [434]:
spotify = spotify.dropna()

### Separamos datos

In [435]:
X_spotify = spotify.drop(columns=["track_popularity"])
y_spotify = spotify["track_popularity"]

print(X_spotify.dtypes.value_counts())

float64    15
int64       3
Name: count, dtype: int64


### Vamos a convertir el escore de 0 a 100 a de 0 a 10 enteros

In [436]:
y_spotify = (y_spotify / 10).astype(int)

### Describe

In [437]:
X_spotify.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
track_artist,32797.0,42.485138,18.567325,0.0,30.4,43.222222,56.0,97.0
track_album_name,32797.0,42.481568,23.005836,0.0,27.0,44.5,60.0,100.0
playlist_name,32797.0,42.479473,15.889844,3.548387,31.714286,41.888889,54.348361,83.186235
playlist_genre,32797.0,42.477093,4.372066,34.833526,41.223532,43.215454,47.026576,47.74487
playlist_subgenre,32797.0,42.478749,8.683793,26.867883,35.510258,42.725142,51.099842,56.825509
danceability,32797.0,0.654963,0.145057,0.0,0.563,0.672,0.761,0.983
energy,32797.0,0.698691,0.18085,0.000175,0.581,0.721,0.84,1.0
key,32797.0,5.37479,3.611741,0.0,2.0,6.0,9.0,11.0
loudness,32797.0,-6.716466,2.987018,-46.448,-8.167,-6.164,-4.644,1.275
mode,32797.0,0.565692,0.495673,0.0,0.0,1.0,1.0,1.0


# 2. SVM

Predice la probabilidad de que una muestra pertenezca a una clase basándose en la independencia de las características.

Imagina que tienes dos grupos de puntos en un papel y quieres dibujar una línea que los separe. SVM busca la mejor línea posible para dividirlos, maximizando la distancia entre la línea y los puntos más cercanos de cada grupo.

- En 2D, es una línea.
- En 3D, es un plano.
- En más dimensiones, se llama hiperplano.

**<p style="color: green;">Ventajas</p>**

- Funciona bien con datos complejos y alta dimensionalidad.
- Puede usar diferentes funciones "kernel" para mejorar la separación.

**<p style="color: crimson;">Desventajas</p>**

- Puede ser lento con grandes volúmenes de datos.
- Sensible al ajuste de hiperparámetros como C y el tipo de kernel.

## 2.1. Phishing

In [438]:
evaluar_modelo(X_phishing, y_phishing, list(modelos.items())[0])

SVM: 0.9512
              precision    recall  f1-score   support

          -1       0.96      0.92      0.94       976
           1       0.94      0.97      0.96      1235

    accuracy                           0.95      2211
   macro avg       0.95      0.95      0.95      2211
weighted avg       0.95      0.95      0.95      2211



## 2.2. Cancer Breast

In [439]:
evaluar_modelo(X_cancer, y_cancer, list(modelos.items())[0])

SVM: 0.9298
              precision    recall  f1-score   support

          -1       0.92      0.97      0.95        71
           1       0.95      0.86      0.90        43

    accuracy                           0.93       114
   macro avg       0.93      0.92      0.92       114
weighted avg       0.93      0.93      0.93       114



## 2.3. Wine

In [440]:
evaluar_modelo(X_wine, y_wine, list(modelos.items())[0])

SVM: 0.5633
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         6
           5       0.73      0.43      0.54        96
           6       0.51      0.88      0.64        99
           7       1.00      0.04      0.07        26
           8       0.00      0.00      0.00         2

    accuracy                           0.56       229
   macro avg       0.45      0.27      0.25       229
weighted avg       0.64      0.56      0.51       229



## 2.4. Spotify

In [441]:
evaluar_modelo(X_spotify, y_spotify, list(modelos.items())[0])

SVM: 0.1812
              precision    recall  f1-score   support

           0       0.20      0.59      0.30      1013
           1       0.00      0.00      0.00       392
           2       0.00      0.00      0.00       439
           3       0.00      0.00      0.00       853
           4       0.28      0.02      0.04       937
           5       0.16      0.56      0.25      1028
           6       0.00      0.00      0.00       932
           7       0.00      0.00      0.00       662
           8       0.00      0.00      0.00       246
           9       0.00      0.00      0.00        58

    accuracy                           0.18      6560
   macro avg       0.06      0.12      0.06      6560
weighted avg       0.10      0.18      0.09      6560



# 3. Regresion Logistica

Predice la probabilidad de que una muestra pertenezca a una clase basándose en una función logística (sigmoide).

Imagina que tienes dos grupos de puntos en un papel y en lugar de dibujar una línea para separarlos, calculas una probabilidad de pertenecer a un grupo u otro.

- Si la probabilidad es mayor a 0.5, lo clasifica en una clase.
- Si es menor a 0.5, lo clasifica en la otra.

**<p style="color: green;">Ventajas</p>**
- Funciona bien en problemas de clasificación binaria

**<p style="color: crimson;">Deventajas</p>**
No funciona bien con datos no lineales

## 3.1. Phishing

In [442]:
evaluar_modelo(X_phishing, y_phishing, list(modelos.items())[1])

Regresion Logistica: 0.9335
              precision    recall  f1-score   support

          -1       0.94      0.91      0.92       976
           1       0.93      0.95      0.94      1235

    accuracy                           0.93      2211
   macro avg       0.93      0.93      0.93      2211
weighted avg       0.93      0.93      0.93      2211



## 3.2. Cancer Breast

In [443]:
evaluar_modelo(X_cancer, y_cancer, list(modelos.items())[1])

Regresion Logistica: 0.9298
              precision    recall  f1-score   support

          -1       0.97      0.92      0.94        71
           1       0.87      0.95      0.91        43

    accuracy                           0.93       114
   macro avg       0.92      0.93      0.93       114
weighted avg       0.93      0.93      0.93       114



## 3.3. Wine

In [444]:
evaluar_modelo(X_wine, y_wine, list(modelos.items())[1])

Regresion Logistica: 0.6070
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         6
           5       0.64      0.74      0.69        96
           6       0.58      0.65      0.61        99
           7       0.50      0.15      0.24        26
           8       0.00      0.00      0.00         2

    accuracy                           0.61       229
   macro avg       0.34      0.31      0.31       229
weighted avg       0.58      0.61      0.58       229



## 3.4. Spotify

In [445]:
evaluar_modelo(X_spotify, y_spotify, list(modelos.items())[1])

Regresion Logistica: 0.3116
              precision    recall  f1-score   support

           0       0.55      0.75      0.64      1013
           1       0.00      0.00      0.00       392
           2       0.00      0.00      0.00       439
           3       0.24      0.21      0.22       853
           4       0.18      0.23      0.20       937
           5       0.26      0.36      0.31      1028
           6       0.28      0.55      0.37       932
           7       0.00      0.00      0.00       662
           8       0.00      0.00      0.00       246
           9       0.00      0.00      0.00        58

    accuracy                           0.31      6560
   macro avg       0.15      0.21      0.17      6560
weighted avg       0.22      0.31      0.26      6560



# 4. Arboles de decision

Es un modelo que clasifica los datos mediante preguntas y respuestas en forma de árbol.  

Imagina que quieres clasificar si un animal es doméstico o salvaje.  
Cada nodo del árbol representa una pregunta y cada rama representa una posible respuesta.  

**<p style="color: green;">Ventajas</p>**

- Fácil de interpretar, maneja datos no lineales.  

**<p style="color: crimson;">Desventajas</p>**

- Puede sobreajustar los datos si es muy profundo.  



## 4.1. Phishing

In [446]:
evaluar_modelo(X_phishing, y_phishing, list(modelos.items())[2])

Arbol de Decision: 0.9597
              precision    recall  f1-score   support

          -1       0.95      0.96      0.95       976
           1       0.97      0.96      0.96      1235

    accuracy                           0.96      2211
   macro avg       0.96      0.96      0.96      2211
weighted avg       0.96      0.96      0.96      2211



## 4.2. Cancer Breast

In [447]:
evaluar_modelo(X_cancer, y_cancer, list(modelos.items())[2])

Arbol de Decision: 0.9298
              precision    recall  f1-score   support

          -1       0.94      0.94      0.94        71
           1       0.91      0.91      0.91        43

    accuracy                           0.93       114
   macro avg       0.93      0.93      0.93       114
weighted avg       0.93      0.93      0.93       114



## 4.3. Wine

In [448]:
evaluar_modelo(X_wine, y_wine, list(modelos.items())[2])

Arbol de Decision: 0.5590
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         0
           4       0.11      0.17      0.13         6
           5       0.64      0.60      0.62        96
           6       0.56      0.53      0.54        99
           7       0.55      0.62      0.58        26
           8       0.33      0.50      0.40         2

    accuracy                           0.56       229
   macro avg       0.37      0.40      0.38       229
weighted avg       0.58      0.56      0.57       229



## 4.4. Spotify

In [449]:
evaluar_modelo(X_spotify, y_spotify, list(modelos.items())[2])

Arbol de Decision: 0.6858
              precision    recall  f1-score   support

           0       0.81      0.82      0.82      1013
           1       0.61      0.63      0.62       392
           2       0.55      0.54      0.55       439
           3       0.66      0.64      0.65       853
           4       0.64      0.63      0.63       937
           5       0.67      0.63      0.65      1028
           6       0.68      0.67      0.68       932
           7       0.70      0.77      0.73       662
           8       0.81      0.85      0.83       246
           9       0.89      0.88      0.89        58

    accuracy                           0.69      6560
   macro avg       0.70      0.71      0.70      6560
weighted avg       0.69      0.69      0.69      6560



# 5. Bosques aleatorios

Es un modelo de aprendizaje automático basado en la combinación de múltiples Árboles de Decisión. En lugar de confiar en un solo árbol, genera varios árboles con diferentes subconjuntos de datos y toma la predicción más votada (en clasificación) o el promedio (en regresión). Esto lo hace más preciso y robusto ante el sobreajuste.

**<p style="color: green;">Ventajas</p>**
- Reduce el sobreajuste en comparación con un solo Árbol de Decisión
- Funciona bien con datos complejos y con muchas variables
- Puede manejar valores faltantes y datos ruidosos

**<p style="color: crimson;">Desventajas</p>**

- Es más lento en entrenamiento y predicción que un solo árbol
- Es menos interpretable debido a la combinación de múltiples modelos

## 5.1. Phishing

In [450]:
evaluar_modelo(X_phishing, y_phishing, list(modelos.items())[3])

Random Forest: 0.9688
              precision    recall  f1-score   support

          -1       0.97      0.96      0.96       976
           1       0.97      0.97      0.97      1235

    accuracy                           0.97      2211
   macro avg       0.97      0.97      0.97      2211
weighted avg       0.97      0.97      0.97      2211



## 5.2. Cancer Breast

In [451]:
evaluar_modelo(X_cancer, y_cancer, list(modelos.items())[3])

Random Forest: 0.9474
              precision    recall  f1-score   support

          -1       0.95      0.97      0.96        71
           1       0.95      0.91      0.93        43

    accuracy                           0.95       114
   macro avg       0.95      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



## 5.3. Wine

In [452]:
evaluar_modelo(X_wine, y_wine, list(modelos.items())[3])

Random Forest: 0.7074
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         6
           5       0.72      0.79      0.75        96
           6       0.68      0.69      0.68        99
           7       0.78      0.69      0.73        26
           8       0.00      0.00      0.00         2

    accuracy                           0.71       229
   macro avg       0.44      0.43      0.43       229
weighted avg       0.68      0.71      0.69       229



## 5.4. Spotify

In [453]:
evaluar_modelo(X_spotify, y_spotify, list(modelos.items())[3])

Random Forest: 0.7729
              precision    recall  f1-score   support

           0       0.88      0.84      0.86      1013
           1       0.75      0.68      0.71       392
           2       0.65      0.67      0.66       439
           3       0.74      0.77      0.75       853
           4       0.70      0.78      0.74       937
           5       0.76      0.77      0.76      1028
           6       0.77      0.77      0.77       932
           7       0.84      0.80      0.82       662
           8       0.92      0.79      0.85       246
           9       0.96      0.88      0.92        58

    accuracy                           0.77      6560
   macro avg       0.80      0.77      0.78      6560
weighted avg       0.78      0.77      0.77      6560



# 6. kNN

Es un algoritmo basado en la proximidad. Para predecir la clase de una nueva muestra, busca los k puntos más cercanos en los datos de entrenamiento y asigna la clase más común entre ellos. No crea un modelo previo, sino que almacena los datos y calcula distancias en el momento de la predicción.

Ejemplo práctico
Si queremos clasificar un correo como spam o no spam, kNN buscaría los k correos más similares en base a palabras clave y asignaría la categoría mayoritaria entre ellos.

**<p style="color: green;">Ventajas</p>**
Fácil de entender e implementar
No requiere entrenamiento, solo almacenamiento de datos
Funciona bien con datos pequeños y bien distribuidos
**<p style="color: crimson;">Desventajas</p>**
Puede ser lento con grandes volúmenes de datos
Sensible a la elección de k y a la escala de las características
No funciona bien con datos de alta dimensión sin preprocesamiento

## 6.1. Phishing

In [454]:
evaluar_modelo(X_phishing, y_phishing, list(modelos.items())[4])

kNN: 0.9412
              precision    recall  f1-score   support

          -1       0.94      0.93      0.93       976
           1       0.94      0.95      0.95      1235

    accuracy                           0.94      2211
   macro avg       0.94      0.94      0.94      2211
weighted avg       0.94      0.94      0.94      2211



## 6.2. Cancer Breast

In [455]:
evaluar_modelo(X_cancer, y_cancer, list(modelos.items())[4])

kNN: 0.9386
              precision    recall  f1-score   support

          -1       0.96      0.94      0.95        71
           1       0.91      0.93      0.92        43

    accuracy                           0.94       114
   macro avg       0.93      0.94      0.93       114
weighted avg       0.94      0.94      0.94       114



## 6.3. Wine

In [456]:
evaluar_modelo(X_wine, y_wine, list(modelos.items())[4])

kNN: 0.5153
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         6
           5       0.53      0.71      0.61        96
           6       0.50      0.43      0.46        99
           7       0.54      0.27      0.36        26
           8       0.00      0.00      0.00         2

    accuracy                           0.52       229
   macro avg       0.31      0.28      0.29       229
weighted avg       0.50      0.52      0.50       229



## 6.4. Spotify

In [457]:
evaluar_modelo(X_spotify, y_spotify, list(modelos.items())[4])

kNN: 0.3128
              precision    recall  f1-score   support

           0       0.42      0.54      0.47      1013
           1       0.17      0.15      0.16       392
           2       0.15      0.15      0.15       439
           3       0.24      0.26      0.25       853
           4       0.23      0.28      0.25       937
           5       0.28      0.27      0.27      1028
           6       0.36      0.28      0.31       932
           7       0.44      0.31      0.36       662
           8       0.58      0.50      0.54       246
           9       0.73      0.78      0.75        58

    accuracy                           0.31      6560
   macro avg       0.36      0.35      0.35      6560
weighted avg       0.32      0.31      0.31      6560



# 7. Conclusiones
## 7.1. SVM
Este modelo consigue buenas metricas en el dataset de Phishing (95% de acierto) y en el de Cancer Breast (94%), reduce mucho su eficacia en el dataset de Wine donde consigue acertar poro mas de la mitad (56%) y fracasa por completo en el dataset de Spotify (18%)

## 7.2. Regresion Logistica
Este modelo sigue siendo bastante bueno con los dos primeros datasets, 93 y 95 de porcentaje de acierto, y predice mejor que SVM los otros dos dataset, conseguimos aumentar alrededor de un 10% en ambos, 61% en Wine y 31% en Spotify

## 7.3. Arboles de decision
Los primeros dos modelos siguen teniendo mas de un 90% de acierto, en Wine vemos como ha disminuido un poco respecto al modelo de Regresion Logistica con un 56% de acierto mientras que para el dataset de Spotify se consigue una gran mejora, un 68% de acierto

## 7.4. Bosques aleatorios
Este modelo es el que mejor funciona para estos datasets en su conjunto de momento, vemos como los dos primeros siguen prediciendo casi al 100% con unos valores de 97% y 94%, y los otros datasets aumentan ambos con un resultado sorprendente para Spotify. En Wine obtiene un 70% de aciertos y Spotify consigue un **77%** que es su mejor valor hasta ahora

## 7.5. kNN
Con este modelo seguimos teniendo buenos resultados para los datasets de phishing y Cancer Breast con un 94% en ambos pero nos encontramos con una peor precision para los otros dos modelos, un 52% de acierto con el dataframe de Wine y un 31% en el de Spotify

## 7.6. Mejor modelo para cada parametro
- Phishin &rarr; kNN (97%)
- Cancer Breast &rarr; Regresion Logistica (95%)
- Wine &rarr; Bosques aleatorios (70%)
- Spotify &rarr; Bosques aleatorios (77%)

# 8. Uso de hiperparametros

Para ello vamos a definir una nueva funcion como hizimos antes para poder aplicar estos hiperparametros y obtener el mejor modelo

In [458]:
def ajustar_y_evaluar_modelo(X, y, modelo_deseado, grid_params, random_search=False):
    nombre = modelo_deseado[0]
    modelo = modelo_deseado[1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    if random_search:
        grid_search = RandomizedSearchCV(modelo, grid_params, n_iter=10, cv=5, random_state=42, n_jobs=-1)
    else:
        grid_search = GridSearchCV(modelo, grid_params, cv=5, n_jobs=-1)
    
    grid_search.fit(X_train, y_train)
    
    modelo_ajustado = grid_search.best_estimator_

    y_pred = modelo_ajustado.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{nombre} con mejores parámetros: {grid_search.best_params_}")
    print(f"Precisión: {acc:.4f}")
    print(classification_report(y_test, y_pred))
    
    return modelo_ajustado


## 8.1. SVM

- C: Valor de penalización  

Este parámetro controla la regularización del modelo. Un valor pequeño permite que el modelo se ajuste más a los datos, pero puede sobreajustarse. Un valor grande evita que el modelo se ajuste a los errores, pero podría ser demasiado rígido.

- kernel: Tipo de kernel 

El kernel define cómo se proyectan los datos en un espacio de mayor dimensión para hacerlos linealmente separables. linear es para datos que ya son separables de forma lineal, mientras que rbf es adecuado para datos no lineales.

"linear" &rarr; Utiliza un hiperplano lineal para separar los datos en SVM. Es adecuado cuando los datos son linealmente separables, es decir, pueden ser divididos por una sola línea (o plano en dimensiones mayores).

"rbf" &rarr Es un tipo de kernel no lineal que transforma los datos a un espacio de mayor dimensión para encontrar un hiperplano que los separe de forma no lineal. Es ideal para datos que no son linealmente separables y es ampliamente utilizado en SVM debido a su capacidad de manejar relaciones complejas entre las características.

- gamma: Parámetro del kernel

El parámetro gamma controla el impacto de un solo punto de entrenamiento en la decisión final del modelo cuando se usa un kernel no lineal, como el RBF (Radial Basis Function) o el sigmoide.

"scale" &rarr; Ajusta el parámetro gamma de acuerdo con el número de características en los datos

"auto" &rarr; útil cuando se quiere un valor simple y no se tiene suficiente conocimiento sobre la varianza de los datos.

"sin valor" &rarr; Si no se especifica, el valor predeterminado de gamma en algunos modelos puede ser 'scale', dependiendo de la implementación del modelo (por ejemplo, en SVM).

In [459]:
grid_params_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}



XsYs = {
  "PHISHING": [X_phishing, y_phishing],
  "CANCER": [X_cancer, y_cancer],
  "WINE": [X_wine, y_wine],
  # "SPOTIFY": [X_spotify, y_spotify]
}

modelos_ajustados_svm = {};

for clave, valor in XsYs.items():
    print(f"================== {clave} ==================")
    modelo_ajustado = ajustar_y_evaluar_modelo(valor[0], valor[1], list(modelos.items())[0], grid_params_svm)
    modelos_ajustados_svm[clave] = modelo_ajustado

SVM con mejores parámetros: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Precisión: 0.9629
              precision    recall  f1-score   support

          -1       0.97      0.95      0.96       976
           1       0.96      0.97      0.97      1235

    accuracy                           0.96      2211
   macro avg       0.96      0.96      0.96      2211
weighted avg       0.96      0.96      0.96      2211

SVM con mejores parámetros: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Precisión: 0.9386
              precision    recall  f1-score   support

          -1       0.96      0.94      0.95        71
           1       0.91      0.93      0.92        43

    accuracy                           0.94       114
   macro avg       0.93      0.94      0.93       114
weighted avg       0.94      0.94      0.94       114

SVM con mejores parámetros: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Precisión: 0.6114
              precision    recall  f1-score   support

           4 

## 8.2. Regresion Logistica

- C: Inversa de la regularización
- solver: Método de optimización

El parámetro solver define el algoritmo utilizado para optimizar la función de pérdida.

"liblinear" &rarr; Es un algoritmo de optimización utilizado para resolver problemas de regresión logística y máquinas de soporte vectorial (SVM). Es eficiente para conjuntos de datos pequeños a medianos y funciona bien con problemas lineales, pero no maneja grandes volúmenes de datos de manera eficiente.

"saga" &rarr; Es un algoritmo más moderno y eficiente para resolver regresión logística y otros problemas de optimización. Es adecuado para grandes conjuntos de datos, especialmente cuando se utilizan regularizaciones como L1 o ElasticNet, y es más rápido en problemas de alta dimensionalidad.

- max_iter: Número máximo de iteraciones

Define el número máximo de iteraciones del algoritmo de optimización antes de detenerse.

In [460]:
XsYs["SPOTIFY"] = [X_spotify, y_spotify]

In [461]:

grid_params_lr = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300]
}

modelos_ajustados_lr = {};

for clave, valor in XsYs.items():
    print(f"================== {clave} ==================")
    modelo_ajustado = ajustar_y_evaluar_modelo(valor[0], valor[1], list(modelos.items())[1], grid_params_lr)
    modelos_ajustados_lr[clave] = modelo_ajustado



Regresion Logistica con mejores parámetros: {'C': 0.1, 'max_iter': 100, 'solver': 'saga'}
Precisión: 0.9349
              precision    recall  f1-score   support

          -1       0.94      0.91      0.92       976
           1       0.93      0.96      0.94      1235

    accuracy                           0.93      2211
   macro avg       0.94      0.93      0.93      2211
weighted avg       0.94      0.93      0.93      2211

Regresion Logistica con mejores parámetros: {'C': 10, 'max_iter': 100, 'solver': 'liblinear'}
Precisión: 0.9561
              precision    recall  f1-score   support

          -1       0.97      0.96      0.96        71
           1       0.93      0.95      0.94        43

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114

Regresion Logistica con mejores parámetros: {'C': 10, 'max_iter': 100, 'solver': 'liblinear'}
Precisión: 0.6332
              

## 8.3. Arboles de decision

- Max Depth

Controla la profundidad máxima de los árboles. Una mayor profundidad puede sobreajustarse, mientras que una menor puede no capturar toda la complejidad de los datos.

- Min Samples Split

El parámetro min_samples_split define el número mínimo de muestras necesarias en un nodo para que se realice una división (split).

- Criterion

El parámetro criterion define la métrica que el modelo usa para medir la calidad de una división en los nodos de los árboles de decisión o bosques aleatorios.  

"gini" &rarr; Utiliza el índice de Gini para medir la impureza de los nodos en el árbol. Busca dividir los datos de manera que se reduzca la probabilidad de clasificación incorrecta.  

"entropy" &rarr; Utiliza la entropía, que mide la incertidumbre o desorden en los nodos. Intenta reducir la entropía en cada división para obtener las mejores particiones de los datos.


In [462]:
grid_params_dt = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 10, 20],
    'criterion': ['gini', 'entropy']
}

modelos_ajustados_dt = {};


for clave, valor in XsYs.items():
    print(f"================== {clave} ==================")
    modelo_ajustado = ajustar_y_evaluar_modelo(valor[0], valor[1], list(modelos.items())[2], grid_params_dt)
    modelos_ajustados_dt[clave] = modelo_ajustado

Arbol de Decision con mejores parámetros: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 2}
Precisión: 0.9607
              precision    recall  f1-score   support

          -1       0.96      0.95      0.96       976
           1       0.96      0.97      0.96      1235

    accuracy                           0.96      2211
   macro avg       0.96      0.96      0.96      2211
weighted avg       0.96      0.96      0.96      2211

Arbol de Decision con mejores parámetros: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 2}
Precisión: 0.9386
              precision    recall  f1-score   support

          -1       0.93      0.97      0.95        71
           1       0.95      0.88      0.92        43

    accuracy                           0.94       114
   macro avg       0.94      0.93      0.93       114
weighted avg       0.94      0.94      0.94       114

Arbol de Decision con mejores parámetros: {'criterion': 'entropy', 'max_depth': None, 'min_s

## 8.4. Bosques aleatorios

- N Estimators

Número de árboles en el bosque. Un mayor número mejora la precisión, pero aumenta el tiempo de entrenamiento.

- Max depth
- Min Samples Split

El parámetro min_samples_split define el número mínimo de muestras necesarias para dividir un nodo en un árbol de decisión o en un bosque aleatorio.

In [463]:
grid_params_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 10, 20]
}

modelos_ajustados_rf = {};

for clave, valor in XsYs.items():
    print(f"================== {clave} ==================")
    modelo_ajustado = ajustar_y_evaluar_modelo(valor[0], valor[1], list(modelos.items())[3], grid_params_rf)
    modelos_ajustados_rf[clave] = modelo_ajustado

Random Forest con mejores parámetros: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
Precisión: 0.9679
              precision    recall  f1-score   support

          -1       0.97      0.96      0.96       976
           1       0.97      0.98      0.97      1235

    accuracy                           0.97      2211
   macro avg       0.97      0.97      0.97      2211
weighted avg       0.97      0.97      0.97      2211

Random Forest con mejores parámetros: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Precisión: 0.9474
              precision    recall  f1-score   support

          -1       0.93      0.99      0.96        71
           1       0.97      0.88      0.93        43

    accuracy                           0.95       114
   macro avg       0.95      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114

Random Forest con mejores parámetros: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 300}
Preci

## 8.5. nKK

- N Neighbors

Número de vecinos que se considerarán para la clasificación. Si es muy pequeño, el modelo podría ser muy sensible a los ruidos. Si es muy grande, podría perder detalles importantes.

- Weights

Controla cómo se ponderan los vecinos al tomar una decisión.  

"uniform" &rarr; Todos los vecinos tienen el mismo peso en la votación, sin importar su distancia.  

"distance" &rarr; Los vecinos más cercanos tienen mayor influencia en la clasificación, reduciendo el impacto de los más lejanos.

- Metric

Define la métrica de distancia usada para encontrar los vecinos más cercanos.  

"euclidean" &rarr; cuando las características tienen la misma escala e importancia, y la distancia real entre puntos es relevante para la clasificación.

"manhattan" &rarr; Si los datos tienen una estructura de cuadrícula (por ejemplo, calles de una ciudad).

In [464]:
grid_params_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

modelos_ajustados_knn = {};


for clave, valor in XsYs.items():
    print(f"================== {clave} ==================")
    modelo_ajustado = ajustar_y_evaluar_modelo(valor[0], valor[1], list(modelos.items())[4], grid_params_knn)
    modelos_ajustados_knn[clave] = modelo_ajustado

kNN con mejores parámetros: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Precisión: 0.9607
              precision    recall  f1-score   support

          -1       0.97      0.94      0.95       976
           1       0.96      0.97      0.97      1235

    accuracy                           0.96      2211
   macro avg       0.96      0.96      0.96      2211
weighted avg       0.96      0.96      0.96      2211

kNN con mejores parámetros: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'uniform'}
Precisión: 0.9474
              precision    recall  f1-score   support

          -1       0.96      0.96      0.96        71
           1       0.93      0.93      0.93        43

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114

kNN con mejores parámetros: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
Precisión: 0.6245
              

# 9. Conclusiones (hiperparametros)

## 9.1. SVM
Phishing mejora su precision un 1%  
Cancer Breast reduce su precion un 2%  
Wine aumenta su precision considerablemente, pasa de un 56% a un 74%

Cada modelo usa unos hiperparametros distintos para conseguir su mejor version, siendo estos:

C: [10, 10, 1] (respectivamente)  
kernel: [rbf, linear, linear]  
gamma: scale en todos

## 9.2. Regresion Logistica
Phishing se mantiene igual que sin parametrizar  
Cancer Breast reduce su precision un 1%  
Wine aumenta su precision considerablemente, pasa de un 61% a un 63%  
Y Spotify consigue una pequena mejora, pasa de un 31% a un 39% de precision, pero sigue siendo un mal modelo

C: [0.1, 10, 10, 0.1]  
Max iter: 100 los 4  
Solver: [saga, liblinear, liblinear, liblinear]

## 9.3. Arboles de decision
Phishing se mantiene igual que sin parametrizar  
Cancer Breast disminuye su precion un 1%  
Wine se mantiene igual que sin parametrizar  
Y Spotify consigue una mejora, pasa de un 69% a un 74% de precision

Criterion: [entropy, gini, entropy, gini]  
Max depth: [None, 10, 20, 10]  
Min samples split: [2, 20, 2, 2]

## 9.4. Bosques aleatorios
Phishing se mantiene igual que sin parametrizar  
Cancer Breast aumenta su precision un 2%  
Wine aumenta su precion un 1%  
Y Spotify aumenta su precision un 1%

Max depth: [None, 10, None, None]  
Min samples split: 2 en todos los casos  
N estimators: [200, 200, 200, 300]  

## 9.5. kNN
Phishing aumenta un 2%  
Cancer Breast aumenta su precision un 1%  
Wine y Spotify consiguen mejoras, Wine de 52% &rarr; 62% y Spotify de 31% &rarr; 44%

Metric: manhattan en todos los casos  
N neighbors: [3, 7, 7, 3]  
Weights: [distance, uniform, distance, distance]  

## 9.6. Mejor modelo para cada parametro

    [PARAMETRIZADO] // [SIN PARAMETRIZAR]

- Phishin &rarr; Bosques aleatorios (97%) // (kNN (97%))
- Cancer Breast &rarr; Bosques aleatorios  y Regresion Logistica (96%) // (Regresion Logistica (95%))
- Wine &rarr; Bosques aleatorios (71%) // (Bosques aleatorios (70%))
- Spotify &rarr; Bosques aleatorios (78%) // (Bosques aleatorios (77%))

# 10. Uso de hiperparametros, mostrando 2 metricas nuevas

## 10.1. Redefinicion de la funcion para obtener 2 nuevas metricas

Modificamos el codigo para evaluar en este caso un modelo con los hiperparametros fijos (los mejores del modelo que se mostraron arriba) y anadimos como metricas visibles:
- Matriz de confusion: muestra una matriz donde se muestran los falsos y verdaderos positivos y negativos, los vp se encontrarian de manera horizontal de la parte superior izquierda a la inferior derecha
- F1: Mide el equilibrio entre la precisión y el recall (sensibilidad) de un modelo.

In [465]:
def ajustar_y_evaluar_modelo2(X, y, modelo_deseado, grid_params, random_search=False):
    nombre = modelo_deseado[0]
    modelo = modelo_deseado[1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Si no es necesario hacer búsqueda de hiperparámetros (es decir, solo hay un valor en grid_params)
    if len(grid_params) == 1 and all(len(v) == 1 for v in grid_params.values()):
        modelo.set_params(**grid_params[nombre])
        modelo.fit(X_train, y_train)
    else:
        if random_search:
            grid_search = RandomizedSearchCV(modelo, grid_params, n_iter=10, cv=5, random_state=42, n_jobs=-1)
        else:
            grid_search = GridSearchCV(modelo, grid_params, cv=5, n_jobs=-1)
        
        grid_search.fit(X_train, y_train)
        modelo = grid_search.best_estimator_

    y_pred = modelo.predict(X_test)

    # Calcular F1-score (promedio ponderado)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Calcular Matriz de Confusión
    confusion = confusion_matrix(y_test, y_pred)

    print(f"F1: {f1:.4f}")
    print("Matriz de Confusión:")
    print(confusion)
    print(classification_report(y_test, y_pred))

    return modelo


## 10.2. SVM

In [466]:
print(list(modelos.items())[4][0])

kNN


In [467]:
grid_params = {
    "SVM":
        {
            'C': [10],
            'kernel': ["linear"],
            'gamma': ["scale"]
        },
    "Regresion Logistica":
        {
            'C': [10],
            'max_iter': [100],
            'solver': ["liblinear"]
        },
    "Arbol de Decision":
        {
            'criterion': ["entropy"],
            'max_depth': [None],
            'min_samples_split': [2]
        },
    "Random Forest":
        {
            'n_estimators': [200],
            'max_depth': [None],
            'min_samples_split': [2]
        },
    "kNN": 
        {
            "n_neighbors": [7],
            "weights": ["distance"],
            "metric": ["manhattan"]
        }
}

for i in range(4):
    model_name = list(modelos.items())[i][0]
    print(f"========================= {model_name} =========================")
    
    ajustar_y_evaluar_modelo2(X_wine, y_wine, list(modelos.items())[i], grid_params[model_name])


F1: 0.5747
Matriz de Confusión:
[[ 0  3  3  0  0]
 [ 0 75 21  0  0]
 [ 0 31 68  0  0]
 [ 0  2 24  0  0]
 [ 0  0  2  0  0]]
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         6
           5       0.68      0.78      0.72        96
           6       0.58      0.69      0.63        99
           7       0.00      0.00      0.00        26
           8       0.00      0.00      0.00         2

    accuracy                           0.62       229
   macro avg       0.25      0.29      0.27       229
weighted avg       0.53      0.62      0.57       229

F1: 0.6098
Matriz de Confusión:
[[ 0  3  3  0  0]
 [ 0 75 19  2  0]
 [ 0 32 64  3  0]
 [ 0  2 18  6  0]
 [ 0  0  0  2  0]]
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         6
           5       0.67      0.78      0.72        96
           6       0.62      0.65      0.63        99
           7       0.46      0.23      0.31     

Los F1 de los modelos con este df varian entre 0.57 y 0.67, esto significa que los modelos para este dataset tienen un balance moderado entre precision y recall, y que no funciona demasiado bien que digamos

En la matriz de confusion vemos como la diagonal de vp esta "mas inflada" que el el resto de la matriz, debido a que tiene mas aciertos que fallos

# 11. Registros inventados por DataFrame

## 11.1. Generar datos ficticios

In [468]:
data = {
    'UsingIP': [0, 1, 2, 1, 1, 0, 2, 0, 1, 1],
    'LongURL': [1, 1, 0, 1, 1, 0, 1, 0, 1, 1],
    'ShortURL': [1, 0, 1, 0, 1, 1, 0, 1, 1, 0],
    'Symbol@': [1, 1, 0, 1, 0, 1, 1, 0, 0, 1],
    'Redirecting//': [1, 0, 1, 1, 0, 1, 0, 1, 0, 1],
    'PrefixSuffix-': [1, -1, 1, 0, 1, 1, 0, -1, 1, 0],
    'SubDomains': [1, 0, -1, 1, 1, 1, 0, -1, 1, 0],
    'HTTPS': [0, 1, 0, 1, 1, 0, 1, 0, 1, 1],
    'DomainRegLen': [1, 0, 1, 1, 1, 1, 0, 0, 1, 1],
    'Favicon': [0, 1, 1, 1, 0, 1, 1, 1, 1, 0],
    'NonStdPort': [1, 0, 0, 1, 0, 1, 0, 1, 0, 1],
    'HTTPSDomainURL': [0, 1, 0, 1, 1, 0, 1, 1, 0, 0],
    'RequestURL': [1, 0, 1, 1, 0, 1, 0, 1, 1, 0],
    'AnchorURL': [1, 1, 0, 1, 1, 0, 0, 1, 0, 1],
    'LinksInScriptTags': [0, 1, 1, 0, 1, 0, 1, 1, 0, 1],
    'ServerFormHandler': [1, 0, 0, 1, 0, 1, 0, 1, 0, 1],
    'InfoEmail': [0, 1, 1, 0, 1, 0, 1, 0, 1, 0],
    'AbnormalURL': [1, 1, 0, 1, 0, 0, 1, 0, 1, 1],
    'WebsiteForwarding': [0, 1, 1, 0, 1, 1, 0, 1, 0, 1],
    'StatusBarCust': [1, 0, 1, 0, 0, 1, 1, 0, 1, 1],
    'DisableRightClick': [0, 1, 0, 1, 0, 1, 1, 0, 1, 0],
    'UsingPopupWindow': [1, 0, 1, 0, 1, 0, 0, 1, 0, 1],
    'IframeRedirection': [0, 1, 0, 0, 1, 1, 1, 0, 0, 1],
    'AgeofDomain': [1, 0, 1, 0, 1, 0, 0, 1, 1, 0],
    'DNSRecording': [0, 1, 1, 0, 1, 1, 0, 1, 0, 1],
    'WebsiteTraffic': [1, 0, 0, 1, 1, 0, 1, 0, 1, 0],
    'PageRank': [1, 0, 1, 1, 0, 1, 0, 1, 1, 0],
    'GoogleIndex': [0, 1, 0, 1, 0, 1, 1, 1, 0, 1],
    'LinksPointingToPage': [1, 0, 1, 0, 1, 0, 1, 1, 1, 0],
    'StatsReport': [0, 1, 1, 0, 1, 0, 1, 0, 1, 1],
}
y_phishing_inventado = pd.DataFrame(data)

data = {
    "x.radius_mean": [12.11, 12.20, 7.91, 12.94, 11.43, 13.64, 13.48, 12.50, 9.05, 11.78],
    "x.texture_mean": [15.60, 12.16, 16.43, 10.47, 18.82, 18.81, 11.95, 16.07, 16.10, 18.15],
    "x.perimeter_mean": [75.44, 68.48, 67.20, 68.54, 84.38, 52.18, 96.62, 50.71, 82.36, 77.33],
    "x.area_mean": [394.57, 337.60, 378.25, 443.46, 567.73, 216.31, 509.38, 422.89, 224.05, 400.79],
    "x.smoothness_mean": [0.11, 0.14, 0.13, 0.10, 0.10, 0.15, 0.15, 0.09, 0.14, 0.13],
    "x.compactness_mean": [0.15, 0.10, 0.08, 0.07, 0.13, 0.07, 0.14, 0.13, 0.09, 0.12],
    "x.concavity_mean": [0.04, 0.03, 0.09, 0.04, 0.04, 0.02, 0.06, 0.09, 0.03, 0.08],
    "x.concave_pts_mean": [0.02, 0.03, 0.04, 0.03, 0.03, 0.04, 0.04, 0.05, 0.02, 0.03],
    "x.symmetry_mean": [0.15, 0.14, 0.14, 0.13, 0.19, 0.13, 0.11, 0.17, 0.19, 0.18],
    "x.fractal_dim_mean": [0.05, 0.10, 0.09, 0.09, 0.08, 0.06, 0.09, 0.07, 0.07, 0.07],
    "x.radius_se": [0.18, 0.19, 0.17, 0.15, 0.22, 0.14, 0.21, 0.19, 0.16, 0.18],
    "x.texture_se": [0.65, 0.61, 0.71, 0.49, 0.78, 0.69, 0.65, 0.59, 0.61, 0.57],
    "x.perimeter_se": [2.15, 1.87, 2.40, 2.12, 2.33, 1.56, 2.67, 1.96, 2.08, 2.43],
    "x.area_se": [27.23, 25.56, 32.68, 24.45, 33.90, 20.48, 35.27, 22.66, 28.88, 26.35],
    "x.smoothness_se": [0.006, 0.008, 0.005, 0.004, 0.006, 0.008, 0.009, 0.003, 0.007, 0.007],
    "x.compactness_se": [0.008, 0.005, 0.006, 0.004, 0.007, 0.007, 0.007, 0.006, 0.006, 0.007],
    "x.concavity_se": [0.02, 0.01, 0.03, 0.02, 0.02, 0.01, 0.03, 0.03, 0.02, 0.03],
    "x.concave_pts_se": [0.01, 0.02, 0.02, 0.01, 0.02, 0.01, 0.02, 0.03, 0.02, 0.02],
    "x.symmetry_se": [0.02, 0.01, 0.02, 0.01, 0.02, 0.02, 0.02, 0.03, 0.01, 0.02],
    "x.fractal_dim_se": [0.02, 0.03, 0.02, 0.01, 0.02, 0.01, 0.03, 0.02, 0.02, 0.02],
    "x.radius_worst": [16.99, 15.88, 11.23, 16.15, 18.43, 21.01, 19.85, 15.92, 16.47, 17.33],
    "x.texture_worst": [22.30, 21.54, 23.70, 21.52, 22.83, 23.30, 22.02, 22.98, 21.70, 22.43],
    "x.perimeter_worst": [114.78, 109.60, 105.62, 103.91, 122.47, 119.28, 121.17, 114.94, 118.22, 116.89],
    "x.area_worst": [1400.23, 1322.43, 1303.95, 1354.87, 1512.25, 1600.41, 1458.39, 1372.98, 1420.50, 1390.63],
    "x.smoothness_worst": [0.12, 0.14, 0.11, 0.13, 0.13, 0.12, 0.13, 0.14, 0.14, 0.13],
    "x.compactness_worst": [0.16, 0.14, 0.13, 0.15, 0.16, 0.13, 0.15, 0.14, 0.12, 0.14],
    "x.concavity_worst": [0.06, 0.04, 0.08, 0.06, 0.07, 0.04, 0.08, 0.09, 0.07, 0.07],
    "x.concave_pts_worst": [0.04, 0.04, 0.05, 0.03, 0.05, 0.04, 0.05, 0.05, 0.04, 0.04],
    "x.symmetry_worst": [0.20, 0.18, 0.22, 0.19, 0.21, 0.18, 0.19, 0.19, 0.18, 0.20],
    "x.fractal_dim_worst": [0.08, 0.07, 0.09, 0.08, 0.07, 0.06, 0.08, 0.07, 0.08, 0.07],
}
y_cancer_inventado = pd.DataFrame(data)

data = {
    "fixed acidity": [7.4, 7.8, 7.8, 11.2, 7.4],
    "volatile acidity": [0.700, 0.880, 0.760, 0.280, 0.700],
    "citric acid": [0.00, 0.00, 0.04, 0.56, 0.00],
    "residual sugar": [1.9, 2.6, 2.3, 1.9, 1.9],
    "chlorides": [0.076, 0.098, 0.092, 0.075, 0.076],
    "free sulfur dioxide": [11.0, 25.0, 15.0, 17.0, 11.0],
    "total sulfur dioxide": [34.0, 67.0, 54.0, 60.0, 34.0],
    "density": [0.99780, 0.99680, 0.99700, 0.99800, 0.99780],
    "pH": [3.51, 3.20, 3.26, 3.16, 3.51],
    "sulphates": [0.56, 0.68, 0.65, 0.58, 0.56],
    "alcohol": [9.4, 9.8, 9.8, 9.8, 9.4],
}
y_wine_inventado = pd.DataFrame(data)

data = {
    "track_artist": [74.231884, 53.042254, 57.178571, 57.699187, 83.714286],
    "track_album_name": [66.0, 67.0, 70.0, 59.0, 69.0],
    "playlist_name": [59.628571, 59.628571, 59.628571, 59.628571, 59.628571],
    "playlist_genre": [47.744870, 47.744870, 47.744870, 47.744870, 47.744870],
    "playlist_subgenre": [52.079353, 52.079353, 52.079353, 52.079353, 52.079353],
    "danceability": [0.748, 0.726, 0.675, 0.718, 0.650],
    "energy": [0.916, 0.815, 0.931, 0.930, 0.833],
    "key": [6, 11, 1, 7, 1],
    "loudness": [-2.634, -4.969, -3.432, -3.778, -4.672],
    "mode": [1, 1, 0, 1, 1],
    "speechiness": [0.0583, 0.0373, 0.0742, 0.1020, 0.0359],
    "acousticness": [0.102000, 0.072400, 0.079400, 0.028700, 0.080300],
    "instrumentalness": [0.000000, 0.004210, 0.000023, 0.000009, 0.000000],
    "liveness": [0.0653, 0.3570, 0.1100, 0.2040, 0.0833],
    "valence": [0.5180, 0.6930, 0.6130, 0.2770, 0.7250],
    "tempo": [122.036, 99.972, 124.008, 121.956, 123.976],
    "duration_ms": [194754, 162600, 176616, 169093, 189052],
    "album_age": [6.0, 6.0, 6.0, 6.0, 6.0]
}
y_spotify_inventado = pd.DataFrame(data)


In [469]:
y_inventadas = {
    "PHISHING": y_phishing_inventado,
    "CANCER": y_cancer_inventado,
    "WINE": y_wine_inventado,
    "SPOTIFY": y_spotify_inventado,
}
print(modelos_ajustados_svm)

{'PHISHING': SVC(C=10), 'CANCER': SVC(C=10, kernel='linear'), 'WINE': SVC(C=1, kernel='linear')}


## 11.1. SVM

In [470]:
for t, modelo in modelos_ajustados_svm.items():
    print(f"====================== {t} ======================")
    print(modelo.predict(y_inventadas[t]))

[ 1 -1  1  1  1  1  1  1  1  1]
[-1 -1  1 -1  1 -1  1 -1  1 -1]
[5 5 5 5 5]


## 11.2. Regresion logistica

In [471]:
for t, modelo in modelos_ajustados_lr.items():
    print(f"====================== {t} ======================")
    print(modelo.predict(y_inventadas[t]))

[1 1 1 1 1 1 1 1 1 1]
[ 1  1  1 -1  1 -1  1 -1  1  1]
[5 5 5 5 5]
[6 6 6 6 6]


## 11.3. Arboles de decision

In [472]:
for t, modelo in modelos_ajustados_dt.items():
    print(f"====================== {t} ======================")
    print(modelo.predict(y_inventadas[t]))

[ 1  1  1 -1  1  1  1 -1  1  1]
[-1  1 -1 -1 -1 -1 -1  1 -1 -1]
[5 5 5 5 5]
[6 6 7 5 6]


## 11.4. Bosques aleatorios

In [473]:
for t, modelo in modelos_ajustados_rf.items():
    print(f"====================== {t} ======================")
    print(modelo.predict(y_inventadas[t]))

[ 1  1  1  1  1  1  1 -1  1  1]
[-1 -1 -1 -1  1 -1 -1 -1 -1 -1]
[5 5 5 5 5]
[6 6 7 6 6]


## 11.5. kNN

In [474]:
for t, modelo in modelos_ajustados_knn.items():
    print(f"====================== {t} ======================")
    print(modelo.predict(y_inventadas[t]))

[1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1]
[5 5 5 6 5]
[6 6 7 6 6]
