# Diplomatura en ciencia de datos, aprendizaje automático y sus aplicaciones - Edición 2023 - FAMAF (UNC)

## Aprendizaje no supervisado

### Trabajo práctico entregable - Grupo 22 - FIFA female players 2023 - Parte 2: implementación de modelos de ML

**Integrantes:**
- Chevallier-Boutell, Ignacio José
- Ribetto, Federico Daniel
- Rosa, Santiago
- Spano, Marcelo

**Seguimiento:** Meinardi, Vanesa

---

## Librerías

Inicializamos el entorno.

In [None]:
!pip install threadpoolctl -U
!pip install plotly

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go

from sklearn import decomposition, preprocessing
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture

# Let pandas display up to 150 columns/rows before truncating the output.
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)
# Set style and context in a single call: set_theme() resets the plotting
# context to its default unless it is passed explicitly, so calling it after
# set_context('talk') silently discarded the 'talk' context.
sns.set_theme(context='talk', style='white')

## Lectura del dataset

Cargamos el conjunto de datos procesado previamente.

In [None]:
# Location of the previously processed dataset (CSV file).
path = 'fifa2023.csv'
# Read it into the DataFrame that the following cells work on.
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# Columns used as clustering features.
vars_mod = [
    'crossing', 'finishing', 'heading', 'short_passing', 'volleys',
    'marking', 'standing_tackle', 'sliding_tackle', 'acceleration',
    'sprint', 'agility', 'balance', 'shot_power', 'stamina',
    'long_shots', 'dribbling', 'curve', 'fk_acc', 'long_passing',
    'ball_control', 'aggression', 'interceptions', 'positioning',
    'vision', 'penalties', 'composure',
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
    'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
    'ldm', 'cdm', 'rdm', 'lwb', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb',
]

# As mentioned previously, goalkeepers ('arq') are excluded from the
# clustering; keep only the feature columns of the remaining players.
is_not_goalkeeper = df['pos_gral'] != 'arq'
df_mod = df.loc[is_not_goalkeeper, vars_mod].copy()
df_mod.shape

In [None]:
# Preview the first rows of the feature table used for clustering.
df_mod.head()

# DBSCAN

In [None]:
# Run DBSCAN directly on the feature table (no scaling is applied here).
db = DBSCAN(eps=70, min_samples=100)
db.fit(df_mod)
# Cluster label assigned to each row of df_mod.
labels = db.labels_

In [None]:
# Count noise points (label -1) and the number of distinct clusters,
# leaving the noise label out of the cluster count when it is present.
label_list = list(labels)
n_noise_ = label_list.count(-1)
n_clusters_ = len(set(label_list))
if n_noise_ > 0:
    n_clusters_ -= 1

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

In [None]:
# Features shown on the x and y axes of the cluster scatter plot.
skill_1, skill_2 = 'ball_control', 'acceleration'

In [None]:
# Scatter of two skills, one marker per clustered player, coloured by the
# DBSCAN label.
clusters = go.Scatter(
    x=df_mod[skill_1],
    y=df_mod[skill_2],
    mode='markers',
    # Hover names must come from the same rows as df_mod: df still contains
    # the goalkeepers, so passing its whole 'name' column shifted the names
    # relative to the plotted points.
    text=df.loc[df_mod.index, 'name'],
    marker=dict(
        size=5,
        # np.float was removed in NumPy 1.24; the builtin float is equivalent.
        color=labels.astype(float),  # set color equal to a variable
        colorscale='Portland',
        showscale=False,
    ),
)

data = [clusters]

# The deprecated `titlefont` attribute is replaced by the nested title.font.
layout = go.Layout(
    title=dict(text="Clustering DBScan", font=dict(size=20)),
    xaxis=dict(title=dict(text=skill_1)),
    yaxis=dict(title=dict(text=skill_2)),
    autosize=False,
    width=1000,
    height=650,
)

fig = go.Figure(data=data, layout=layout)

In [None]:
# Render the Plotly figure held in `fig`.
fig.show()

# GMM

### Probamos con 3 componentes

In [None]:
# Model the dataset as a mixture of 3 Gaussian distributions.
# A fixed random_state makes the fit (and therefore the cluster labels)
# reproducible between runs, matching the seed used by the other GMM fits
# in this notebook.
gmm = GaussianMixture(n_components=3, random_state=42)
gmm.fit(df_mod)

# Assign a label to each sample.
labels = gmm.predict(df_mod)

In [None]:
# Same summary as for DBSCAN: number of distinct labels (not counting the
# noise label -1 if it appears) and number of samples labelled -1.
label_list = list(labels)
n_noise_ = label_list.count(-1)
n_clusters_ = len(set(label_list))
if n_noise_ > 0:
    n_clusters_ -= 1

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

In [None]:
# Features shown on the x and y axes of the cluster scatter plot.
skill_1, skill_2 = 'ball_control', 'acceleration'

In [None]:
# Scatter of two skills, one marker per clustered player, coloured by the
# GMM component each player was assigned to.
clusters = go.Scatter(
    x=df_mod[skill_1],
    y=df_mod[skill_2],
    mode='markers',
    # Hover names must come from the same rows as df_mod: df still contains
    # the goalkeepers, so passing its whole 'name' column shifted the names
    # relative to the plotted points.
    text=df.loc[df_mod.index, 'name'],
    marker=dict(
        size=5,
        # np.float was removed in NumPy 1.24; the builtin float is equivalent.
        color=labels.astype(float),  # set color equal to a variable
        colorscale='Portland',
        showscale=False,
    ),
)

data = [clusters]

# The title previously read "Clustering DBScan" (copied from the DBSCAN
# cell) although this figure shows the GMM result. The deprecated
# `titlefont` attribute is replaced by the nested title.font.
layout = go.Layout(
    title=dict(text="Clustering GMM", font=dict(size=20)),
    xaxis=dict(title=dict(text=skill_1)),
    yaxis=dict(title=dict(text=skill_2)),
    autosize=False,
    width=1000,
    height=650,
)

fig = go.Figure(data=data, layout=layout)

In [None]:
# Render the Plotly figure held in `fig`.
fig.show()

### Vemos el porcentaje de jugadoras en cada cluster para diferentes cantidades de componentes entre 2 y 10

In [None]:
# For each candidate number of components (2..10), fit a GMM and record the
# fraction of players in each cluster. The unused `sils` / `sils_err`
# lists were dropped: nothing ever appended to or read from them.
value_counts = []
n_clusters = np.arange(2, 11)
for n in n_clusters:
    gmm = GaussianMixture(n_components=n, n_init=2, random_state=42).fit(df_mod)
    labels = gmm.predict(df_mod)
    # value_counts() orders the shares from largest to smallest; the index is
    # dropped, so positions are ranks by size rather than cluster ids.
    value_counts.append(
        pd.DataFrame(labels).value_counts(normalize=True).reset_index(drop=True)
    )

In [None]:
# Stack the per-fit share Series into a table with one row per fit.
df_clusters = pd.DataFrame(value_counts)
df_clusters = df_clusters.reset_index(drop=True)
# Prepend the number of components each row corresponds to (2..10).
df_clusters.insert(0, 'n_components', np.arange(2, 11))
df_clusters


Seleccionamos n=5 ya que con más componentes se encuentran clusters con menos del 10% de las jugadoras. Además, con 5 clusters no hay mucha variación en el porcentaje de jugadoras por cluster.

In [None]:
# Final model: 5 components, same n_init and seed as in the sweep.
n = 5
gmm = GaussianMixture(n_components=n, n_init=2, random_state=42)
gmm.fit(df_mod)
# Component assigned to each row of df_mod.
labels = gmm.predict(df_mod)

In [None]:
# Full records of the non-goalkeeper players (same row filter as df_mod),
# with the GMM cluster label attached as a new 'cluster' column.
not_goalkeeper = df['pos_gral'] != 'arq'
df_res = df[not_goalkeeper].assign(cluster=labels)

In [None]:
# Fraction of players assigned to each cluster.
df_res['cluster'].value_counts(normalize=True)

In [None]:
# Mean of every clustering feature within each cluster.
df_res.groupby('cluster')[vars_mod].mean()

In [None]:
# Distribution of the 'overall' rating within each cluster.
sns.boxplot(data=df_res, x='cluster', y='overall')
plt.show()

In [None]:
# One figure per clustering feature: its distribution within each cluster.
for feature in vars_mod:
    sns.boxplot(data=df_res, x='cluster', y=feature)
    plt.show()

In [None]:
# For every cluster, print the share of each playing position inside it.
# The cluster ids are read from the data instead of a hard-coded range(5),
# and a dedicated loop variable is used so the notebook-level `n` (the
# chosen number of components) is not overwritten.
for cluster_id in sorted(df_res['cluster'].unique()):
    print(cluster_id)
    df_c = df_res[df_res['cluster'] == cluster_id]
    print(df_c['position'].value_counts(normalize=True))

In [None]:
# For every playing position, print how its players are split across clusters.
for pos in df_res['position'].unique():
    print(pos)
    df_p = df_res.loc[df_res['position'] == pos]
    print(df_p['cluster'].value_counts(normalize=True))

In [None]:
# Scatter of two skills, one marker per clustered player, coloured by the
# cluster assigned by the final GMM.
clusters = go.Scatter(
    x=df_mod[skill_1],
    y=df_mod[skill_2],
    mode='markers',
    # Hover names must come from the same rows as df_mod: df still contains
    # the goalkeepers, so passing its whole 'name' column shifted the names
    # relative to the plotted points.
    text=df.loc[df_mod.index, 'name'],
    marker=dict(
        size=5,
        # np.float was removed in NumPy 1.24; the builtin float is equivalent.
        color=labels.astype(float),  # set color equal to a variable
        colorscale='Portland',
        showscale=False,
    ),
)

data = [clusters]

# The deprecated `titlefont` attribute is replaced by the nested title.font.
layout = go.Layout(
    title=dict(text="Clustering GMM", font=dict(size=20)),
    xaxis=dict(title=dict(text=skill_1)),
    yaxis=dict(title=dict(text=skill_2)),
    autosize=False,
    width=1000,
    height=650,
)

fig = go.Figure(data=data, layout=layout)
fig.show()

# PCA

In [None]:
# Fit a StandardScaler on the clustering features, then apply it to them.
std_scale = preprocessing.StandardScaler()
std_scale.fit(df_mod)
X_scaled = std_scale.transform(df_mod)

In [None]:
# PCA with 3 components on the standardized features. Per the original note,
# PCA centers its input but does not scale each feature before the SVD.
pca = decomposition.PCA(n_components=3)
pca.fit(X_scaled)

# Project the standardized data onto the fitted components (numpy array).
X_projected = pca.transform(X_scaled)

# Explained-variance ratio per component, then its running total.
var_ratio = pca.explained_variance_ratio_
print('proporción de varianza por componente: ', var_ratio)
print('proporción de varianza por componente acumulada: ', np.cumsum(var_ratio))

print('tamaño de los datos: ', X_projected.shape)

In [None]:
pcs=pca.components_  # "composition" of the components: per-feature weights of each one

In [None]:
# Draw every feature as a segment from the origin to its weights on the
# first two principal components, labelled with the feature name.
data = []
for feature, x, y in zip(df_mod.columns, pcs[0, :], pcs[1, :]):
    graph = go.Scatter(
        x=[0, x],
        y=[0, y],
        text=feature,
        mode='lines+markers+text',
        textposition='top left',
        textfont=dict(family='sans serif', size=15),
    )
    data.append(graph)

# The deprecated `titlefont` attribute is replaced by the nested title.font.
layout = go.Layout(
    title=dict(text="ACP - Fifa Skills", font=dict(size=20)),
    xaxis=dict(title=dict(text='Componente 1')),
    yaxis=dict(title=dict(text='Componente 2')),
    autosize=False,
    width=1050,
    height=750,
    showlegend=False,
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Wrap the projected array in a DataFrame; columns are the integer
# component positions (0, 1, 2) and the index is a fresh default range.
df_projected = pd.DataFrame(X_projected)

In [None]:
# df_projected has a fresh 0..n-1 index, while df still contains the
# goalkeeper rows under its own index. Assigning df's columns directly
# aligns on index labels and pairs projections with the wrong players, so
# select the clustered rows first and assign their values positionally.
players = df.loc[df_mod.index]
df_projected['position'] = players['position'].to_numpy()
# 'crack' flag: 0 when overall < 80, otherwise 1 (same rule as before).
df_projected['crack'] = np.where(players['overall'] < 80, 0, 1)
df_projected['cluster'] = labels

In [None]:
# First two principal components, coloured by GMM cluster.
ax = plt.figure(figsize=(20, 10)).gca()
sns.scatterplot(data=df_projected, x=0, y=1, hue='cluster', ax=ax)
ax.grid()
plt.show()

In [None]:
# First two principal components, coloured by the 'crack' flag.
ax = plt.figure(figsize=(20, 10)).gca()
sns.scatterplot(data=df_projected, x=0, y=1, hue='crack', ax=ax)
ax.grid()
plt.show()