# Estudi Gaussian Mixture amb Modificacions

## Utilització GridSearch

In [None]:
import pandas as pd

In [None]:
# Load the dataset and discard the identifier column.
df_students = pd.read_csv("datasets/Data Carrard.csv").drop(columns='id')

# Columns handled as categorical in this section (stud_h included).
df_categorical = df_students[[
    'sex', 'year', 'glang', 'part', 'job', 'stud_h', 'health', 'psyt',
]]
# Columns handled as numerical; these feed the scaling/PCA step.
df_numerical = df_students[[
    'age', 'jspe', 'qcae_cog', 'qcae_aff', 'amsp', 'erec_mean',
    'cesd', 'stai_t', 'mbi_ex', 'mbi_cy', 'mbi_ea',
]]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise the numerical columns before running PCA.
scaler = StandardScaler()
df_num_scaled = scaler.fit(df_numerical).transform(df_numerical)

# A fractional n_components asks PCA to keep as many components as are
# needed to explain that share (95%) of the variance.
pca = PCA(n_components=0.95)

# Fit on the scaled samples, then project those same samples.
pca_features = pca.fit(df_num_scaled).transform(df_num_scaled)

### Busquem el millor model

In [None]:
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
from sklearn.model_selection import GridSearchCV

def gmm_bic_score(estimator, X):
    """Scorer for GridSearchCV: the negated BIC of ``estimator`` on ``X``.

    GridSearchCV keeps the candidate with the highest score, so the BIC
    returned by ``estimator.bic(X)`` has its sign flipped before being
    handed back.
    """
    bic_value = estimator.bic(X)
    return -bic_value


# Candidate models: 1 to 6 components crossed with every covariance type.
param_grid = {
    "n_components": range(1, 7),
    "covariance_type": ["spherical", "tied", "diag", "full"],
}
grid_search = GridSearchCV(
    GaussianMixture(), param_grid=param_grid, scoring=gmm_bic_score
)
# best_params_ only exists once the search has been run, so fit on the
# PCA features before reading it (the original skipped this step).
grid_search.fit(pca_features)

print("Millor model:", grid_search.best_params_)

### Aplicació del millor model

In [None]:
# GridSearchCV does not provide fit_predict(). Run the search if it has not
# been fitted yet, then label the samples with the refit best estimator.
if not hasattr(grid_search, "best_estimator_"):
    grid_search.fit(pca_features)
clusters = grid_search.predict(pca_features)

In [None]:
# Samples on the first two principal components, coloured by cluster label.
plt.scatter(*pca_features[:, :2].T, c=clusters, cmap="viridis")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

In [None]:
# Label every sample with the selected model and attach the label to a
# copy of the numerical columns (df_numerical itself is left untouched).
cluster_labels = grid_search.predict(pca_features)
df_clustered = df_numerical.assign(Cluster=cluster_labels)

In [None]:
# Split the rows by assigned cluster and average every variable per cluster.
cluster_grouped = df_clustered.groupby(by="Cluster")
cluster_properties = cluster_grouped.mean()

# Show the table of per-cluster means.
print(cluster_properties)

In [None]:
# Number of rows assigned to each cluster (rows originate from df_students,
# so these are students rather than patients).
cluster_grouped.size()

In [None]:
# Bar chart of the per-cluster means, one group of bars per cluster.
cluster_properties.plot.bar(figsize=(15, 10))
plt.show()

### Machine Learning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 20% of the samples for testing; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    pca_features,
    cluster_labels,
    test_size=0.2,
    random_state=42,
)

# Train a logistic-regression classifier on the cluster labels
# (fit returns the estimator itself, so it can be chained).
logreg = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out samples.
y_pred = logreg.predict(X_test)

# Share of held-out samples whose label was predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {}".format(accuracy))

### Avaluació del model

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Per-class breakdown of the classifier's results on the held-out set.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from scipy.spatial.distance import cdist

def elbow_report(X):
    """Plot the mean Mahalanobis distance to the nearest GMM component.

    For each candidate number of components (2 through 14) a
    ``GaussianMixture`` is fitted on ``X``. Every sample's Mahalanobis
    distance to each component mean is computed with that component's own
    precision matrix, the smallest distance per sample is kept, and the
    minima are averaged. The averages are plotted against the number of
    components.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data on which the mixtures are fitted and evaluated.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    X = np.asarray(X, dtype=float)
    meandist = []
    n_clusters = range(2, 15)
    for n_cluster in n_clusters:
        gmm = GaussianMixture(n_components=n_cluster)
        gmm.fit(X)
        # With the default covariance_type="full", precisions_ holds one
        # (n_features, n_features) matrix per component, whereas cdist's
        # 'mahalanobis' metric takes a single VI matrix. Compute the
        # distance component by component instead.
        per_component = []
        for mean, precision in zip(gmm.means_, gmm.precisions_):
            diff = X - mean
            squared = np.sum((diff @ precision) * diff, axis=1)
            # Clamp tiny negative round-off before the square root.
            per_component.append(np.sqrt(np.maximum(squared, 0.0)))
        nearest = np.min(per_component, axis=0)
        meandist.append(nearest.mean())
    plt.plot(n_clusters, meandist, 'bx-')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Mean Mahalanobis Distance')
    # Derive the title from the range actually evaluated (2 to 14).
    plt.title('GMM Clustering for n_cluster={} to {}'.format(
        n_clusters[0], n_clusters[-1]))
    plt.show()

In [None]:
# Draw the elbow curve for the training split.
elbow_report(X=X_train)

## Utilització stud_h com a numèric

In [None]:
import pandas as pd

# Reload the dataset and discard the identifier column.
df_students = pd.read_csv("datasets/Data Carrard.csv").drop(columns='id')

# In this section stud_h moves from the categorical to the numerical set.
df_categorical = df_students[[
    'sex', 'year', 'glang', 'part', 'job', 'health', 'psyt',
]]
df_numerical = df_students[[
    'age', 'stud_h', 'jspe', 'qcae_cog', 'qcae_aff', 'amsp', 'erec_mean',
    'cesd', 'stai_t', 'mbi_ex', 'mbi_cy', 'mbi_ea',
]]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise the numerical columns (now including stud_h) before PCA.
scaler = StandardScaler()
df_num_scaled = scaler.fit(df_numerical).transform(df_numerical)

# A fractional n_components asks PCA to keep as many components as are
# needed to explain that share (95%) of the variance.
pca = PCA(n_components=0.95)

# Fit on the scaled samples, then project those same samples.
pca_features = pca.fit(df_num_scaled).transform(df_num_scaled)

### Aplicació model

In [None]:
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Fit a Gaussian mixture with a fixed number of components and label
# every sample in the same call.
n_grups = 3
model_gm = GaussianMixture(n_components=n_grups)
clusters = model_gm.fit_predict(X=pca_features)

In [None]:
# Samples on the first two principal components, coloured by cluster label.
plt.scatter(*pca_features[:, :2].T, c=clusters, cmap="viridis")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

In [None]:
# Label every sample with the fitted mixture and attach the label to a
# copy of the numerical columns (df_numerical itself is left untouched).
cluster_labels = model_gm.predict(pca_features)
df_clustered = df_numerical.assign(Cluster=cluster_labels)

In [None]:
# Split the rows by assigned cluster and average every variable per cluster.
cluster_grouped = df_clustered.groupby(by="Cluster")
cluster_properties = cluster_grouped.mean()

# Show the table of per-cluster means.
print(cluster_properties)

In [None]:
# Number of rows assigned to each cluster (rows originate from df_students,
# so these are students rather than patients).
cluster_grouped.size()

In [None]:
# Bar chart of the per-cluster means, one group of bars per cluster.
cluster_properties.plot.bar(figsize=(15, 10))
plt.show()

### Machine Learning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 20% of the samples for testing; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    pca_features,
    cluster_labels,
    test_size=0.2,
    random_state=42,
)

# Train a logistic-regression classifier on the cluster labels
# (fit returns the estimator itself, so it can be chained).
logreg = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out samples.
y_pred = logreg.predict(X_test)

# Share of held-out samples whose label was predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {}".format(accuracy))

### Avaluació del model

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Per-class breakdown of the classifier's results on the held-out set.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from scipy.spatial.distance import cdist

def elbow_report(X):
    """Plot the mean Mahalanobis distance to the nearest GMM component.

    For each candidate number of components (2 through 14) a
    ``GaussianMixture`` is fitted on ``X``. Every sample's Mahalanobis
    distance to each component mean is computed with that component's own
    precision matrix, the smallest distance per sample is kept, and the
    minima are averaged. The averages are plotted against the number of
    components.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data on which the mixtures are fitted and evaluated.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    X = np.asarray(X, dtype=float)
    meandist = []
    n_clusters = range(2, 15)
    for n_cluster in n_clusters:
        gmm = GaussianMixture(n_components=n_cluster)
        gmm.fit(X)
        # With the default covariance_type="full", precisions_ holds one
        # (n_features, n_features) matrix per component, whereas cdist's
        # 'mahalanobis' metric takes a single VI matrix. Compute the
        # distance component by component instead.
        per_component = []
        for mean, precision in zip(gmm.means_, gmm.precisions_):
            diff = X - mean
            squared = np.sum((diff @ precision) * diff, axis=1)
            # Clamp tiny negative round-off before the square root.
            per_component.append(np.sqrt(np.maximum(squared, 0.0)))
        nearest = np.min(per_component, axis=0)
        meandist.append(nearest.mean())
    plt.plot(n_clusters, meandist, 'bx-')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Mean Mahalanobis Distance')
    # Derive the title from the range actually evaluated (2 to 14).
    plt.title('GMM Clustering for n_cluster={} to {}'.format(
        n_clusters[0], n_clusters[-1]))
    plt.show()

# Draw the elbow curve for the training split.
elbow_report(X=X_train)