# Unsupervised learning HANDS ON!

Here an example on how to generate data in classes, and work on the data directly via their classes. 

#### Let's get back the class from the previous example

In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.colors as colors
import textwrap
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import silhouette_score, contingency_matrix
    
   
class Data_generator():
    def __init__(self, description='Data generator'):
        """Initializes the class creating the required attributes."""
        self.description = description    
        self.df = None 

    def load_well(self, data_file="xeek_train_subset_mini.csv", well_name="16/10-1"):
        """Read a dataset.""" 
        self.df = pd.read_csv(data_file)
        self.df = self.df[self.df["WELL"] == well_name] # select data only for the specific well

    def remove_nan(self):
        self.df.dropna(inplace=True)
    
    def add_noise(self, properties=["RHOB", "GR"], level=0.1):
        """Adds noise to the data."""
        # Noise for GR is different than that for RHOB due to the usual values of those two measurements. RHOB usually varies between 1.9-2.9 g/cm3 wheres GR between 5-300.
        for property in properties:
            if property == "RHOB":
                range = [-level, level]
            if property == "GR":
                range = [-level*50, level*50]

            noise = np.random.uniform(low=range[0], high=range[1], size=len(self.df[property]))
            self.df[property] += noise
    
    def plot_me(self, x_lab='RHOB', y_lab='GR', legend=''):
        """Plots datasets."""
        plt.scatter(self.df[x_lab], self.df[y_lab], label=legend)
        plt.xlabel(x_lab)
        plt.ylabel(y_lab)
        plt.legend()

    def get_data(self):
        return self.df
    
   
class Model_generator():
    def __init__(self, description='Model generator', data=None):
        """Initializes the class creating the required attributes."""
        self.description = description    
        self.df = data
        
    def optimise_k_means(self, max_k):
        """Plots number of clusters vs the inertia which helps to determine the optimal number of clusters to be used"""
        #The user can choose to run the original data with no noise or the one with added noise
        means = []
        inertias = []
        
        # Applies kmeans over a range of different cluster numbers limited by max_k
        for k in range(1, max_k):
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(self.df[["GR", "RHOB"]].dropna(inplace=True))
            means.append(k)
            inertias.append(kmeans.inertia_)

        # Plotting parameters   
        plt.subplots(figsize=(10, 5))
        plt.plot(means, inertias, 'o-')
        plt.xlabel("Number of Clusters")
        plt.ylabel("Inertia")
        plt.grid(True)
        plt.show()

    
    def k_means_gmm(self, n_clusters):
        """Applies kmeans and gmm models and adds the created labels to a df with other curves""" 

        # Apply K-means clustering
        kmeans = KMeans(n_clusters) 
        kmeans.fit(self.df[['GR', 'RHOB', 'NPHI', 'DTC']].dropna(inplace=True))
        labels_kmeans = kmeans.labels_
        df_w_labels['KMEANS'] = labels_kmeans #add kmeans labels to the df

        #A pply Gaussian Mixture Model
        gmm = GaussianMixture(n_clusters)
        gmm.fit(self.df[['GR', 'RHOB', 'NPHI', 'DTC']].dropna(inplace=True))
        labels_gmm = gmm.predict(self.df[['GR', 'RHOB', 'NPHI', 'DTC']].dropna(inplace=True))
        df_w_labels['GMM'] = labels_gmm #add GMM labels to the df

        return df_w_labels #returns df with the added kmeans and gmm labels
    

    
class Plotter_generator():
    
    def __init__(self, description='Plot generator', data=None):
        """Initializes the class creating the required attributes."""
        self.description = description    
        self.df = data
        
        
    def crossplots(self, dataset, hue='KMEANS'):
        """Creates seaborn pairplot coloured by hue labels"""
        sns.pairplot(dataset, vars=['GR', 'RHOB','NPHI', 'DTC'], 
                     hue=hue, palette='Dark2', 
                     diag_kind='kde', 
                     plot_kws = {'s': 15, 'marker':'o', 'alpha':1})




        

In [9]:
#Create the class, add noise and plot data with and without noise for visualization
well_data = Data_generator()
well_data.load_well() 
well_data.plot_me(legend='Original')

well_data.add_noise(level=0.3)
well_data.plot_me(legend='With noise')

FileNotFoundError: [Errno 2] No such file or directory: 'xeek_train_subset_mini.csv'

In [7]:
#Find out what is the optimal number of clusters
well_model = Model_generator(data=well_data.get_data())
well_model.optimise_k_means(max_k=16)

TypeError: 'NoneType' object is not subscriptable

In [None]:
df_w_labels = well_data.k_means_gmm(n_clusters=5, dataset="with_noise") #apply kmeans and gmm and get the df with the labels
#Vertical depth plot
well_data.create_depth_plot(df_w_labels)

In [None]:
#Assess the effect of noise on the clustering accuracy
well_data.plot_silhouette_score(5, max_noise=10, noise_step=0.5)

In [3]:
#Plot pairplots with kmeans labels
well_data.crossplots_kmm(df_w_labels)

AttributeError: 'Data_generator' object has no attribute 'crossplots_kmm'

In [4]:
#Plot pairplots with gmm labels
well_data.crossplots_gmm(df_w_labels)

AttributeError: 'Data_generator' object has no attribute 'crossplots_gmm'