In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [10]:
# Load the prepared dataset.
# keep_default_na=False stops pandas from converting strings such as "NA" to NaN;
# empty fields stay as empty strings.
# The file carries an 'Unnamed: 0' column (presumably an index written by an
# earlier export) that is absent from the column listing shown later in this
# notebook. Dropping it here makes that later state reproducible on a fresh
# Restart & Run All; errors='ignore' keeps the cell working if the column is
# not present in the file.
df = (
    pd.read_csv('prepared_to_analysis.csv', keep_default_na=False)
    .drop(columns='Unnamed: 0', errors='ignore')
)

In [11]:
# Spot-check one country's row after loading (presumably to see how its 'Code'
# field came through with default NA parsing disabled -- TODO confirm intent).
df.loc[df['country'].eq('Namibia')]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,Code,Three_Letter_Country_Code,Continent_Name,Continent_Code,pop_1998,pop_2018,Population_diff,gdp_1998,gdp_2018,GDP_diff,sub-region,Religion,latitude,longitude
109,Namibia,376,3,1,6.8,,NAM,Africa,AF,Small population,Small population,Population increase,Poor,Poor,Average growth,Sub-Saharan Africa,Christians,-22.95764,18.49041


# Scaling Data

In [12]:
# Standardizer used for the clustering inputs: centers each column on its mean
# and rescales it to unit variance (StandardScaler's defaults, spelled out).
scaler = StandardScaler(with_mean=True, with_std=True)

In [14]:
# The four alcohol-consumption columns are the only inputs to scaling.
consumption_columns = [
    'beer_servings',
    'spirit_servings',
    'wine_servings',
    'total_litres_of_pure_alcohol',
]
# Fit the scaler on those columns and keep the standardized matrix; the
# clustering and PCA cells below all work on this array.
scaled_values = scaler.fit_transform(df.loc[:, consumption_columns])

# Clustering

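I decided to use three different clustering algorithms. For KMeans and Spectral Clustering I check both the 5-cluster and the 7-cluster option; DBSCAN is not given a number of clusters and determines it on its own.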

## KMeans

In [17]:
# KMeans with 5 clusters on the standardized features.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit gives version-dependent labels (and a
# FutureWarning on the releases in between). 10 is the historical default.
kmeans5 = KMeans(n_clusters=5, n_init=10, random_state=0)
# fit_predict fits the model and returns the same labels that end up in labels_.
df['Kmeans_5'] = kmeans5.fit_predict(scaled_values)

In [18]:
# KMeans with 7 clusters on the standardized features.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit gives version-dependent labels (and a
# FutureWarning on the releases in between). 10 is the historical default.
kmeans7 = KMeans(n_clusters=7, n_init=10, random_state=0)
# fit_predict fits the model and returns the same labels that end up in labels_.
df['Kmeans_7'] = kmeans7.fit_predict(scaled_values)

## Spectral Clustering

In [20]:
# Spectral clustering into 5 groups; the fixed random_state keeps the labels
# repeatable between runs.
spectral_5 = SpectralClustering(random_state=0, n_clusters=5)
# Fit on the standardized features and store each row's cluster label.
df['Spectral_5'] = spectral_5.fit_predict(scaled_values)

In [21]:
# Spectral clustering into 7 groups; the fixed random_state keeps the labels
# repeatable between runs.
spectral_7 = SpectralClustering(random_state=0, n_clusters=7)
# Fit on the standardized features and store each row's cluster label.
df['Spectral_7'] = spectral_7.fit_predict(scaled_values)

## DBSCAN

In [23]:
# DBSCAN with its default neighbourhood settings written out so the values in
# use are visible; no cluster count is supplied to this algorithm.
dbscan = DBSCAN(eps=0.5, min_samples=5)
# Fit on the standardized features and store each row's label.
df['dbscan'] = dbscan.fit_predict(scaled_values)

# PCA

In [28]:
# Reduce the standardized features to two principal components.
pca = PCA(
    n_components=2,
    random_state=0,
)
# One row per input row, two columns (first and second component scores).
pca_comps = pca.fit_transform(X=scaled_values)


In [33]:
# Attach the two principal-component scores to df as columns PC1 and PC2.
# The component frame is built on df's own index, so rows line up by position
# regardless of what that index is. An index merge against a fresh 0..n-1
# frame is an inner join and silently drops rows whenever df's index is not
# the default one.
pca_frame = pd.DataFrame(pca_comps, columns=['PC1', 'PC2'], index=df.index)
# Dropping any existing PC1/PC2 first makes the cell safe to re-run; without it
# a second run would produce suffixed duplicate columns.
df = df.drop(columns=['PC1', 'PC2'], errors='ignore').join(pca_frame)

In [38]:
# Display the column labels of df at this point (after the cluster labels and
# PC1/PC2 have been added).
df.columns

Index(['country', 'beer_servings', 'spirit_servings', 'wine_servings',
       'total_litres_of_pure_alcohol', 'Code', 'Three_Letter_Country_Code',
       'Continent_Name', 'Continent_Code', 'pop_1998', 'pop_2018',
       'Population_diff', 'gdp_1998', 'gdp_2018', 'GDP_diff', 'sub-region',
       'Religion', 'latitude', 'longitude', 'Kmeans_5', 'Kmeans_7',
       'Spectral_5', 'Spectral_7', 'dbscan', 'PC1', 'PC2'],
      dtype='object')

In [42]:
# Replace the raw consumption columns with their standardized values. The
# column names are kept, and the scaled columns end up at the end of the frame.
consumption_columns = [
    'beer_servings',
    'spirit_servings',
    'wine_servings',
    'total_litres_of_pure_alcohol',
]
# Build the scaled frame on df's own index so the join aligns rows by position.
# An index merge against a fresh 0..n-1 frame is an inner join and silently
# drops rows whenever df's index is not the default one. A length mismatch
# between df and scaled_values raises here instead of passing unnoticed.
scaled_frame = pd.DataFrame(scaled_values, columns=consumption_columns, index=df.index)
df = df.drop(columns=consumption_columns).join(scaled_frame)

In [44]:
# Write the final table to disk (presumably consumed by a Shiny app, going by
# the file name). index=True is to_csv's default, made explicit: the row index
# is written as the first, unnamed column.
df.to_csv('to_shiny.csv', index=True)