# Clustering

## Data Prep

In [1]:
import pandas as pd

# Load Data/entertainment_clean.csv (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv('Data/entertainment_clean.csv')
df.head()

Unnamed: 0,name,books,tv_shows,video_games
0,Aaliyah,0.5,4.6,4.9
1,Abigail,0.0,4.5,4.8
2,Addison,0.5,4.5,5.0
3,Adeline,3.5,4.5,6.6
4,Alana,2.8,3.8,5.6


In [2]:
# Data prep checklist before clustering:
# 1. Rows are at the right granularity (one row per item to cluster)
# 2. Columns are non-null
# 3. Columns are numeric
# 4. Feature engineering
# 5. Feature selection
# 6. Feature scaling

In [3]:
# Check 1 (row granularity): (rows, columns) of the loaded frame; the row count
# is compared against the number of unique names in the next cell.
df.shape

(150, 4)

In [4]:
# Check 1 (row granularity): count distinct names; if this equals the row
# count, each name appears on exactly one row.
df['name'].nunique()

150

In [5]:
# Check 2 (non-null): show every row that has a missing value in at least one
# column; an empty result means there is nothing to impute or drop.
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,name,books,tv_shows,video_games


In [6]:
# Check 3 (numeric): inspect each column's dtype to see which are numeric.
df.dtypes

name            object
books          float64
tv_shows       float64
video_games    float64
dtype: object

In [7]:
# Build the modelling frame by removing the non-numeric `name` column
# (its dtype is `object`); `df` itself is left untouched.
data = df.drop('name', axis='columns')
data.head()

Unnamed: 0,books,tv_shows,video_games
0,0.5,4.6,4.9
1,0.0,4.5,4.8
2,0.5,4.5,5.0
3,3.5,4.5,6.6
4,2.8,3.8,5.6


In [8]:
# Summary statistics per feature; compare the spread (std, min/max) across
# columns when deciding whether feature scaling (check 6) is needed.
data.describe()

Unnamed: 0,books,tv_shows,video_games
count,150.0,150.0,150.0
mean,2.993333,4.586,5.843333
std,1.917761,0.645587,0.828066
min,0.0,3.0,4.3
25%,0.8,4.2,5.1
50%,3.2,4.5,5.8
75%,4.5,4.9,6.4
max,6.2,6.6,7.9


## 1. K-Means Clustering

In [9]:
import os
# Limit OpenMP to a single thread for this process. This avoids the following
# scikit-learn warning (raised from sklearn/cluster/_kmeans.py on Windows):
#
#   UserWarning: KMeans is known to have a memory leak on Windows with MKL,
#   when there are less chunks than available threads. You can avoid it by
#   setting the environment variable OMP_NUM_THREADS=1.
#
# NOTE(review): presumably this must run before scikit-learn is first imported
# in the kernel for the setting to take effect — confirm.
os.environ["OMP_NUM_THREADS"] = "1"

In [10]:
from sklearn.cluster import KMeans

In [11]:
# Configure (but do not yet fit) a two-cluster k-means model.
kmeans2 = KMeans(
    n_clusters=2,      # number of clusters to find
    n_init='auto',     # let scikit-learn choose the number of initialisations
    random_state=42,   # fixed seed so reruns give the same clustering
)

In [12]:
# Fit the model: runs k-means on `data` and stores the results on the
# estimator (e.g. the per-row cluster assignments read from `labels_` below).
kmeans2.fit(data)

In [13]:
# Cluster index (0 or 1, since n_clusters=2) assigned to each row of `data`.
kmeans2.labels_

array([0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0], dtype=int32)