## Iris dataset clustering 실습 ##

In [None]:
# Minimal k-means example with scikit-learn.
# Load the iris dataset:
#   - 4 features: sepal length, sepal width, petal length, petal width
#   - classes: setosa, versicolor, virginica
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster,datasets
iris = datasets.load_iris()
# x: feature matrix, y: ground-truth class labels.
x, y = iris.data, iris.target
# Print the dataset's bundled description to get a first overview.
print(iris.DESCR)

In [None]:
# Look at the raw data directly (x is the iris feature matrix, 4 features).
# Columns: sepal length, sepal width, petal length, petal width.
# Summary statistics for these features are in the description printed above.
print(x)

In [None]:
# How many rows and columns does the data have? (expected: 150 rows, 4 columns)
print(iris.data.shape)

In [None]:
# Names of the 4 features (sepal length, sepal width, petal length, petal width).
print(list(iris.feature_names))
# Names of the three classes.
# Each entry is converted to a built-in str first: when target_names holds
# NumPy string scalars, NumPy >= 2 would otherwise print np.str_('...') for
# every element instead of the plain name.
print([str(name) for name in iris.target_names])

In [None]:
# Run scikit-learn's k-means clustering with K=3.
# n_init is given explicitly because its default differs between scikit-learn
# releases; random_state is fixed so every run produces the same clustering.
k_means = cluster.KMeans(n_clusters=3, n_init=10, random_state=0)
k_means.fit(x)
# Clustering result: every sample is assigned to one of the clusters 0, 1 or 2.
# NOTE: these ids are just cluster numbers chosen by the algorithm; they need
# not line up with the class ids stored in y.
print(k_means.labels_)

In [None]:
# Actual (ground-truth) labels.
# NOTE(review): `y` is rebound by the make_blobs cell further down, so this
# shows the iris labels only if it runs before that cell.
print(y)

In [None]:
# Cluster centers.
# The data has 4 features and K=3, so there are 3 centers, each with one
# coordinate per feature.
k_means.cluster_centers_

In [None]:
# Inertia (SSE) is one measure of how well the clustering worked:
# the sum of squared distances of samples to their closest cluster center.
# If the samples sit close to their cluster centers the inertia is small,
# which indicates a tighter clustering.
k_means.inertia_

In [None]:
# Finding the best number of clusters K for a given dataset is hard.
# One heuristic for it is the elbow method: fit k-means for increasing K and
# watch how the SSE (inertia) changes.
# K runs from 1 to 10 here (range(1, 11) stops before 11).
# The SSE drops sharply at first and then levels off as K grows.
# K=3 or K=4 looks like a reasonable choice for this data.
# The SSE keeps falling as K grows, but too many clusters only over-fits this
# particular dataset.

# Candidate cluster counts, shared by the fitting loop and the plot so the two
# cannot drift apart.
k_values = range(1, 11)

# sse collects the inertia (summed squared sample-to-center distance) of the
# model fitted for each K, in the same order as k_values.
sse = []
for k in k_values:
    # Explicit n_init and a fixed random_state keep the curve reproducible.
    km = cluster.KMeans(n_clusters=k, n_init=10, random_state=0)
    km.fit(iris.data)
    sse.append(km.inertia_)

# Plot the SSE against the number of clusters.
plt.plot(k_values, sse, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('sse')
plt.tight_layout()
plt.show()

## 연습 ##

In [None]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Generate sample data:
# 1000 samples drawn around 6 centers, with 2 features (dimensions);
# return_centers=True additionally returns the center locations.
# NOTE: this rebinds `y`, which held the iris labels in the cells above.
X, y, centers = make_blobs(n_samples=1000, centers=6, n_features=2, return_centers=True)
# Locations of the generated cluster centers.
print(centers)
# Scatter plot of the two features (no per-cluster coloring yet).
plt.scatter(X[:, 0], X[:, 1])
# Render explicitly, as the other plotting cells do, instead of relying on the
# notebook echoing the object returned by scatter().
plt.show()

In [None]:
# Now apply k-means clustering to the generated data above.
# Step 1: use the elbow method to estimate a suitable K.
from sklearn import cluster,datasets

# Candidate cluster counts: K = 1..10 (range(1, 11) stops before 11).
# Shared by the fitting loop and the plot so the two cannot drift apart.
k_values = range(1, 11)

# sse collects the inertia of the model fitted for each K, in k_values order.
sse = []
for k in k_values:
    # Explicit n_init and a fixed random_state keep the curve reproducible.
    km = cluster.KMeans(n_clusters=k, n_init=10, random_state=0)
    km.fit(X)
    sse.append(km.inertia_)

# Plot the SSE against the number of clusters.
plt.plot(k_values, sse, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('sse')
plt.tight_layout()
plt.show()

In [None]:
# Run scikit-learn's k-means with the K estimated from the elbow plot
# (6 here, which is also the number of centers the blobs were generated from).
# n_init is given explicitly because its default differs between scikit-learn
# releases; random_state is fixed so every run produces the same clustering.
k_means = cluster.KMeans(n_clusters=6, n_init=10, random_state=0)
k_means.fit(X)
# Clustering result: every sample is assigned to one of the clusters 0-5.
print(k_means.labels_)

In [None]:
# Check: are these estimated centers close to the true centers printed earlier?
# NOTE(review): the rows here are presumably not in the same order as in
# `centers`, so compare them as sets of points rather than row by row.
k_means.cluster_centers_

In [None]:
# Show the clusters estimated by k-means.
# Points in the same cluster get the same color: c= takes one value per
# point, here the cluster id assigned to that point.
plt.scatter(X[:, 0], X[:, 1], c=k_means.labels_)
# Render explicitly, as the other plotting cells do, instead of relying on the
# notebook echoing the object returned by scatter().
plt.show()

# 문제로 연습하기

In [None]:
# 데이터
# 남성/여성의 키와 몸무게 데이터를 일부만 가져온 데이터

from sklearn import cluster,datasets
import numpy as np

sex = ['female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','ma
le','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','female','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','male','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','male','female','female','female','female','female','female','female','female','female','female','female','male','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','male','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female','female']
height = [54.26313333,54.61685783,54.87372753,55.14855736,55.33649241,55.6518916,55.66820212,55.73973682,55.85121382,55.97919788,56.06663635,56.07869973,56.09824578,56.10536959,56.1089021,56.15945802,56.16729919,56.44568503,56.53416581,56.54797498,56.54884308,56.63041198,56.67814049,56.73718347,56.74174112,56.75760363,56.76445645,56.78543437,56.78938641,56.81031728,56.82223984,56.85608213,56.88597105,56.94941514,56.97513323,56.97527896,56.99445626,57.02885744,57.07265625,57.10386947,57.11253942,57.13730096,57.14819808,57.15803352,57.16584006,57.20266004,57.20794645,57.2330564,57.25811736,57.27014705,57.31302352,57.31390274,57.35309276,57.37575853,57.39774037,57.44256697,57.44520357,57.4722524,57.48139209,57.54026863,57.55275371,57.55350521,57.55871033,57.59802972,57.64752149,57.65488503,57.67518109,57.69949207,57.71425655,57.74019191,57.76480789,57.78323162,57.78681941,57.79038921,57.795534,57.79853105,57.80076874,57.80732285,57.83091908,57.83726895,57.84910509,57.85111519,57.86970441,57.88297788,57.92315936,57.93667432,57.9470911,57.96193618,57.96849839,57.98151241,57.99557143,58.03217317,58.03741798,58.04838794,58.06155297,58.08709515,58.10966317,58.11787193,58.118699,78.99874235,78.62137397,78.52821043,78.46205292,78.09586747,77.54718634,77.4655691,77.44661995,77.16080089,77.1008721,77.06735502,77.00833604,76.86685496,76.84876127,76.84267883,76.80634364,76.73244646,76.71998493,76.70983486,76.69096273,76.61754594,76.60018295,76.56927352,76.53716132,76.50188334,76.49293396,76.48231871,76.47287987,76.45657808,76.45371965,76.43455871,76.29418413,76.26866786,76.23936667,76.11667468,76.02708186,76.02134748,76.01230001,75.99995705,75.99336033,75.98117435,75.94686644,75.94446038,75.88543313,75.85011222,75.82446334,75.81904028,75.75344577,75.69861825,75.694314,75.69038392,75.68280791,75.62403135,75.62019316,75.57944434,75.57243874,75.55355223,75.53722116,75.53321134,75.46499263,75.45632836,75.44105793,75.42716052,75.42368656,75.39129223,75.34158414,75.33084652,75.28284701
,75.26320153,75.25945676,75.20902198,75.20597361,75.18479483,75.18126086,75.15687858,75.15541009,75.14690789,75.14082143,75.07607647,75.05966865,75.04448619,75.04399507,75.03841151,75.03388944,75.00994433,74.9917528,74.97695265,74.97523083,74.97193581,74.95506499,74.88521873,74.88030802,74.8609076,74.85316362,74.84287462,74.84149972,74.82494531,74.81405247,74.79537502,74.79328824,74.78571432,74.76744684,74.75875244,74.72413903,74.70961248,74.7069641,74.6976372,74.69552272,74.6870926,74.68636681,74.6774709,74.67043476,74.66919682,74.65054027,74.64950788,74.63382727,74.62909812,74.60466805,74.60459137,74.59473633,74.58346132,74.56873173,74.56460108,74.54513694,74.52945456,74.52097209,74.51787603,74.46498854,74.46298656,74.46069449,74.45849777,74.44660762,74.43050131,74.40818786,74.40648137,74.40319926,74.40167189,74.40120951,74.40065811,74.39745903,74.3892257,74.38801243,74.38371528,74.35044373,74.34901306,74.34848515,74.34086087,74.34020588,74.33873882,72.76190639,72.76171062,72.76032466,72.75805404,72.75521439,72.74952611,72.74691678,72.74454456,72.74220916,72.73897,72.73859251,72.73629299,72.73178767,72.7266202,72.72634178,72.72580656,72.72060324,72.71761271,72.71616603,72.71522454,72.71349076,72.70722408,72.70460442,72.69562419,72.69525855,72.69337874,72.69317546,72.68851411,72.67888504,72.67696109,72.67399365,72.66524188,72.6645041,72.6644055,72.6596287,72.65899957,72.6580716,72.65740446,72.65314462,72.63310674,72.630594,72.6291227,72.61840852,72.61703752,72.61681633,72.61446184,72.61132729,72.61046888,72.60769741,72.59766295,72.58802109,72.58737482,72.57112137,72.56422057,72.56081617,72.55969529,72.55837247,72.55682493,72.55374162,72.55018732,72.54885198,72.54599254,72.54330705,72.5418967,72.54132659,72.53935407,72.53557871,72.52166703,72.50942559,72.50812038,72.50441019,72.50075663,72.49958356,72.4995798,72.49739057,72.49517682,72.4896554,72.47299419,72.47061849,72.46702493,72.46475991,72.46152792,72.46060879,72.45463342,72.45072137,72.44884667,72.44798372,72.4
4634816,72.4456338,72.44521272,72.44289921,72.43950075,72.4371504,72.43632383,72.43600976,72.43516123,72.43374602,72.43261756,72.43208104,72.43046882,72.43045508,72.42977095,72.42580597,72.42276135,72.41831663,72.41697292,72.41072676,72.41058576,72.39448041,72.39432346,72.39351243,72.39194804,72.39194027,72.39065722,72.38756904,72.38609816,72.38530097,72.38449341,72.37867503,72.37219364,72.37206573,72.36993359,72.36749214,72.36522872,72.36335432,72.36211202,72.36198348,72.3595,72.35824904,60.12416782,60.12416703,60.12149408,60.12116748,60.11391973,60.1085895,60.10786148,60.10133519,60.09859133,60.09644199,60.09128734,60.0901747,60.08100686,60.07624076,60.07294329,60.06314411,60.05671104,60.05135799,60.05084636,60.04714722,60.04241965,60.04233796,60.04077935,60.03787584,60.03728766,60.03556122,60.03276917,60.03061012,60.03043377,60.02674978,60.02595007,60.0259173,60.01556232,60.01386373,60.01307771,60.01178443,60.01164973,60.0052569,60.00246348,60.00065715,59.99852911,59.99806593,59.99780556,59.9976946,59.9864955,59.98186507,59.97896224,59.97733981,59.97372202,59.96841857,59.9682045,59.95689543,59.95595817,59.94991374,59.94211972,59.93998171,59.93940471,59.93864965,59.9268571,59.922162,59.91763069,59.91307099,59.90884476,59.90854467,59.90652786,59.90353879,59.90155376,59.90027453,59.89492405,59.89389382,59.8917058,59.89139662,59.8796705,59.87867161,59.8745007,59.8735563,59.86807793,59.8648914,59.86012336,59.85024472,59.84744636,59.83992379,59.83856884,59.83771201,59.8360407,59.83026358,59.82273531,59.82171103,59.81653598,59.81591064,59.81504852,59.80953425,59.80509278,59.79858727,59.79586097,59.79002199,59.78850126,59.78676291,59.78528868,59.77645519,59.77390642,59.77315908,59.77273146,59.76682038,59.76540236,59.76500951,59.76340044,59.76289819,59.76215649,59.76169612,59.76136964,59.76093236,59.75994823,59.75844451,59.74709643,59.74227958,59.73568526,59.73527867,59.71810115,59.70873056]
weight = [64.70012671,71.39374874,78.60667031,88.81241211,88.36658258,85.62177644,68.98253009,108.1219685,103.7671373,85.41753362,89.57120474,94.48837405,104.9541004,87.29886913,80.53125938,90.81525566,77.89855927,96.64024466,97.74389648,84.87212365,90.84758938,89.48048027,97.26996668,91.60543723,103.5404881,88.88485318,79.17437583,83.99307747,95.32808768,84.17069477,101.9799235,97.36497833,99.87359265,107.1718559,89.16984997,90.34178426,84.41424571,101.2025509,93.74614216,93.5063159,98.7940381,99.10849926,91.64547294,94.26320265,106.0050311,103.9627051,99.49482376,99.37128426,101.7141821,94.49963415,93.8764374,95.1390468,72.75014469,114.1922086,106.5875627,104.4866362,89.42089489,96.31355653,87.49657111,96.19051124,98.64396312,108.1516882,105.1739234,98.18279291,99.39603077,98.87672255,125.9418654,92.22305324,94.38973264,93.65295688,89.78999844,126.1435959,106.8541166,103.3120562,112.4581679,112.3455782,86.19096018,107.7050259,83.02680254,105.7392172,96.28009269,97.04028987,93.53375287,110.7259721,100.5400428,97.70828469,91.46899763,112.2269836,118.7205901,92.11842639,95.91004289,92.79145025,100.4968306,92.32582991,113.0607203,94.94483434,96.26625015,102.0889662,116.39678,269.9896985,245.7337827,253.8890038,227.3425649,255.6908348,242.0411726,252.5566894,232.6510789,228.7073009,240.4553518,249.1102416,251.4250505,240.5367967,254.2090732,211.7241665,227.2203428,241.6866013,236.1467302,235.0354188,233.1586924,255.8633265,239.697207,229.8580951,243.2674974,217.8823464,227.139296,230.7941955,246.2323213,239.5813894,213.7323482,228.8729906,233.5038113,247.386741,229.2947458,240.6381035,232.313471,235.9260606,235.4379661,224.4404592,231.3474009,226.4740345,237.5676083,231.9247485,224.133192,227.2779654,230.4382367,230.9394542,226.5033443,249.565628,235.1366742,223.5875481,232.104476,241.2023301,226.2077799,239.3338403,217.1180153,218.8872833,212.0087426,227.9452477,217.9710527,206.5035575,230.9870216,222.1839669,220.7571144,218.6046973,218.5330155,240.4408159,219.1132095
,223.4499947,227.4371382,227.5814318,228.7617806,222.475211,218.5132769,250.3171507,214.3705805,221.3292277,224.1242713,229.2336713,218.583688,222.0345647,208.526427,224.3035075,232.8861171,221.4540891,227.0100841,248.8488727,214.4954893,237.5753136,221.1594661,211.1662677,236.360678,225.2808194,229.402042,213.3348909,223.9456703,220.3363669,232.2093773,232.6354028,215.2354402,239.4643429,210.3699087,209.7235497,207.2983314,207.243854,215.2255088,209.3731259,225.5123865,230.1369233,224.630424,218.8450257,207.9080616,227.4603038,213.3205204,227.5006451,213.0690422,226.8921536,249.9462832,227.6989828,227.116582,213.2026057,234.8091796,210.6878113,218.4375679,225.5400789,216.5331913,201.9579717,213.4821026,219.8129065,217.6318664,209.9487684,213.701377,228.3751789,226.7010229,220.720177,227.8249156,228.8383958,223.8255072,240.9422666,212.6361678,210.1443121,217.1437905,213.653765,204.4129596,206.0172853,210.3690306,210.1115361,209.2375131,220.3834481,226.7361551,200.5073562,219.5212538,211.7496641,206.7109165,212.1807188,204.6143177,208.3815392,202.8736828,187.2229089,190.2827586,211.1702891,213.6839434,219.133231,218.7662937,225.9846841,206.396597,210.3653615,195.3564313,207.2465812,207.4596817,204.3651038,203.1921821,220.8663652,181.3253291,203.4535958,224.8078318,194.2417376,216.6863153,217.5614154,206.9802182,219.0951656,188.3951823,219.8772978,186.1899994,207.5063787,221.8859416,191.9288235,223.9354499,208.0662569,204.7619655,192.3289666,207.1612333,199.056247,224.8579017,205.3742526,200.6336879,216.590561,221.1755961,205.0955402,204.3431869,189.1040457,230.560535,211.7664268,208.6579459,210.8636397,198.8091869,215.2820067,222.361125,224.7474845,203.6275837,199.1688303,210.3364247,198.7339344,213.7169568,222.6671779,205.5617893,221.8626503,206.7491709,216.2182301,204.6271564,201.6550559,212.9703033,198.9399971,175.1721011,225.3785903,198.5816982,228.7496281,184.4463253,208.2945887,225.6337034,223.3155102,203.4216758,200.2900045,201.9195631,213.3595304,213.2391086,
196.6322394,202.993331,209.3234214,200.8576584,230.2341099,210.7056018,211.9427178,207.6378327,201.6148974,200.0852325,204.564479,210.7229474,189.1790112,208.1415371,177.9695321,228.9746467,204.7061013,196.0285063,217.7226101,210.3640911,189.3844383,208.436509,201.41093,209.4209477,206.6727082,215.6752656,199.5740493,199.4468992,212.2125188,189.1655919,228.2098866,216.9383745,210.187294,199.3762346,224.6038107,218.3945547,195.0138368,216.0565032,207.3948305,198.2629703,213.2978873,175.1655077,109.3889284,119.0574255,110.2773991,113.4069667,127.7124528,98.08851735,122.3637894,106.8698908,122.7448855,104.4138173,106.0122164,117.8952374,123.7760181,139.0846037,126.3896126,102.9000465,112.3276295,124.6791499,115.656208,114.6035259,97.03465533,129.1415951,125.681644,115.522648,117.6058738,112.9526252,93.20410252,104.4973318,97.68743226,103.3869461,87.035416,126.4608298,117.6669536,133.3275301,113.8783529,109.0496295,113.0564267,112.6643834,114.4457208,95.62563818,96.8564881,113.9449883,113.6511113,115.8128156,129.4177641,112.9029394,114.2993711,110.8001983,111.2242901,115.4768128,103.6168184,101.5037529,119.8804371,111.2322809,123.4984918,102.4669493,110.614615,141.4595789,98.42910943,125.5471872,118.4634141,113.7224311,94.56369747,115.5259156,105.6497737,119.5808538,116.503429,118.4689471,129.1738388,116.7595601,95.63314901,108.625579,106.78231,99.87728361,131.1881479,107.6376075,117.8038417,117.7294421,121.8629072,101.7161626,112.1445442,108.1015416,106.358333,127.9477482,103.265961,99.72474197,122.1523826,139.5281168,107.5799709,130.9570276,112.0974171,109.6574367,102.1472298,102.7878264,118.7540406,128.2539231,109.8279411,116.9048715,118.3536351,120.7436378,124.4180192,110.8388613,127.0655463,111.2600362,133.4794844,106.2188465,112.741362,105.1525171,95.50033085,106.181546,103.9440187,100.362087,119.7175073,119.1647427,102.3548577,95.93328554,129.0044176,124.8426596,109.8166548,114.9366291]
# Put height and weight side by side as the two columns of one 2-D array
# (assumes both lists have the same length - TODO confirm).
# NOTE(review): the values look like inches / pounds; units are not stated here.
hw = np.column_stack((height,weight))

In [None]:
# Exercise: run the elbow method to find a suitable K.

sse = []
# Fit k-means in a loop over a range of cluster counts and append each
# model's inertia to sse.
# NOTE(review): the original note says "1 to 11"; the worked examples earlier
# in this notebook use range(1, 11), i.e. K = 1..10.

#-------------------------------#

# Plot the sse values.

#-------------------------------#

In [None]:
# K-means를 수행하여, 군집 중심 찾기

#-------------------------------#

In [None]:
# k_means로 추정한 label을 데이터와 함께 시각화 해서 보여주기

#-------------------------------#