# Clustering

In [18]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model as sk_lm
from sklearn import model_selection as sk_msel
from sklearn import metrics as sk_metrics
from sklearn import preprocessing as sk_pre
from sklearn import cluster as sk_cluster
import statsmodels.api as sm

In [38]:
# Load the College data set; the college names arrive in an unnamed first column.
df_c = pd.read_csv('data/College.csv')

In [39]:
# Give the unnamed first CSV column a meaningful name, then preview the frame.
df_c = df_c.rename(columns={'Unnamed: 0': 'University'})
df_c.head()

Unnamed: 0,University,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [40]:
# List the distinct raw values of the Private flag before encoding it.
df_c['Private'].unique()

array(['Yes', 'No'], dtype=object)

In [41]:
# Numeric encoding of the Private variable: 'Yes' -> 1, 'No' -> 0.
# The identity entries (1 -> 1, 0 -> 0) keep this cell idempotent: with only the
# 'Yes'/'No' keys, running the cell a second time would map the already-encoded
# values to NaN and silently wipe out the column.
df_c['Private'] = df_c['Private'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})
df_c['Private'].unique()

array([1, 0])

In [42]:
# Index the rows by college name so the name acts as a row label, not a feature column.
df_c = df_c.set_index('University')
df_c.head()

Unnamed: 0_level_0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
University,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Abilene Christian University,1,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,1,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,1,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,1,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [43]:
# Standardize every feature to zero mean and unit variance so that columns measured
# on large scales do not dominate the distances K-Means works with.
# 'Cluster' is dropped if present: a later cell adds it to df_c, and re-running this
# cell would otherwise feed the previous cluster labels back in as a feature.
features = df_c.drop(columns='Cluster', errors='ignore')
scaler = sk_pre.StandardScaler()
df_c_s = pd.DataFrame(
    scaler.fit_transform(features),
    index=features.index,
    columns=features.columns,
)
df_c_s.head()

Unnamed: 0_level_0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
University,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Abilene Christian University,0.612553,-0.346882,-0.321205,-0.063509,-0.258583,-0.191827,-0.168116,-0.209207,-0.746356,-0.964905,-0.602312,1.270045,-0.163028,-0.115729,1.013776,-0.867574,-0.50191,-0.318252
Adelphi University,0.612553,-0.210884,-0.038703,-0.288584,-0.655656,-1.353911,-0.209788,0.244307,0.457496,1.909208,1.21588,0.235515,-2.675646,-3.378176,-0.477704,-0.544572,0.16611,-0.551262
Adrian College,0.612553,-0.406866,-0.376318,-0.478121,-0.315307,-0.292878,-0.549565,-0.49709,0.201305,-0.554317,-0.905344,-0.259582,-1.204845,-0.931341,-0.300749,0.585935,-0.17729,-0.667767
Agnes Scott College,0.612553,-0.668261,-0.681682,-0.692427,1.840231,1.677612,-0.658079,-0.520752,0.626633,0.996791,-0.602312,-0.688173,1.185206,1.175657,-1.615274,1.151188,1.792851,-0.376504
Alaska Pacific University,0.612553,-0.726176,-0.764555,-0.780735,-0.655656,-0.596031,-0.711924,0.009005,-0.716508,-0.216723,1.518912,0.235515,0.204672,-0.523535,-0.553542,-1.675079,0.241803,-2.939613


In [44]:
# Dimensions of the data as (rows, columns).
df_c.shape

(777, 18)

In [53]:
# Apply K-Means clustering with 2 clusters on the standardized features.
# random_state pins the centroid initialisation so the assignment (and which group
# receives label 0 vs. 1) is reproducible across runs; n_init is set explicitly
# because scikit-learn's default changed from 10 to 'auto' in version 1.4.
kmeans_c = sk_cluster.KMeans(n_clusters=2, n_init=10, random_state=0).fit(df_c_s)
# Attach the labels to the unscaled frame so clusters can be described in original units.
df_c['Cluster'] = kmeans_c.labels_
df_c.head()

Unnamed: 0_level_0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate,Cluster
University,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Abilene Christian University,1,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60,1
Adelphi University,1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56,1
Adrian College,1,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54,1
Agnes Scott College,1,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59,1
Alaska Pacific University,1,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15,1


In [54]:
# Split the colleges by K-Means label:
# df_cluster1 holds the rows labelled 0, df_cluster2 the rows labelled 1.
df_cluster1 = df_c.loc[df_c['Cluster'] == 0]
df_cluster2 = df_c.loc[df_c['Cluster'] == 1]

In [55]:
# Size of the label-0 subset as (rows, columns).
df_cluster1.shape

(213, 19)

In [56]:
# Size of the label-1 subset as (rows, columns).
df_cluster2.shape

(564, 19)

In [58]:
# Per-column summary statistics for the label-0 subset, in original (unscaled) units.
df_cluster1.describe()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate,Cluster
count,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0
mean,0.061033,6072.361502,4162.399061,1735.755869,22.746479,52.981221,8982.699531,2098.826291,6873.553991,3757.56338,558.164319,1707.58216,76.42723,82.323944,17.361033,14.178404,7567.239437,55.28169,0.0
std,0.239955,5440.781453,3524.144788,1261.605194,15.654933,19.548431,6469.732656,2370.29107,2403.379738,923.680268,136.677165,666.518234,12.906434,12.895562,3.813473,7.683976,2849.079095,14.591788,0.0
min,0.0,233.0,233.0,153.0,1.0,12.0,658.0,33.0,2340.0,1780.0,96.0,400.0,25.0,25.0,6.7,0.0,3605.0,10.0,0.0
25%,0.0,2409.0,1870.0,819.0,12.0,37.0,3876.0,740.0,5130.0,3110.0,500.0,1200.0,71.0,76.0,15.2,9.0,5716.0,46.0,0.0
50%,0.0,4681.0,3126.0,1472.0,19.0,52.0,7484.0,1429.0,6597.0,3706.0,556.0,1660.0,78.0,86.0,17.4,13.0,6729.0,54.0,0.0
75%,0.0,8065.0,5553.0,2400.0,27.0,65.0,12911.0,2619.0,7844.0,4351.0,618.0,2070.0,86.0,92.0,19.4,18.0,8612.0,65.0,0.0
max,1.0,48094.0,26330.0,6392.0,95.0,100.0,31643.0,21836.0,18420.0,7425.0,1125.0,4288.0,103.0,100.0,39.8,48.0,17007.0,100.0,0.0


In [59]:
# Per-column summary statistics for the label-1 subset, in original (unscaled) units.
df_cluster2.describe()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate,Cluster
count,564.0,564.0,564.0,564.0,564.0,564.0,564.0,564.0,564.0,564.0,564.0,564.0,564.0,564.0,564.0,564.0,564.0,564.0,564.0
mean,0.978723,1841.950355,1209.255319,419.012411,29.375887,56.859929,1704.810284,385.66844,11787.824468,4584.108156,546.06383,1202.06383,71.237589,78.712766,12.854255,25.978723,10450.586879,69.308511,1.0
std,0.144433,2140.98093,1099.700559,338.420794,18.015501,19.813963,1455.994131,522.61482,3674.330077,1071.864804,174.638188,628.013457,17.243948,15.24958,3.24693,12.295485,5678.298419,16.507618,0.0
min,0.0,81.0,72.0,35.0,1.0,9.0,139.0,1.0,4400.0,2460.0,250.0,250.0,8.0,24.0,2.5,2.0,3186.0,15.0,1.0
25%,1.0,608.0,501.75,204.0,17.0,42.0,840.0,62.75,9097.5,3739.0,450.0,800.0,60.0,68.0,11.1,17.0,7431.25,58.0,1.0
50%,1.0,1124.0,846.0,324.0,25.0,55.0,1266.5,184.0,11200.0,4401.0,500.0,1100.0,73.0,81.0,12.7,25.0,8952.5,69.0,1.0
75%,1.0,2096.25,1527.5,502.5,37.0,70.0,1944.5,502.75,13962.5,5371.5,600.0,1500.0,85.0,92.0,14.5,34.0,11534.0,81.0,1.0
max,1.0,13865.0,7260.0,2505.0,96.0,100.0,12408.0,5346.0,21700.0,8124.0,2340.0,6800.0,100.0,100.0,27.8,64.0,56233.0,118.0,1.0
