# Clustering

In [1]:
import numpy as np  # noqa: F401
import pandas as pd  # noqa: F401
import plotly.express as px  # noqa: F401
from sklearn.preprocessing import StandardScaler  # noqa: F401
from sklearn.cluster import KMeans, DBSCAN  # noqa: F401
from sklearn.metrics import silhouette_score  # noqa: F401

from utils import GRANT_DATA_CSV, GRANT_CLUSTERED_DATA_CSV  # noqa: F401

ModuleNotFoundError: No module named 'utils'

In [3]:
import warnings

warnings.filterwarnings("ignore")

In [4]:
# Load the grant table from the CSV path configured in the project's utils module.
grant_data = pd.read_csv(filepath_or_buffer=GRANT_DATA_CSV)

# Preview five randomly chosen rows (unseeded, so the rows differ between runs).
grant_data.sample(n=5)

Unnamed: 0,grant_name,AGRI,ARTS,BIOC,CENG,CHEM,COMP,DENT,EART,ENER,...,MATH,MEDI,MULT,NEUR,NURS,PHAR,PHYS,PSYC,SOCI,VETE
1320,Research Promotion and Technology Transfer Center,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1542,Key Laboratory of Ecology and Energy-saving St...,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
890,"Faculty of Medicine, Chulalongkorn University",41,12,76,16,57,27,11,18,11,...,7,161,59,16,10,30,21,4,24,16
1740,Suan Sunandha Rajabhat University,2,0,1,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,1,0
1436,Universiteit Stellenbosch,0,0,0,0,1,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0


In [None]:
# Every column except the grant name is used as a clustering feature.
feature_values = grant_data.drop(columns="grant_name")

# Log-transform value + 1; the +1 keeps zero entries finite.
log_values = np.log(feature_values + 1)

# Standardise each column, keeping the original row index and column labels.
scaler = StandardScaler()
X = pd.DataFrame(
    scaler.fit_transform(log_values),
    index=log_values.index,
    columns=log_values.columns,
)

# Quick visual check of two of the scaled features against each other.
px.scatter(X, x=X["MEDI"], y=X["PHYS"])

In [6]:
# Range of cluster counts to evaluate (inclusive on both ends).
min_value = 2
max_value = 30

# Silhouette score of a K-Means fit for every candidate cluster count.
# K-Means initialisation is random, so the seed is fixed: without it the
# scores (and therefore the selected cluster count) change between runs and
# the notebook cannot be reproduced with Restart & Run All.
scores = []
for n_clusters in range(min_value, max_value + 1):
    model = KMeans(n_clusters=n_clusters, max_iter=3000, random_state=42)
    # fit_predict fits and labels the training data in a single call.
    labels = model.fit_predict(X)
    scores.append(silhouette_score(X, labels))

# Plot score against cluster count; the figure is the cell's displayed output.
px.line(
    x=np.arange(min_value, max_value + 1),
    y=scores,
    labels={
        "x": "n_clusters",
        "y": "silhouette_score"
    },
    title="Silhouette Score"
)


In [7]:
# Find the highest silhouette score and its position in one pass; on ties the
# earliest (smallest) cluster count wins. Position 0 corresponds to min_value.
best_index, best_score = max(enumerate(scores), key=lambda item: item[1])
best_n_clusters = min_value + best_index

print(f"{best_score=}, {best_n_clusters=}")

best_score=0.8492077307993293, best_n_clusters=2


In [8]:
# Refit K-Means with the selected cluster count. The seed is fixed because
# K-Means initialisation is random; without it the cluster assignment (and the
# counts shown below) can differ on every run.
kmeans_model = KMeans(n_clusters=best_n_clusters, max_iter=3000, random_state=42)

# Attach the labels to a copy so that `grant_data` itself stays unmodified.
grant_clustered_data_kmeans = grant_data.copy()
grant_clustered_data_kmeans["cluster"] = kmeans_model.fit_predict(X)

# Number of rows with a non-null grant name in each cluster.
grant_clustered_data_kmeans.groupby("cluster")["grant_name"].count()

cluster
0    1840
1      23
Name: grant_name, dtype: int64

In [9]:
# Density-based clustering on cosine distance between the scaled feature rows.
dbscan_model = DBSCAN(metric="cosine", eps=0.15, min_samples=15)

# Build a labelled copy of the grant table; `grant_data` itself is not modified.
# DBSCAN assigns the label -1 to rows it leaves unclustered.
grant_clustered_data_dbscan = grant_data.assign(cluster=dbscan_model.fit_predict(X))

# Cluster sizes, ordered by cluster id.
grant_clustered_data_dbscan["cluster"].value_counts().sort_index()

cluster
-1     538
 0      80
 1      36
 2     311
 3      57
 4      85
 5      60
 6     156
 7      33
 8     134
 9      41
 10     20
 11     20
 12     46
 13     35
 14    108
 15     15
 16     18
 17     27
 18     25
 19     18
Name: count, dtype: int64

## Manually assigned labels for the DBSCAN clusters

-1 - Unclustered  
0 - Materials Science, Physics and Engineering  
1 - Psychology  
2 - Biochemistry  
3 - Immunology and Microbiology  
4 - Multidisciplinary  
5 - Neuroscience  
6 - Physics  
7 - Social Sciences  
8 - Agricultural Science  
9 - Chemical Engineering  
10 - Nursing  
11 - Chemistry  
12 - Earth Science  
13 - Pharmacology  
14 - Environmental Science  
15 - Veterinary Science  
16 - Chemistry, Pharmacology and Biochemistry  
17 - Energy, Engineering  
18 - Generalists (a bit of everything, mostly medical)  
19 - Computer Science and Math  

In [14]:
# Show the first rows assigned to DBSCAN cluster 5.
is_cluster_5 = grant_clustered_data_dbscan["cluster"] == 5
grant_clustered_data_dbscan.loc[is_cluster_5].head()

Unnamed: 0,grant_name,AGRI,ARTS,BIOC,CENG,CHEM,COMP,DENT,EART,ENER,...,MEDI,MULT,NEUR,NURS,PHAR,PHYS,PSYC,SOCI,VETE,cluster
21,National Institute on Aging,0,0,8,0,1,0,0,0,0,...,8,0,1,0,0,0,0,0,0,5
25,National Eye Institute,0,0,0,0,0,0,0,0,0,...,6,0,5,0,0,0,0,0,0,5
32,National Institute of Neurological Disorders a...,0,0,0,0,0,0,0,0,0,...,4,0,3,0,1,0,0,0,0,5
36,National Institute of Child Health and Human D...,0,0,1,0,0,0,0,0,0,...,4,1,2,0,0,0,0,0,0,5
57,Michael J. Fox Foundation for Parkinson's Rese...,0,0,0,0,0,0,0,0,0,...,2,0,2,0,0,0,0,0,0,5


In [13]:
# Persist the DBSCAN-labelled table to the configured path; the row index is not written.
grant_clustered_data_dbscan.to_csv(path_or_buf=GRANT_CLUSTERED_DATA_CSV, index=False)