# Librerías y configuración

In [217]:
import numpy as np
import pandas as pd
import plotly.express as px

In [218]:
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay

# Lectura de datos

In [219]:
df = pd.read_excel('Data/AllMetrics.xlsx')
df.drop(columns='Unnamed: 0',inplace=True)
df.set_index('Scopus author ID',inplace=True)
df.head()

Unnamed: 0_level_0,Most recent publication,Citations,Citations per Publication,Field-Weighted Citation Impact,h-index,Output in Top 10% Citation Percentiles (field-weighted),Oldest publication (since 1996),CiteScore,SNIP,h-index Ponderado
Scopus author ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10038895100,2017,36,36.0,2.13,1,12.0,2017,7.4,1.45,1.0
10039007800,2018,24,24.0,3.1,1,7.0,2018,3.5,1.19,1.0
10039013100,2020,0,0.0,0.0,0,71.0,2020,2.6,1.0,0.0
10039034400,2019,25,25.0,3.72,1,5.0,2019,2.7,0.88,1.0
10039108900,2021,6,6.0,1.63,1,18.0,2021,16.8,2.29,1.0


# Selección de columnas relevantes

In [220]:
cols = ['Most recent publication','Citations','h-index Ponderado',
        'Oldest publication (since 1996)']
df.drop(columns=cols,inplace=True)
df.head()

Unnamed: 0_level_0,Citations per Publication,Field-Weighted Citation Impact,h-index,Output in Top 10% Citation Percentiles (field-weighted),CiteScore,SNIP
Scopus author ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10038895100,36.0,2.13,1,12.0,7.4,1.45
10039007800,24.0,3.1,1,7.0,3.5,1.19
10039013100,0.0,0.0,0,71.0,2.6,1.0
10039034400,25.0,3.72,1,5.0,2.7,0.88
10039108900,6.0,1.63,1,18.0,16.8,2.29


# Clustering

In [221]:
df_cl= df.copy()

cluster = KMeans(n_clusters=5,random_state=0,n_init=10)
cluster.fit(df_cl)
df_cl['Cluster'] = cluster.labels_.astype(str)
df_cl.head()

Unnamed: 0_level_0,Citations per Publication,Field-Weighted Citation Impact,h-index,Output in Top 10% Citation Percentiles (field-weighted),CiteScore,SNIP,Cluster
Scopus author ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10038895100,36.0,2.13,1,12.0,7.4,1.45,3
10039007800,24.0,3.1,1,7.0,3.5,1.19,3
10039013100,0.0,0.0,0,71.0,2.6,1.0,2
10039034400,25.0,3.72,1,5.0,2.7,0.88,3
10039108900,6.0,1.63,1,18.0,16.8,2.29,3


In [222]:
df_cl['Cluster'].value_counts()

2    52234
0    51468
3    29548
4     2721
1      138
Name: Cluster, dtype: int64

In [223]:
df_cl.index.size

136109

In [226]:
df_centers= pd.DataFrame(cluster.cluster_centers_, columns=df.columns)

cen_std= df_centers.copy()
cen_std = StandardScaler().fit_transform(cen_std)
cen_std = pd.DataFrame(cen_std,columns=df.columns)
cen_std.head()

Unnamed: 0,Citations per Publication,Field-Weighted Citation Impact,h-index,Output in Top 10% Citation Percentiles (field-weighted),CiteScore,SNIP
0,-0.643139,-0.741825,1.309056,0.61197,-0.756205,-0.735441
1,1.951295,1.860698,-0.629184,-1.020199,1.703695,1.74605
2,-0.676735,-0.846454,-1.576494,1.648338,-1.037168,-1.040311
3,-0.56219,-0.478195,0.488324,-0.397866,-0.427303,-0.389963
4,-0.069231,0.205776,0.408297,-0.842243,0.516981,0.419664


In [227]:
cen_std= cen_std.reset_index().melt(id_vars="index")

In [229]:
fig1 = px.line_polar(cen_std, r="value", theta="variable", color="index", line_close=True,
                     color_discrete_sequence=px.colors.qualitative.Bold)
fig1.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



# Clustering normalizando variables

In [133]:
df_std= df.copy()

df_std = StandardScaler().fit_transform(df_std)
df_std = pd.DataFrame(df_std,columns=df.columns)
df_std.head()

Unnamed: 0,Citations per Publication,Field-Weighted Citation Impact,h-index,Output in Top 10% Citation Percentiles (field-weighted),CiteScore,SNIP
0,1.288297,0.593097,-0.310433,-1.539218,0.392334,0.279274
1,0.706971,1.104609,-0.310433,-1.794451,-0.352526,-0.006337
2,-0.45568,-0.53012,-0.935354,1.472524,-0.524417,-0.215053
3,0.755415,1.431555,-0.310433,-1.896544,-0.505318,-0.346873
4,-0.165017,0.329431,-0.310433,-1.23294,2.187636,1.202016


In [134]:
cluster1 = KMeans(n_clusters=7,random_state=0,n_init=10)
cluster1.fit(df_std)

df_std['Cluster'] = cluster1.labels_.astype(str)
df_std.head()

Unnamed: 0,Citations per Publication,Field-Weighted Citation Impact,h-index,Output in Top 10% Citation Percentiles (field-weighted),CiteScore,SNIP,Cluster
0,1.288297,0.593097,-0.310433,-1.539218,0.392334,0.279274,2
1,0.706971,1.104609,-0.310433,-1.794451,-0.352526,-0.006337,2
2,-0.45568,-0.53012,-0.935354,1.472524,-0.524417,-0.215053,6
3,0.755415,1.431555,-0.310433,-1.896544,-0.505318,-0.346873,2
4,-0.165017,0.329431,-0.310433,-1.23294,2.187636,1.202016,1


In [135]:
df_std['Cluster'].value_counts()

6    52999
4    52938
2    14891
0    10043
1     4584
3      558
5       96
Name: Cluster, dtype: int64

In [137]:
df_centers= pd.DataFrame(cluster1.cluster_centers_, columns=df.columns)

cen_std= df_centers.copy()
cen_std = StandardScaler().fit_transform(cen_std)
cen_std = pd.DataFrame(cen_std,columns=df.columns)
cen_std.head()

Unnamed: 0,Citations per Publication,Field-Weighted Citation Impact,h-index,Output in Top 10% Citation Percentiles (field-weighted),CiteScore,SNIP
0,-0.573155,-0.633471,2.427521,0.625912,-0.54877,-0.504056
1,-0.345364,-0.401435,-0.321715,-0.274132,-0.010301,-0.159251
2,-0.30294,-0.258179,-0.335946,-0.766132,-0.502371,-0.462425
3,2.344066,2.233324,-0.307386,-1.243178,-0.116518,-0.232225
4,-0.630884,-0.671707,-0.266293,0.512772,-0.55693,-0.500379


In [138]:
cen_std= cen_std.reset_index().melt(id_vars="index")
fig2 = px.line_polar(cen_std, r="value", theta="variable", color="index", line_close=True)
fig2.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



# Clasificación con kNN

In [139]:
df_kNN = df_cl.copy()
df_kNN

Unnamed: 0_level_0,Citations per Publication,Field-Weighted Citation Impact,h-index,Output in Top 10% Citation Percentiles (field-weighted),CiteScore,SNIP,Cluster
Scopus author ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10038895100,36.000000,2.130,1,12.0,7.400000,1.450000,3
10039007800,24.000000,3.100,1,7.0,3.500000,1.190000,3
10039013100,0.000000,0.000,0,71.0,2.600000,1.000000,2
10039034400,25.000000,3.720,1,5.0,2.700000,0.880000,3
10039108900,6.000000,1.630,1,18.0,16.800000,2.290000,3
...,...,...,...,...,...,...,...
9943259700,1.000000,0.070,1,73.0,4.100000,1.250000,2
9943431900,0.300000,0.030,1,71.4,2.260000,0.511000,2
9943655600,7.392857,0.595,9,48.0,7.714286,1.190357,0
9943688000,1.500000,0.710,1,35.0,5.850000,1.450000,0


In [140]:
target = 'Cluster'
X = df_kNN.drop(columns=target)
Y = df_kNN[target]

## k = 3

In [168]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=.3,random_state=0,stratify=Y)
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, Y_train)

pred = neigh.predict(X_test)
scor = neigh.score(X_test,Y_test)
scor

0.9959836406827811

In [169]:
cm = confusion_matrix(Y_test, pred)
cm

array([[15366,     0,    32,    43,     0],
       [    0,    41,     0,     0,     0],
       [   34,     0, 15636,     0,     0],
       [   37,     0,     0,  8815,    13],
       [    0,     0,     0,     5,   811]])

In [215]:
px.imshow(cm,text_auto=True,color_continuous_scale=px.colors.sequential.Plotly3,
          range_color=[0,70])

## k = 5

In [165]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=.3,random_state=0,stratify=Y)
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train, Y_train)

pred = neigh.predict(X_test)
scor = neigh.score(X_test,Y_test)
scor

0.9955918007493939

In [166]:
cm = confusion_matrix(Y_test, pred)
cm

array([[15366,     0,    31,    44,     0],
       [    0,    41,     0,     0,     0],
       [   31,     0, 15639,     0,     0],
       [   52,     0,     0,  8797,    16],
       [    0,     0,     0,     6,   810]])

In [167]:
px.imshow(cm,text_auto=True)

## k = 7

In [171]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=.3,random_state=0,stratify=Y)
neigh = KNeighborsClassifier(n_neighbors=7)
neigh.fit(X_train, Y_train)

pred = neigh.predict(X_test)
scor = neigh.score(X_test,Y_test)
scor

0.995420370778537

In [173]:
cm = confusion_matrix(Y_test, pred)
cm

array([[15365,     0,    28,    48,     0],
       [    0,    41,     0,     0,     0],
       [   34,     0, 15636,     0,     0],
       [   55,     0,     0,  8797,    13],
       [    0,     0,     0,     9,   807]])

In [174]:
px.imshow(cm,text_auto=True)