/
top500_plot.py
executable file
·195 lines (163 loc) · 8.17 KB
/
top500_plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys, json
from itertools import cycle, product
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt, dates as mpld, use
# Global matplotlib styling applied to every figure produced by this script.
plt.rcParams.update({
    'font.size': 20,
    # don't embed or render font in SVG output (https://stackoverflow.com/a/35734729)
    'svg.fonttype': 'none',
    'legend.fontsize': 'x-small',
})
##########################
# get the data
df = pd.read_csv('TOP500_history.csv', low_memory=False)
# Assemble a single 'Date' column from the separate Year/Month/Day columns.
# Combining columns through read_csv(parse_dates={...}) is deprecated since
# pandas 2.2, so the datetime is built explicitly and the parts are dropped.
_date_parts = ['Year', 'Month', 'Day']
df.insert(0, 'Date', pd.to_datetime(df[_date_parts].rename(columns=str.lower)))
df = df.drop(columns=_date_parts)
# Sanity check: every list edition must contain exactly 500 systems.
# Raised explicitly (rather than asserted) so the check survives `python -O`.
_systems_per_date = df.groupby('Date').size()
if not (_systems_per_date == 500).all():
    raise ValueError(
        "TOP500_history.csv: expected 500 systems per list date, got:\n%s"
        % _systems_per_date[_systems_per_date != 500])
# Make mostly-coherent processor family and vendor columns
def remap(procfam):
    """Map a raw processor-family string to an (ISA, Vendor) pair.

    Returns a two-element ``pd.Series`` ``(isa, vendor)``. A value that matches
    none of the known spellings is passed through unchanged as both the ISA
    and the vendor.
    """
    x86_64_families = (
        'Intel EM64T', 'Intel Nehalem', 'Intel Westmere', 'Intel SandyBridge',
        'Intel IvyBridge', 'Intel Haswell', 'Intel Core', 'Intel Broadwell',
        'Intel Skylake', 'Intel Cascade Lake', 'Intel Cascade lake',
        'Intel Ice Lake', 'AMD x86_64', 'AMD Zen (Naples)', 'AMD Zen-2 (Rome)',
        'AMD Zen-3 (Milan)',
    )
    if procfam in x86_64_families:
        # The vendor is the first word of the family name.
        return pd.Series(('x86-64', procfam.split()[0]))
    if procfam in ('Intel MIC', 'Intel Xeon Phi'):
        return pd.Series(('Xeon Phi', 'Intel'))
    if procfam in ('POWER', 'Power', 'PowerPC'):
        return pd.Series(('POWER', 'POWER'))
    if procfam == 'Intel IA-64':
        return pd.Series(('Itanium', 'Intel'))
    if procfam in ('Intel IA-32', 'AMD'):
        return pd.Series(('x86-32', procfam.split()[0]))
    if procfam == 'X86_64':
        # AMD/Chinese "Hygon Dhyana" system which appears in 2020/11 is mis-classified
        return pd.Series(('x86-64', "AMD"))
    # Unknown family: use the raw value for both columns.
    return pd.Series((procfam, procfam))
# Use 'Processor Family' where present; rows where it is missing fall back to
# 'Processor Technology'.
_family = df['Processor Family']
procfam = _family.mask(_family.isna(), df['Processor Technology'])
# remap() returns a two-element Series per value, so apply() produces two
# columns, which become 'ISA' and 'Vendor'.
_isa_vendor = procfam.apply(remap)
df[['ISA','Vendor']] = _isa_vendor
# get country codes
# Rewrite TOP500 country spellings so they match the names in country-en.csv.
_country_fixes = {
    'Saudia Arabia': 'Saudi Arabia',          # typo in TOP500 sources
    'Korea, South': 'South Korea',            # Match country-en.csv
    'Czech Republic': 'Czechia',              # Match country-en.csv
    'Slovak Republic': 'Slovakia',            # Match country-en.csv
    'Hong Kong': 'Hong Kong SAR China',       # Match country-en.csv
}
# Assign the result back instead of calling replace(inplace=True) on
# df['Country']: in-place mutation of a selected column is chained assignment
# and does not reliably write through to df on current pandas.
df['Country'] = df['Country'].replace(_country_fixes)
dfc_en = pd.read_csv('country-en.csv')
dfc_en.columns = ('CountryISO', 'Country')
# Left merge keeps every TOP500 row; unmatched countries get a null CountryISO.
df = df.merge(dfc_en, on='Country', how='left')
_unmatched = df['CountryISO'].isna()
if _unmatched.any():
    raise ValueError(
        "No ISO code in country-en.csv for: %s"
        % sorted(df.loc[_unmatched, 'Country'].astype(str).unique()))
# get localized labels and countries
# Read the label file through a context manager so the handle is closed, and
# decode it as UTF-8 explicitly (the per-language CSVs below are read the same way).
with open('labels-i18n.json', encoding='utf-8') as _labels_file:
    loclabels = json.load(_labels_file)
# Build one table indexed by CountryISO with a column of country names per
# language; the languages are the top-level keys of labels-i18n.json.
countries = None
for lang in loclabels:
    clang = pd.read_csv('country-%s.csv'%lang, encoding='utf-8')
    clang.columns = ('CountryISO', lang)
    if countries is None:
        countries = clang
    else:
        countries = countries.merge(clang, on='CountryISO')
countries = countries.set_index('CountryISO')
# Re-check that each list date still has exactly 500 rows (presumably guarding
# against the country merge having added or dropped rows -- confirm intent).
_systems_per_date = df.groupby('Date').size()
if not (_systems_per_date == 500).all():
    raise ValueError(
        "expected 500 systems per list date, got:\n%s"
        % _systems_per_date[_systems_per_date != 500])
##########################
# Find what set of countries (sorted by weight) account for most of the total counts
country_by_date = df.groupby(['Date','CountryISO']).size()
# Total number of list entries per country over all dates, largest first.
# Series.sum(level=...) was removed in pandas 2.0, so aggregate by grouping on
# the index level instead.
country_wt = (country_by_date.groupby(level='CountryISO', sort=False).sum()
              .sort_values(ascending=False)
              .to_frame('sum'))
# False while the running total (largest country first) is within 90% of all
# entries, True from the country that pushes it past 90% onwards.
cutoff = country_wt['sum'].cumsum() > 0.90*country_wt['sum'].sum()
country_by_date = country_by_date.unstack()  # pivot Country from row to column index
country_by_date = country_by_date.fillna(0)  # countries absent from a given list count as 0
# Two tables with columns ordered by weight: first the countries below the
# cutoff (cutoff False), then the remaining ones (cutoff True).
_is_minor = cutoff.to_numpy()
major_minor_countries = [ country_by_date.reindex(columns=country_wt.index[_is_minor == polarity])
                          for polarity in (False, True) ]
# plot it
# One two-panel stacked-area figure per language, saved as PNG and SVG.
for lang, langlabels in loclabels.items():
    # Restart the style cycles for every language so each figure uses the same styling.
    colors = cycle( list('bcgmry') )
    hatches = cycle(('/', '*', '\\', 'o', 'x', 'O', '.'))

    print("Plotting TOP500 systems by country (%s)..." % lang)
    fig = plt.figure(figsize=(14,10))
    sharex = None
    patches, labels = [], []
    dates = country_by_date.index

    for pos, cbd in enumerate(major_minor_countries):
        # pos 0 is drawn in the lower panel (subplot 2), pos 1 in the upper
        # panel (subplot 1); the second panel shares the first one's x axis.
        ax = plt.subplot(2, 1, 2-pos, sharex=sharex)
        sharex = ax
        edge = 0  # running top of the stack; becomes a Series after the first band

        for pp, ser in cbd.items():
            hatch = next(hatches)
            facecolor = next(colors)
            label = countries.loc[pp,lang]
            ax.fill_between(dates, edge, edge+ser, edgecolor='k', facecolor=facecolor, hatch=hatch, label=label)
            # Proxy rectangle for the legend, styled like the band just drawn.
            patches.append( plt.Rectangle((0,0), 2, 2, edgecolor='k', facecolor=facecolor, hatch=hatch) )
            labels.append(label)
            edge = edge + ser

        # Axis formatting does not depend on the band, so do it once per panel.
        ax.xaxis.set_major_formatter(mpld.DateFormatter("%Y"))
        ax.xaxis.set_major_locator(mpld.YearLocator())
        ax.xaxis.set_minor_locator(mpld.YearLocator(month=7))
        # Numeric rotation: a string such as '60' is rejected by current Matplotlib.
        plt.xticks(rotation=60)

        # show legend and labels
        ax.set_ylabel(langlabels['nsys'])
        # Leave 10% headroom above the stack, but never go above 500 systems.
        ax.set_ylim(top=min(500, edge.max() + 0.1*(edge.max() - edge.min())))
        if pos==0:
            ax.set_xlabel(langlabels['date'])
        elif pos==1:
            # Upper panel: hide the x tick labels and axis label, and carry the title.
            plt.setp(ax.get_xticklabels(), visible=False)
            ax.xaxis.label.set_visible(False)
            ax.set_title(langlabels['by_country'])

    # One legend for both panels, anchored to the right of the last panel drawn.
    ax.legend(patches, labels, loc='upper left', bbox_to_anchor=(1.02, 1), handleheight=1.2, handlelength=3, ncol=2)
    fig.subplots_adjust(left=.08, top=.92, bottom=0.12, right=0.6, hspace=0.02)
    ax.set_xlim(dates.min(), dates.max())
    fig.savefig("Countries_with_TOP500_supercomputers_%s.png"%lang, bbox_inches='tight')
    fig.savefig("Countries_with_TOP500_supercomputers_%s.svg"%lang, bbox_inches='tight')
    # Release the figure; otherwise one stays open per language until exit.
    plt.close(fig)
##########################
# Processor families by date, sorted by weight of ISA then by Vendor
proc_counts = df.groupby(['ISA','Vendor','Date']).size()
proc_by_date = proc_counts.unstack(level=(0,1)).fillna(0)  # pivot ISA,Vendor from row to column index
proc_wt = proc_by_date.sum().to_frame()                    # column 0: weight (ISA,Vendor) across all dates
# DataFrame.sum(level=...) was removed in pandas 2.0, so aggregate by grouping
# on the index level instead.
ISA_wt = proc_wt.groupby(level='ISA', sort=False).sum()    # weight by (ISA) across all dates
ISA_wt.columns = [1]                                       # column 1: total weight of the row's ISA
# Broadcast each ISA total onto every (ISA,Vendor) row so both weights can be sort keys.
proc_wt = proc_wt.join( ISA_wt.reindex(proc_wt.index, level='ISA') )
# Heaviest ISA first; within an ISA, heaviest vendor first.
proc_wt = proc_wt.sort_values([1,0], ascending=[False,False])
proc_by_date = proc_by_date.reindex(columns=proc_wt.index)
# plot it
# One stacked-area figure per language, saved as PNG and SVG.
for lang, langlabels in loclabels.items():
    # Restart the style cycles for every language so each figure uses the same styling.
    colors = cycle( list('bcgmry') )
    hatches = cycle(('/', '*', '\\', 'o', 'x', 'O', '.'))

    print("Plotting TOP500 systems by process family (%s)..." % lang)
    fig = plt.figure(figsize=(14,10))
    ax = fig.gca()
    patches, labels = [], []
    dates = proc_by_date.index
    edge = 0  # running top of the stack; becomes a Series after the first band
    pplast = facecolor = bottom = None

    for pp, ser in proc_by_date.items():
        # Column keys are (ISA, Vendor) tuples; wrap a bare string key in a tuple.
        if isinstance(pp, str): pp=pp,
        # New hatch whenever the ISA changes from the previous column ...
        if pplast is None or pp[0]!=pplast[0]:
            hatch = next(hatches)
        # ... and new face colour whenever the vendor changes.
        if pplast is None or len(pp)<2 or pp[1]!=pplast[1]:
            facecolor = next(colors)
        # Show the vendor in parentheses only when it differs from the ISA name.
        label = ("%s (%s)"%pp if pp[0]!=pp[1] else pp[0])
        ax.fill_between(dates, edge, edge+ser, edgecolor='k', facecolor=facecolor, hatch=hatch, label=label)
        # Proxy rectangle for the legend, styled like the band just drawn.
        patches.append( plt.Rectangle((0,0), 2, 2, edgecolor='k', facecolor=facecolor, hatch=hatch) )
        labels.append(label)
        edge = edge + ser
        pplast = pp
        if bottom is None:
            # Lower y limit is derived from the first band only: 10% of its
            # range below its minimum, clamped at zero.
            bottom = max(0, edge.min() - 0.1*(edge.max() - edge.min()))

    # Axis formatting does not depend on the band, so do it once per figure.
    ax.xaxis.set_major_formatter(mpld.DateFormatter("%Y"))
    ax.xaxis.set_major_locator(mpld.YearLocator(month=6))
    ax.xaxis.set_minor_locator(mpld.YearLocator(month=11))
    plt.xticks(rotation=60)

    # show legend and labels
    ax.legend(patches, labels, loc='upper left', bbox_to_anchor=(1.02, 1), handleheight=1, handlelength=4)
    fig.subplots_adjust(left=.08, top=.92, bottom=0.12, right=0.75)
    ax.set_xlabel(langlabels['date'])
    ax.set_ylabel(langlabels['nsys'])
    ax.set_title(langlabels['by_procfam'])
    ax.set_xlim(dates.min(), dates.max())
    # Leave 10% headroom above the stack, but never go above 500 systems.
    ax.set_ylim(bottom, min(500, edge.max() + 0.1*(edge.max() - edge.min())))
    fig.savefig("Processor_families_in_TOP500_supercomputers_%s.png"%lang, bbox_inches='tight')
    fig.savefig("Processor_families_in_TOP500_supercomputers_%s.svg"%lang, bbox_inches='tight')
    # Release the figure; otherwise one stays open per language until exit.
    plt.close(fig)