# 用聚类算法分析一下LCK春季赛数据


In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np

# Download the player-statistics page. The timeout keeps a stalled connection
# from hanging the kernel, and raise_for_status() stops the notebook on an HTTP
# error instead of silently parsing an error page.
response = requests.get(
    "https://lol.fandom.com/wiki/LCK/2023_Season/Spring_Season/Player_Statistics",
    timeout=30,
)
response.raise_for_status()
html_content = response.text

# Parse the HTML with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')

# Collect every <table> element on the page
tables = soup.find_all('table')


In [2]:
# Number of <table> elements found on the page
len(tables)

6

In [3]:
# Pick the table at index 4 of the tables found above. The position is
# hard-coded, so re-check it if the page layout changes.
table = tables[4]

In [4]:
# Collect the stripped text of every <th> cell in the selected table
headers = [th.text.strip() for th in table.find_all('th')]

In [5]:
# Skip the first four <th> texts (presumably title/banner cells that sit above
# the real column-header row — TODO confirm against the page).
headers_test = headers[4:]
# Overwrite two labels by position with readable names. The indices refer to
# the list BEFORE the deletions below, so the statement order matters.
headers_test[-9] = 'Gold'
headers_test[1] = "Name"
# Remove one leading and two trailing headers; the same three positions are
# removed from every data row further down, keeping headers and cells aligned.
del headers_test[0]
del headers_test[-1]
del headers_test[-1]
headers_test

['Name',
 'G',
 'W',
 'L',
 'WR',
 'K',
 'D',
 'A',
 'KDA',
 'CS',
 'CS/M',
 'Gold',
 'G/M',
 'DMG',
 'DMG/M',
 'KPAR',
 'KS',
 'GS']

In [6]:
# Gather the stripped text of the <td> cells of every table row, skipping rows
# that contain no <td> at all.
rows = []
for tr in table.find_all('tr'):
    cell_texts = [td.text.strip() for td in tr.find_all('td')]
    if not cell_texts:
        continue
    rows.append(cell_texts)

In [7]:
# Skip the first two collected rows (presumably non-player rows — TODO confirm)
rows_test = rows[2:]

In [8]:
# Drop the first cell and the last two cells of every data row. The rows are
# rebuilt as new lists from `rows` rather than trimmed with `del`, so
# re-running this cell cannot strip the same rows a second time.
rows_test = [row[1:-2] for row in rows[2:]]

In [10]:
# Inspect the first trimmed row
rows_test[0]

['Effort',
 '44',
 '13',
 '31',
 '29.5%',
 '0.41',
 '2.39',
 '5.32',
 '2.4',
 '28.5',
 '0.86',
 '7.5',
 '227',
 '4.6k',
 '139.5',
 '69.4%',
 '5%',
 '13.4%']

In [11]:
import pandas as pd

In [12]:
# Build the DataFrame from the trimmed rows, using the cleaned header list as
# column names
df = pd.DataFrame(rows_test, columns=headers_test)

In [13]:
# Preview the first rows; every value is still the text scraped from the page
df.head()

Unnamed: 0,Name,G,W,L,WR,K,D,A,KDA,CS,CS/M,Gold,G/M,DMG,DMG/M,KPAR,KS,GS
0,Effort,44,13,31,29.5%,0.41,2.39,5.32,2.4,28.5,0.86,7.5,227,4.6k,139.5,69.4%,5%,13.4%
1,Hena,44,13,31,29.5%,2.23,2.14,2.77,2.34,306.77,9.22,13.4,401,18.1k,544.6,60.6%,27%,23.6%
2,Karis,44,13,31,29.5%,2.45,2.18,3.0,2.5,285.57,8.58,12.6,379,17k,511.3,66.1%,29.8%,22.3%
3,Morgan,44,13,31,29.5%,1.59,2.45,2.7,1.75,281.09,8.44,12.5,374,13.3k,399.5,52.1%,19.3%,22.1%
4,UmTi,44,13,31,29.5%,1.57,2.77,4.64,2.24,181.93,5.47,10.5,315,10.7k,320.8,75.2%,19%,18.6%


Now that we have the data, we first keep only the players who played at least 10 games.

In [14]:
def convert_to_float(value):
    """Convert one scraped table cell to a float when it looks numeric.

    Recognised textual forms:

    * percentages: ``"29.5%"`` -> ``0.295``
    * thousands with a trailing ``k``/``K``: ``"4.6k"`` -> ``4600.0``
    * plain numerals: ``"44"`` -> ``44.0``

    Parameters
    ----------
    value : any
        Cell content, typically a ``str`` taken from the HTML table.

    Returns
    -------
    float or the original ``value``
        The parsed number, or ``value`` unchanged when it cannot be parsed
        (for example a player name, or ``None`` from a padded short row).
    """
    try:
        if isinstance(value, str):
            # Ignore surrounding whitespace so that the suffix checks below
            # see the real last character.
            text = value.strip()
            if text.endswith('%'):
                return float(text.rstrip('%')) / 100
            # Only a trailing k/K is a thousands suffix; a 'K' elsewhere
            # (as in a player name) is not.
            if text.upper().endswith('K'):
                return float(text[:-1]) * 1000
            return float(text)
        return float(value)
    except (TypeError, ValueError):
        # ValueError: non-numeric text. TypeError: objects float() rejects,
        # such as None. In both cases the cell is passed through untouched.
        return value

# Convert every statistic column to numbers. 'Name' is left alone so that a
# handle which happens to parse as a number is never rewritten. Series.map is
# applied per column because DataFrame.applymap is deprecated in recent pandas.
stat_columns = [column for column in df.columns if column != 'Name']
df = df.assign(**{column: df[column].map(convert_to_float) for column in stat_columns})

In [15]:
# Preview the frame after conversion: percentages are now fractions and
# 'k'-suffixed values are expanded to plain numbers
df.head()

Unnamed: 0,Name,G,W,L,WR,K,D,A,KDA,CS,CS/M,Gold,G/M,DMG,DMG/M,KPAR,KS,GS
0,Effort,44.0,13.0,31.0,0.295,0.41,2.39,5.32,2.4,28.5,0.86,7.5,227.0,4600.0,139.5,0.694,0.05,0.134
1,Hena,44.0,13.0,31.0,0.295,2.23,2.14,2.77,2.34,306.77,9.22,13.4,401.0,18100.0,544.6,0.606,0.27,0.236
2,Karis,44.0,13.0,31.0,0.295,2.45,2.18,3.0,2.5,285.57,8.58,12.6,379.0,17000.0,511.3,0.661,0.298,0.223
3,Morgan,44.0,13.0,31.0,0.295,1.59,2.45,2.7,1.75,281.09,8.44,12.5,374.0,13300.0,399.5,0.521,0.193,0.221
4,UmTi,44.0,13.0,31.0,0.295,1.57,2.77,4.64,2.24,181.93,5.47,10.5,315.0,10700.0,320.8,0.752,0.19,0.186


In [334]:
# Keep only players with at least 10 games. .copy() makes the result an
# independent frame, so adding the 'Cluster' column to it later does not write
# into a slice of the unfiltered frame (SettingWithCopyWarning).
df = df[df['G'] >= 10].copy()

In [19]:
# Remove the player name and every column tied to the number of games played
# (G, W, L, WR). That is five columns, listed by name so the selection does
# not depend on column order.
non_feature_columns = ['Name', 'G', 'W', 'L', 'WR']
df_pure_data = df.drop(columns=non_feature_columns)

# 'Cluster' only exists once the labelling cell has been run. Dropping it here
# keeps a re-run of this cell from feeding old labels back in as a feature.
df_pure_data = df_pure_data.drop(columns=['Cluster'], errors='ignore')

# Display the feature-only DataFrame
df_pure_data.head()

Unnamed: 0,K,D,A,KDA,CS,CS/M,Gold,G/M,DMG,DMG/M,KPAR,KS,GS
0,0.41,2.39,5.32,2.4,28.5,0.86,7.5,227.0,4600.0,139.5,0.694,0.05,0.134
1,2.23,2.14,2.77,2.34,306.77,9.22,13.4,401.0,18100.0,544.6,0.606,0.27,0.236
2,2.45,2.18,3.0,2.5,285.57,8.58,12.6,379.0,17000.0,511.3,0.661,0.298,0.223
3,1.59,2.45,2.7,1.75,281.09,8.44,12.5,374.0,13300.0,399.5,0.521,0.193,0.221
4,1.57,2.77,4.64,2.24,181.93,5.47,10.5,315.0,10700.0,320.8,0.752,0.19,0.186


In [20]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Discard every row that has at least one missing feature value
df_pure_data = df_pure_data.dropna(axis=0, how='any')

# Standardise each feature to zero mean and unit variance so that all columns
# contribute on a comparable scale
scaler = StandardScaler()
scaler.fit(df_pure_data)
scaled_data = scaler.transform(df_pure_data)

In [21]:
# Number of clusters to look for (presumably one per in-game role — TODO confirm)
n_clusters = 5

# Fit K-Means on the standardised features. random_state fixes the centroid
# seeding. n_init is pinned explicitly because scikit-learn changed its default
# (10 -> 'auto') between releases, which would otherwise make the resulting
# clusters depend on the installed version.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42, max_iter=2000)
kmeans.fit(scaled_data)

# Cluster index assigned to each row of scaled_data, in row order
cluster_labels = kmeans.labels_

In [22]:
# Attach the labels by index rather than by position. df_pure_data may have
# lost rows to dropna(), in which case a bare array no longer lines up with df.
# Rows that were not clustered get NaN.
df['Cluster'] = pd.Series(cluster_labels, index=df_pure_data.index)

In [28]:
# Names of the players assigned to cluster 0
df.loc[df['Cluster'] == 0, 'Name']

7          Deft
9     ShowMaker
17        Chovy
21         Peyz
25        Viper
27       Aiming
28          Bdd
48       Callme
49        Faker
50     Gumayusi
Name: Name, dtype: object

这个算法并没有将ad和mid分开，几乎都是排名前五的中下，除了：
1. Viper：帮自家中单擦屁股
2. Callme：战队排名倒数第一，纯纯院长

In [24]:
# Names of the players assigned to cluster 1
df.loc[df['Cluster'] == 1, 'Name']

4         UmTi
6       Canyon
14       Croco
15       Juhan
20      Peanut
22        Clid
29        Cuzz
35    YoungJae
42      Willer
45      Sylvie
52        Oner
Name: Name, dtype: object

都是打野

In [25]:
# Names of the players assigned to cluster 2
df.loc[df['Cluster'] == 2, 'Name']

1        Hena
2       Karis
3      Morgan
5       Canna
11       FATE
12     Rascal
13    deokdam
16     Pleata
19      Doran
23     Kingen
26       ZEKA
30       Kiin
32    BuLLDoG
33       DuDu
34    Taeyoon
38     Burdol
39     Clozer
40      Envyy
43       DnDn
46      vital
47     FIESTA
53       Zeus
Name: Name, dtype: object

AD and MID, standing:
倒数后五名的队伍，ad和中单的分类和上单一样。。。
1. Hena, Karis
2. deokdam, FATE
3. Taeyoon, BuLLDoG
4. Envyy, Clozer
5. vital, FIESTA

TOP:
1. Morgan
2. Canna
3. Rascal
4. Doran
5. Kingen
6. Kiin
7. DuDu
8. Zeus
9. DnDn

除了ZEKA，纯纯战犯

In [29]:
# Names of the players assigned to cluster 3
df.loc[df['Cluster'] == 3, 'Name']

8      Kellin
18    Delight
51      Keria
Name: Name, dtype: object

。。。这三个辅助能单独列出一档啊，离谱。。。

In [30]:
# Names of the players assigned to cluster 4
df.loc[df['Cluster'] == 4, 'Name']

0      Effort
10      BeryL
24       Life
31    Lehends
36        Jun
37      Moham
41       Kael
44      Peter
Name: Name, dtype: object

都是常规辅助，如果数据包括CP（英雄选用），Lehends 和 Kael能排到前一档。