In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib
from platform import python_version
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier,AdaBoostClassifier
from sklearn.svm import NuSVC
from sklearn.tree import ExtraTreeClassifier
from sklearn.metrics import classification_report,accuracy_score,recall_score
from sklearn.metrics import confusion_matrix,roc_curve,roc_auc_score

In [None]:
library ={
    'Pandas': pd,
    'Matplotlib': matplotlib,
    'Seaborn': sns,
    'Numpy': np,
    'Scikit-Learn': sklearn
}
print('Library Version:\n')
print(f"{'':-^20}|{'':-^10}")
print(f"{'Library':^20}|{'Version':^10}")
print(f"{'':-^20}|{'':-^10}")

for name, lib in sorted(library.items()):
    print(f"{name:<20}l{lib.version :>10}")

print(f'\nPython Version: {python_version()}')

In [3]:
# 导入数据
df = pd.read_csv('./injury_data.csv')
df.head()

Unnamed: 0,Player_Age,Player_Weight,Player_Height,Previous_Injuries,Training_Intensity,Recovery_Time,Likelihood_of_Injury
0,24,66.251933,175.732429,1,0.457929,5,0
1,37,70.996271,174.58165,0,0.226522,6,1
2,32,80.093781,186.329618,0,0.61397,2,1
3,28,87.473271,175.50424,1,0.252858,4,1
4,25,84.65922,190.175012,0,0.577632,1,1


In [7]:
df['Player_Weight'] = df['Player_Weight'].round(2)
df['Player_Height'] = df['Player_Height'].round(2)
df['Training_Intensity'] = df['Training_Intensity'].round(2)
df.head()

Unnamed: 0,Player_Age,Player_Weight,Player_Height,Previous_Injuries,Training_Intensity,Recovery_Time,Likelihood_of_Injury
0,24,66.25,175.73,1,0.46,5,0
1,37,71.0,174.58,0,0.23,6,1
2,32,80.09,186.33,0,0.61,2,1
3,28,87.47,175.5,1,0.25,4,1
4,25,84.66,190.18,0,0.58,1,1


In [8]:
df_info = pd.DataFrame(df.dtypes, columns = ['Dtype'])
df_info['Unique'] = df.nunique().values
df_info['Null'] = df.isnull().sum().values
df_info

Unnamed: 0,Dtype,Unique,Null
Player_Age,int64,22,0
Player_Weight,float64,863,0
Player_Height,float64,875,0
Previous_Injuries,int64,2,0
Training_Intensity,float64,101,0
Recovery_Time,int64,6,0
Likelihood_of_Injury,int64,2,0


In [14]:
# df Describe 让计算结果保留两位小数
with pd.option_context(
    'display.float_format', '{:.2f}'.format, # 添加千分位分隔符
    'display.max_columns', None, # 显示所有列
    'display.precision', 2 # 双重确保小数精度
):
    display(df.describe())

Unnamed: 0,Player_Age,Player_Weight,Player_Height,Previous_Injuries,Training_Intensity,Recovery_Time,Likelihood_of_Injury
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,28.23,74.79,179.75,0.52,0.49,3.47,0.5
std,6.54,9.89,9.89,0.5,0.29,1.7,0.5
min,18.0,40.19,145.29,0.0,0.0,1.0,0.0
25%,22.0,67.95,173.03,0.0,0.24,2.0,0.0
50%,28.0,75.02,180.03,1.0,0.48,4.0,0.5
75%,34.0,81.3,186.56,1.0,0.73,5.0,1.0
max,39.0,104.65,207.31,1.0,1.0,6.0,1.0


In [None]:
# 创建新列, 一列为BMI, 林外一列年龄类别
# BMI是一种用来评估一个人的体重与身高之间的关系是否健康的指标: 体重(千克)/(身高(米))²
# 在现实生活中 BMI 并不是评估运动员是否健康的最佳身体状态的最佳指标, 因为这个指标并没有考虑到运动员的身体组成,
# 比如强健的肌肉, 肌肉质量, 骨密度等, 都有可能被评估为超重

In [15]:
# 创建一列来计算BMI
df['BMI'] = df['Player_Weight']/(df['Player_Height']/100**2)

# 将BMI进行归类
gaps = [-float('inf'), 18.5, 24.9, 29.9, 34.9, 39.9, float('inf')]
categories = ['Underweight', 'Normal', 'Overweight', 'Obesity I', 'Obesity II', 'Obesity III']

# 创建BMI的分类列 ('BMI_classification')
df['BMI_Classification'] = pd.cut(df['BMI'], bins= gaps, labels= categories, right= False)
df.head()

Unnamed: 0,Player_Age,Player_Weight,Player_Height,Previous_Injuries,Training_Intensity,Recovery_Time,Likelihood_of_Injury,BMI,BMI_Classification
0,24,66.25,175.73,1,0.46,5,0,3769.98805,Obesity III
1,37,71.0,174.58,0,0.23,6,1,4066.903425,Obesity III
2,32,80.09,186.33,0,0.61,2,1,4298.287984,Obesity III
3,28,87.47,175.5,1,0.25,4,1,4984.045584,Obesity III
4,25,84.66,190.18,0,0.58,1,1,4451.572195,Obesity III


In [16]:
# 找出数据集中运动员的最小的年龄和最大的年龄
print('最小的年龄为: {}岁, 最大的年龄为: {}岁'.format(df['Player_Age'].min(), df['Player_Age'].max()))

最小的年龄为: 18岁, 最大的年龄为: 39岁


In [18]:
# 创建年龄分组列
df['Age_Group'] = pd.cut(df['Player_Age'], bins=[18, 22, 26, 30, 34, df['Player_Age'].max()],
                         labels=['18-22', '22-26', '26-30', '30-34', '35+'], include_lowest= True)
df.head()

Unnamed: 0,Player_Age,Player_Weight,Player_Height,Previous_Injuries,Training_Intensity,Recovery_Time,Likelihood_of_Injury,BMI,BMI_Classification,Age_Group
0,24,66.25,175.73,1,0.46,5,0,3769.98805,Obesity III,22-26
1,37,71.0,174.58,0,0.23,6,1,4066.903425,Obesity III,35+
2,32,80.09,186.33,0,0.61,2,1,4298.287984,Obesity III,30-34
3,28,87.47,175.5,1,0.25,4,1,4984.045584,Obesity III,26-30
4,25,84.66,190.18,0,0.58,1,1,4451.572195,Obesity III,22-26
