#                                  数据分析报告——叶子嫣

## 环境准备与数据加载

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Load the CSV file into the DataFrame that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer='train_and_test2.csv')

## 数据初步检查

In [None]:
# Peek at the first five rows of the raw data.
df.head(n=5)

# Column dtypes and non-null counts (pandas prints this report itself).
df.info()

# Number of missing values in each column.
df.isna().sum()

## 数据清洗与预处理

In [None]:
# Columns retained for the analysis.
cols_to_keep = ['Passengerid','Age','Fare','Sex','sibsp','Parch','Pclass','Survived']

# Rename the target column '2urvived' to 'Survived', then keep only the
# selected columns.
df = df.rename(columns={'2urvived':'Survived'})[cols_to_keep]

# An age of 0 is treated as missing; missing ages are then filled with the
# median of the remaining ages.
age_with_gaps = df['Age'].replace(0, np.nan)
df['Age'] = age_with_gaps.fillna(age_with_gaps.median())

# Store Sex and Pclass with the categorical dtype.
df = df.astype({'Sex': 'category', 'Pclass': 'category'})

## 基础统计分析

In [None]:
# Summary statistics of the DataFrame.
# NOTE(review): a bare expression is rendered only when it is the last
# statement of its notebook cell -- confirm this line sits in its own cell.
df.describe()

# Frequency of each Sex / Pclass value. These are printed explicitly: as bare
# expressions on consecutive lines, the first result was silently discarded
# because only a cell's final expression is displayed.
print(df['Sex'].value_counts())
print(df['Pclass'].value_counts())

# Overall survival rate: the mean of the Survived column, shown as a percentage.
survival_rate = df['Survived'].mean()
print(f'整体生存率：{survival_rate:.2%}')

# Mean of Survived for every Pclass x Sex combination. Both keys are
# categorical, so observed=False is stated explicitly: it keeps the current
# result and avoids the pandas FutureWarning about the changing default.
pd.pivot_table(
    df,
    values='Survived',
    index='Pclass',
    columns='Sex',
    aggfunc='mean',
    observed=False,
)

## 数据可视化

### 单变量分布

In [None]:
# Matplotlib's bundled default font (DejaVu Sans) has no CJK glyphs, so the
# Chinese titles below would be drawn as empty boxes. Put commonly installed
# CJK-capable fonts ahead of the configured fallbacks; fonts that are not
# installed are simply skipped. The de-duplication keeps the list from growing
# when the cell is re-run.
cjk_fonts = ['SimHei', 'Microsoft YaHei', 'Arial Unicode MS', 'Noto Sans CJK SC']
plt.rcParams['font.sans-serif'] = cjk_fonts + [
    name for name in plt.rcParams['font.sans-serif'] if name not in cjk_fonts
]
# Several CJK fonts lack the Unicode minus sign; fall back to the ASCII hyphen
# so negative tick labels still render.
plt.rcParams['axes.unicode_minus'] = False

# Histogram of passenger age (30 bins) with a kernel density estimate overlay.
sns.histplot(df['Age'], bins=30, kde=True)
plt.title('乘客年龄分布')
plt.show()

# Box plot of the ticket fare distribution.
sns.boxplot(x=df['Fare'])
plt.title('票价分布')
plt.show()

### 多变量分析

In [None]:
# Age distribution per passenger class, one box per Pclass value.
sns.boxplot(data=df, x='Pclass', y='Age').set_title('各舱位乘客年龄分布')
plt.show()

# Mean of Survived for each Sex value (barplot's default estimator is the mean).
sns.barplot(data=df, x='Sex', y='Survived').set_title('性别与生存率')
plt.show()

### 相关性热力图

In [None]:
# Pairwise correlation between the numeric columns only.
# NOTE(review): if Sex and Pclass were converted to category earlier, they are
# excluded here, while the Passengerid identifier is still included -- confirm
# this is the intended feature set.
corr = df.corr(numeric_only=True)

# Annotated heatmap of the correlation matrix on a diverging colour map.
heatmap_axes = sns.heatmap(data=corr, cmap='coolwarm', annot=True)
heatmap_axes.set_title('特征相关性')
plt.show()