In [54]:
# 先看看数据集的基本信息
import pandas as pd
import numpy as np
# Column reference for the Titanic training set:
#   PassengerId - passenger ID
#   Survived    - survival outcome
#   Pclass      - passenger cabin class
#   Name        - passenger name
#   Sex         - sex
#   Age         - age
#   SibSp       - number of same-generation direct relatives
#   Parch       - number of different-generation direct relatives
#   Ticket      - ticket number
#   Fare        - ticket fare
#   Cabin       - cabin number
#   Embarked    - port of embarkation
TRAIN_CSV = "./titanic/train.csv"

# Load the training data into the notebook-wide `data` frame.
data = pd.read_csv(TRAIN_CSV)

# Display the Name column as this cell's output.
data.loc[:, 'Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [43]:
# Drop the Name column; the result is rebound to `data`.
# Re-running this cell without reloading the CSV raises KeyError, because
# the column no longer exists after the first run.
data = data.drop('Name', axis=1)

In [28]:
# Start with data cleaning:
## - handle missing values
## - handle outliers
## - convert data types

In [44]:
## Missing-value handling
### Usual approach: fill numeric columns with the median and
### discrete (categorical) columns with the mode.
data.info()
### Observation from the author: missing values are in Age, Cabin and Embarked;
### the latter two are categorical, Age is numeric.
### NOTE(review): the recorded output of this cell lists Age and Embarked as
### 891 non-null, so the observation appears to come from an earlier run --
### confirm with Restart Kernel -> Run All.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [45]:
## Fill the missing Age values with the column median.
from sklearn.impute import SimpleImputer     # imputer class used below

# scikit-learn transformers take a 2-D feature matrix, so the Age column is
# reshaped into a single-column (n, 1) array before being passed in.
age = data['Age'].to_numpy().reshape(-1, 1)

# Build the median imputer and apply it in one step. Note that imp_median
# holds the filled (n, 1) array, not the imputer object.
imp_median = SimpleImputer(strategy='median', missing_values=np.nan).fit_transform(age)

# Write the filled values back into the Age column.
data['Age'] = imp_median

# Count the remaining nulls; 0 means every gap has been filled.
data['Age'].isna().sum()

0

In [46]:
# Re-inspect the per-column non-null counts and dtypes after the Age fill.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [47]:
## Fill the missing Embarked values with the most frequent value (mode).
from sklearn.impute import SimpleImputer  # re-import is a no-op if already loaded

# scikit-learn transformers take a 2-D feature matrix, so the column is
# reshaped into a single-column (n, 1) array. It gets its own name so the
# `age` array built for the Age column is not overwritten with port codes.
embarked = data['Embarked'].values.reshape(-1, 1)

# Keep the imputer and its output under separate names. imp_most_frequent
# still ends up holding the filled (n, 1) array, as it did before.
embarked_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_most_frequent = embarked_imputer.fit_transform(embarked)

# Write the filled values back into data['Embarked'], flattened to 1-D so the
# assignment is an ordinary column assignment.
data['Embarked'] = imp_most_frequent.ravel()

# Count the remaining nulls; 0 means every gap has been filled.
data['Embarked'].isnull().sum()

0

In [48]:
# Re-inspect the per-column non-null counts and dtypes after the Embarked fill.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [49]:
## Cabin has too many missing values, so drop the column for now.
data = data.drop('Cabin', axis=1)

In [50]:
# Inspect the frame again after dropping the Cabin column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(3)
memory usage: 69.7+ KB


In [52]:

## Data type conversion
### Numeric columns: cast each of the listed columns to float64, one at a
### time, writing the converted column back into `data`.
for column in ('PassengerId', 'Survived', 'Pclass', 'SibSp', 'Parch'):
    data[column] = data[column].astype('float64')

In [53]:
# Display the Ticket column to inspect its raw values.
data['Ticket']

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object