In [73]:
# Imports: standard library first, then third-party packages.
import warnings

import numpy as np
import pandas as pd

# Load the training and test splits from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation/chained-assignment warnings.
warnings.filterwarnings('ignore')

In [74]:
# Report (rows, columns) for each split, test first.
for label, frame in (("test_shape:", test), ('train_shape:', train)):
    print(label, frame.shape)

test_shape: (418, 11)
train_shape: (891, 12)


In [75]:
# Stack train on top of test so that imputation and encoding are applied to both
# in one pass. ignore_index=True renumbers the rows 0..N-1, with the train rows first.
full = pd.concat([train, test], ignore_index=True)

In [76]:
# Preview the first rows of the combined train+test frame.
full.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 数据的补齐和基本处理

In [77]:
full.info()
# Observation: Age has a modest number of missing values, Cabin is mostly missing,
# and Embarked/Fare are missing only a handful of values.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 97.2+ KB


In [78]:
# Numeric columns (Fare, Age): fill missing values with the column mean,
# computed over the combined train+test frame.
# Embarked is categorical with very few gaps, so it is filled with its most
# frequent value in a later cell.
for numeric_col in ('Fare', 'Age'):
    full[numeric_col] = full[numeric_col].fillna(full[numeric_col].mean())

In [79]:
# Frequency of each embarkation port (missing values are not counted).
full['Embarked'].value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [80]:
# Fill the missing Embarked values with the most frequent port. The mode is
# computed from the data rather than hard-coded, so the cell stays correct if the
# input files change.
embarked_mode = full['Embarked'].mode().iloc[0]
full['Embarked'] = full['Embarked'].fillna(embarked_mode)
full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1309 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1309 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1309 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 97.2+ KB


In [81]:
# Distribution of the raw cabin codes (missing values are not counted).
full['Cabin'].value_counts()

C23 C25 C27        6
B57 B59 B63 B66    5
G6                 5
B96 B98            4
C78                4
                  ..
A10                1
B38                1
A11                1
A16                1
D43                1
Name: Cabin, Length: 186, dtype: int64

In [82]:
# Look at one raw cabin value (row label 1) to see its format.
full.loc[1, 'Cabin']

'C85'

In [83]:
# Sanity check: selecting a single column yields a pandas Series.
type(full['Cabin'])

pandas.core.series.Series

In [84]:
# Idea: keep only the first letter of the cabin code, on the assumption that
# cabins sharing a letter have similar properties. '0' stands for a missing cabin.
full['Cabin'] = full['Cabin'].fillna('0').str[0]
full['Cabin'].value_counts()
# Caveat: some letters (e.g. 'T' and 'G') cover only a handful of rows, so those
# categories may behave rather randomly.

0    1014
C      94
B      65
D      46
E      41
A      22
F      21
G       5
T       1
Name: Cabin, dtype: int64

In [85]:
full.info()
# At this point the missing-value imputation is complete.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1309 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1309 non-null   float64
 10  Cabin        1309 non-null   object 
 11  Embarked     1309 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 97.2+ KB


## 数据分析和特征分析

In [86]:
full['Age'].head()
# Age is already floating-point, so it can be used as-is.

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [87]:
# Port frequencies after the missing values were filled.
full['Embarked'].value_counts()

S    916
C    270
Q    123
Name: Embarked, dtype: int64

In [88]:
# One-hot encode the embarkation port; `prefix` sets the prefix of the generated
# column names (Embarked_<value>).
embarkedDf = pd.get_dummies(full['Embarked'], prefix='Embarked')
# Append the indicator columns and remove the original categorical column.
# Rebinding `full` (instead of dropping in place) avoids mutating the frame
# that earlier cells displayed.
full = pd.concat([full, embarkedDf], axis=1).drop(columns='Embarked')

In [89]:
# Fare is numeric; peek at the first values.
full['Fare'].head()

0     7.2500
1    71.2833
2     7.9250
3    53.1000
4     8.0500
Name: Fare, dtype: float64

In [90]:
full['Name'].head()
# The raw name is not directly usable. The original plan was to drop it, but a
# Zhihu article described a way to extract a feature from it.

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [91]:
# Peek at the Parch column (integer-valued).
full['Parch'].head()

0    0
1    0
2    0
3    0
4    0
Name: Parch, dtype: int64

In [92]:
# Peek at the SibSp column (integer-valued).
full['SibSp'].head()

0    1
1    1
2    0
3    1
4    0
Name: SibSp, dtype: int64

In [93]:
# Peek at the raw ticket strings.
full['Ticket'].head()

0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
Name: Ticket, dtype: object

In [94]:
# PassengerId is just a running 1..N identifier, and Ticket is assumed to carry
# little signal. Only Ticket is dropped here; PassengerId stays in `full`.
full = full.drop(columns='Ticket')
# One-hot encode the passenger class (columns pclass_<value>), then replace the
# original Pclass column with the indicators.
pclassDf = pd.get_dummies(full['Pclass'], prefix='pclass')
full = pd.concat([full, pclassDf], axis=1).drop(columns='Pclass')

In [95]:
# Frequency of each value in the Sex column.
full['Sex'].value_counts()

male      843
female    466
Name: Sex, dtype: int64

In [96]:
# Encode Sex as 1 for "male" and 0 for every other value.
# NOTE: not idempotent. Once the column holds 0/1 nothing equals "male" any more,
# so running this cell a second time sets every row to 0.
full['Sex'] = (full['Sex'] == "male").astype('int64')

通过观察 Name 列可知，姓名中包含头衔或称谓，例如：“Braund, Mr. Owen Harris”、“Heikkinen, Miss. Laina”、“Oliva y Ocana, Dona. Fermina”、“Peter, Master. Michael J”。

我们提取','后面，'.'前面的称谓作为特征

In [97]:
def get_title(name):
    """Extract the title (honorific) from a passenger name.

    Takes the segment after the first comma (up to the next comma, if any),
    keeps the part before its first period, and strips surrounding whitespace.
    Example: "Braund, Mr. Owen Harris" -> "Mr".

    Parameters
    ----------
    name : str
        Name in the form "Surname, Title. Given names".

    Returns
    -------
    str
        The extracted title without leading/trailing whitespace.

    Raises
    ------
    IndexError
        If `name` contains no comma.
    """
    return name.split(',')[1].split('.')[0].strip()
# Build a one-column frame ('Title') holding the title extracted from each name.
titleDf = full['Name'].map(get_title).to_frame('Title')
titleDf['Title'].value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Major             2
Ms                2
Mlle              2
Lady              1
Capt              1
Dona              1
Jonkheer          1
Sir               1
Mme               1
the Countess      1
Don               1
Name: Title, dtype: int64

In [98]:
# There are many distinct raw titles, so collapse them into six groups.
# Keys are the raw titles; values are the group each one is mapped to.
title_mapDict = {
    **dict.fromkeys(["Capt", "Col", "Major", "Dr", "Rev"], "Officer"),
    **dict.fromkeys(
        ["Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"], "Royalty"
    ),
    **dict.fromkeys(["Mme", "Ms", "Mrs"], "Mrs"),
    **dict.fromkeys(["Mlle", "Miss"], "Miss"),
    "Mr": "Mr",
    "Master": "Master",
}
# Titles that are not keys of the dict become NaN after .map().
titleDf['Title'] = titleDf['Title'].map(title_mapDict)
titleDf['Title'].value_counts()

Mr         757
Miss       262
Mrs        200
Master      61
Officer     23
Royalty      6
Name: Title, dtype: int64

In [99]:
# One-hot encode the grouped titles; no prefix is given, so the indicator columns
# are named after the groups themselves.
# NOTE: `titleDf` is rebound from the 'Title' frame to its dummies, so this cell
# cannot be re-run without first re-running the cells that build 'Title'.
titleDf = pd.get_dummies(titleDf['Title'])
# Attach the title indicators and remove the raw Name column.
full = pd.concat([full, titleDf], axis=1).drop(columns='Name')

In [100]:
# One-hot encode the cabin letter (columns Cabin_<letter>; Cabin_0 marks a missing
# cabin), then replace the original Cabin column with the indicators.
cabinDF = pd.get_dummies(full['Cabin'], prefix='Cabin')
full = pd.concat([full, cabinDF], axis=1).drop(columns='Cabin')

In [101]:
# Preview the frame after all categorical columns were encoded.
full.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,...,Royalty,Cabin_0,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T
0,1,0.0,1,22.0,1,0,7.25,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1,2,1.0,0,38.0,1,0,71.2833,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,3,1.0,0,26.0,0,0,7.925,0,0,1,...,0,1,0,0,0,0,0,0,0,0
3,4,1.0,0,35.0,1,0,53.1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,5,0.0,1,35.0,0,0,8.05,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [102]:
# Pairwise correlation between all columns of `full` (pandas' default method).
corrDF = full.corr()
# Rank features by the strength of their correlation with Survived, ignoring sign.
# The column is assigned in one step: element-wise chained assignment
# (`corrDF['Survived'][i] = ...`) writes through an intermediate Series and is
# not guaranteed to update `corrDF` under pandas Copy-on-Write.
corrDF['Survived'] = corrDF['Survived'].abs()
print(corrDF['Survived'].sort_values())

Embarked_Q     0.003650
PassengerId    0.005007
Cabin_G        0.016040
Cabin_A        0.022287
Cabin_T        0.026456
Officer        0.031316
Royalty        0.033391
SibSp          0.035322
Cabin_F        0.057935
Age            0.070323
Parch          0.081629
Master         0.085221
pclass_2       0.093349
Cabin_C        0.114652
Cabin_E        0.145321
Embarked_S     0.149683
Cabin_D        0.150716
Embarked_C     0.168240
Cabin_B        0.175095
Fare           0.257307
pclass_1       0.285904
Cabin_0        0.316912
pclass_3       0.322308
Miss           0.332795
Mrs            0.344935
Sex            0.543351
Mr             0.549199
Survived       1.000000
Name: Survived, dtype: float64


In [103]:
# Preview of the encoded frame, unchanged since the previous preview.
full.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,...,Royalty,Cabin_0,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T
0,1,0.0,1,22.0,1,0,7.25,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1,2,1.0,0,38.0,1,0,71.2833,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,3,1.0,0,26.0,0,0,7.925,0,0,1,...,0,1,0,0,0,0,0,0,0,0
3,4,1.0,0,35.0,1,0,53.1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,5,0.0,1,35.0,0,0,8.05,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [104]:
# Assemble the feature matrix from the selected feature groups, side by side.
feature_blocks = [
    titleDf,       # title indicators derived from Name
    pclassDf,      # passenger-class indicators
    full['Fare'],  # ticket fare
    cabinDF,       # cabin-letter indicators
    embarkedDf,    # embarkation-port indicators
    full['Sex'],   # encoded sex
]
full_X = pd.concat(feature_blocks, axis=1)
full_X.head()

Unnamed: 0,Master,Miss,Mr,Mrs,Officer,Royalty,pclass_1,pclass_2,pclass_3,Fare,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Embarked_C,Embarked_Q,Embarked_S,Sex
0,0,0,1,0,0,0,0,0,1,7.25,...,0,0,0,0,0,0,0,0,1,1
1,0,0,0,1,0,0,1,0,0,71.2833,...,1,0,0,0,0,0,1,0,0,0
2,0,1,0,0,0,0,0,0,1,7.925,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,1,0,0,1,0,0,53.1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,1,8.05,...,0,0,0,0,0,0,0,0,1,1


In [105]:
# `full` was built as concat([train, test]) with a fresh 0..N-1 index, so the
# first len(train) rows are the labelled ones and the rest are to be predicted.
# The row count is taken from `train` rather than hard-coded.
sourceRow = len(train)
# Positional slicing (iloc) is end-exclusive, so no "-1" adjustment is needed.
source_X = full_X.iloc[:sourceRow]
source_y = full['Survived'].iloc[:sourceRow]
pred_X = full_X.iloc[sourceRow:]

In [106]:
# Write the training features, training labels and prediction features to CSV
# files, without the index column.
source_X.to_csv('source_X02.csv', index=False, sep=',')
# `header` takes a bool or a list of column aliases; give the label column its
# name as a one-element list rather than a bare string.
source_y.to_csv('source_y02.csv', index=False, sep=',', header=['Survived'])
pred_X.to_csv('pred_X02.csv', index=False, sep=',')

原始版本使用的特征：cabin / fare / pclass / name / sex / embarked。
version2：对 age 和 fare 做了缩放（scale）；由于 age 没有被选入特征，对它缩放不会产生影响，而结果似乎反而变差了一些。

In [107]:
# Preview the training feature matrix.
source_X.head()

Unnamed: 0,Master,Miss,Mr,Mrs,Officer,Royalty,pclass_1,pclass_2,pclass_3,Fare,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Embarked_C,Embarked_Q,Embarked_S,Sex
0,0,0,1,0,0,0,0,0,1,7.25,...,0,0,0,0,0,0,0,0,1,1
1,0,0,0,1,0,0,1,0,0,71.2833,...,1,0,0,0,0,0,1,0,0,0
2,0,1,0,0,0,0,0,0,1,7.925,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,1,0,0,1,0,0,53.1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,1,8.05,...,0,0,0,0,0,0,0,0,1,1
