# 下载泰坦尼克号数据集

In [1]:
import urllib.request
import os

In [5]:
#url = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
url = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"#设置文件下载网址
filepath = 'titanic3.xls'#设置文件存储路径
if not os.path.isfile(filepath):#如果文件不存在，则执行下载
    result = urllib.request.urlretrieve(url,filepath)
    print('downloaded:', result)

downloaded: ('titanic3.xls', <http.client.HTTPMessage object at 0x7f4324160160>)


# 使用DataFrame读取数据并进行预处理

In [6]:
import numpy as np
import pandas as pd

In [7]:
all_df = pd.read_excel(filepath)

In [8]:
all_df[:2]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [10]:
cols = ['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
all_df = all_df[cols]

In [11]:
all_df[:2]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,1,"Allen, Miss. Elisabeth Walton",1,female,29.0,0,0,211.3375,S
1,1,"Allison, Master. Hudson Trevor",1,male,0.9167,1,2,151.55,S


# 使用Pandas DataFrame进行数据预处理

In [12]:
df = all_df.drop(['name'], axis=1)

In [13]:
all_df.isnull().sum()

survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [14]:
age_mean = df['age'].mean() #计算age字段的平均值
df['age'] = df['age'].fillna(age_mean)#将null转换为平均值

In [16]:
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)

In [17]:
df['sex'] = df['sex'].map({'female' : 0, 'male' : 1}).astype(int)

In [22]:
x_OneHot_df = pd.get_dummies(data=df, columns=['embarked'])

In [20]:
x_OneHot_df[: 2]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,211.3375,0,0,1
1,1,1,1,0.9167,1,2,151.55,0,0,1


# 将DataFrame转换为Array

In [21]:
ndarray = x_OneHot_df.values

In [23]:
ndarray.shape

(1309, 10)

In [24]:
ndarray[:2]

array([[  1.    ,   1.    ,   0.    ,  29.    ,   0.    ,   0.    ,
        211.3375,   0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   1.    ,   0.9167,   1.    ,   2.    ,
        151.55  ,   0.    ,   0.    ,   1.    ]])

In [25]:
label = ndarray[:, 0]
features = ndarray[:, 1:]

In [26]:
label[: 2]

array([1., 1.])

In [28]:
features[: 2]

array([[  1.    ,   0.    ,  29.    ,   0.    ,   0.    , 211.3375,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   0.9167,   1.    ,   2.    , 151.55  ,
          0.    ,   0.    ,   1.    ]])

# 将ndarray特征字段进行标准化

In [29]:
from sklearn import preprocessing

In [30]:
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [31]:
scaledfeatures = minmax_scale.fit_transform(features)

In [32]:
scaledfeatures[: 2]

array([[0.        , 0.        , 0.36116884, 0.        , 0.        ,
        0.41250333, 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.00939458, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ]])

# 将数据分为训练数据和测试数据

In [34]:
msk = np.random.rand(len(all_df)) < 0.8 #按照8:2的比例产生msk
train_df = all_df[msk] #产生80%的训练数据
test_df = all_df[~msk] #产生20%的测试数据

In [36]:
print('total:', len(all_df), 'train:', len(train_df), 'test:', len(test_df))

total: 1309 train: 1066 test: 243


In [39]:
def PreprocessData(raw_df):
    df = raw_df.drop(['name'], axis = 1)
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fear_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fear_mean)
    df['sex'] = df['sex'].map({'female':0, 'male':1}).astype(int)
    x_OneHot_df =pd.get_dummies(data = df, columns=['embarked'])
    
    ndarray = x_OneHot_df.values
    features = ndarray[:, 1:]
    label = ndarray[:, 0]
    
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledfeatures = minmax_scale.fit_transform(features)
    
    return scaledfeatures, label

In [40]:
train_features, train_label = PreprocessData(train_df)
test_features, test_label = PreprocessData(test_df)

In [41]:
train_features[: 2]

array([[0.        , 1.        , 0.00732301, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.02092091, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ]])

In [42]:
train_label[: 2]

array([1., 0.])