In [1]:
# pandas
import pandas as pd
from pandas import Series,DataFrame

In [2]:
# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [3]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [4]:
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")

In [5]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# ticket  ticket number
# fare    客运费
# cabin   客舱号
# embarked 登船口

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [9]:
# 去除一些被认为是与分析和预测无关的列
train_df = train_df.drop(['PassengerId','Name','Ticket'], axis=1)
test_df    = test_df.drop(['Name','Ticket'], axis=1)

In [10]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [11]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,male,34.5,0,0,7.8292,,Q
1,893,3,female,47.0,1,0,7.0,,S
2,894,2,male,62.0,0,0,9.6875,,Q
3,895,3,male,27.0,0,0,8.6625,,S
4,896,3,female,22.0,1,1,12.2875,,S


In [13]:
train_df.isnull().any() #查看那些列中有缺失值的

Survived    False
Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare        False
Cabin        True
Embarked     True
dtype: bool

In [20]:
print(train_df["Embarked"].unique())
print(train_df[train_df["Embarked"] == "S"].shape[0])
print(train_df[train_df["Embarked"] == "Q"].shape[0])
print(train_df[train_df["Embarked"] == "C"].shape[0])
print(train_df[train_df["Embarked"].isnull().values == True].shape[0])

['S' 'C' 'Q' nan]
644
77
168
2


In [22]:
print(644 + 77 + 168 + 2)
train_df.shape[0]

891


891

In [23]:
# 数据清洗
# in cloumn Embarked, fill the two missing values with the most occurred value, which is "S"
train_df["Embarked"] = train_df["Embarked"].fillna("S")

In [24]:
train_df.isnull().any() 

Survived    False
Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare        False
Cabin        True
Embarked    False
dtype: bool

In [25]:
print(train_df["Cabin"].unique())

[nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49' 'F4'
 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77' 'E67'
 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106' 'C65'
 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91' 'E40'
 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34' 'C104'
 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79' 'E25'
 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68' 'A10'
 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58' 'C126'
 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90' 'C45'
 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6' 'B82 B84'
 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A24' 'C50' 'B42' 'C148']


In [26]:
print(train_df[train_df["Cabin"].isnull().values == True].shape[0])

687


In [27]:
train_df.drop("Cabin", axis=1, inplace=True)
test_df.drop("Cabin", axis=1, inplace=True)

In [28]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [29]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [31]:
train_df.isnull().any() 

Survived    False
Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare        False
Embarked    False
dtype: bool

In [32]:
train_df[train_df["Age"].isnull().values == True].shape[0]

177

In [33]:
average_age_train = train_df["Age"].mean()
std_age_train = train_df["Age"].std()
count_nan_age_train = train_df["Age"].isnull().sum()

In [35]:
average_age_test    = test_df["Age"].mean()
std_age_test        = test_df["Age"].std()
count_nan_age_test  = test_df["Age"].isnull().sum()

In [36]:
print(average_age_train)
print(std_age_train)
print(count_nan_age_train)
print("--------------------")
print(average_age_test)
print(std_age_test)
print(count_nan_age_test)

29.6991176471
14.5264973323
177
--------------------
30.2725903614
14.1812092356
86


In [37]:
rand_train = np.random.randint(average_age_train - std_age_train, average_age_train + std_age_train, 
                              size=count_nan_age_train)
rand_test = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, 
                             size=count_nan_age_test)

In [38]:
train_df["Age"][np.isnan(train_df["Age"])] = rand_train
test_df["Age"][np.isnan(test_df["Age"])] = rand_test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [39]:
# convert from float to int
train_df['Age'] = train_df['Age'].astype(int)
test_df['Age']    = test_df['Age'].astype(int)

In [40]:
train_df.isnull().any()

Survived    False
Pclass      False
Sex         False
Age         False
SibSp       False
Parch       False
Fare        False
Embarked    False
dtype: bool

In [41]:
test_df.isnull().any()

PassengerId    False
Pclass         False
Sex            False
Age            False
SibSp          False
Parch          False
Fare            True
Embarked       False
dtype: bool

In [43]:
# only for test_df, since there is a missing "Fare" values
test_df["Fare"].fillna(test_df["Fare"].median(), inplace=True)

# convert from float to int
train_df['Fare'] = train_df['Fare'].astype(int)
test_df['Fare']    = test_df['Fare'].astype(int)

In [44]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null int64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null int64
Embarked    891 non-null object
dtypes: int64(6), object(2)
memory usage: 55.8+ KB


In [45]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null object
Age            418 non-null int64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null int64
Embarked       418 non-null object
dtypes: int64(6), object(2)
memory usage: 26.2+ KB


In [46]:
test_df.isnull().any()

PassengerId    False
Pclass         False
Sex            False
Age            False
SibSp          False
Parch          False
Fare           False
Embarked       False
dtype: bool

In [47]:
X_train = train_df.drop("Survived",axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop("PassengerId",axis=1).copy()

In [51]:
def trans_sex(sex):
    if sex == "male":
        return 0
    else:
        return 1
train_df["Gender"] = train_df["Sex"].apply(trans_sex)
test_df["Gender"] = test_df["Sex"].apply(trans_sex)

In [58]:
def trans_embark(embarked):
    if embarked == "S":
        return 0
    elif embarked == "Q":
        return 1
    else:
        return 2
train_df["Embarked_code"] = train_df["Embarked"].apply(trans_embark)
test_df["Embarked_code"] = test_df["Embarked"].apply(trans_embark)

In [59]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Gender,Embarked_code
0,0,3,male,22,1,0,7,S,0,0
1,1,1,female,38,1,0,71,C,1,2
2,1,3,female,26,0,0,7,S,1,0
3,1,1,female,35,1,0,53,S,1,0
4,0,3,male,35,0,0,8,S,0,0


In [60]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Gender,Embarked_code
0,892,3,male,34,0,0,7,Q,0,1
1,893,3,female,47,1,0,7,S,1,0
2,894,2,male,62,0,0,9,Q,0,1
3,895,3,male,27,0,0,8,S,0,0
4,896,3,female,22,1,1,12,S,1,0


In [62]:
train_df.drop("Sex",axis=1, inplace=True)
train_df.drop("Embarked", axis=1, inplace=True)

In [63]:
test_df.drop("Sex",axis=1, inplace=True)
test_df.drop("Embarked", axis=1, inplace=True)

In [65]:
train_df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Gender,Embarked_code
0,0,3,22,1,0,7,0,0
1,1,1,38,1,0,71,1,2
2,1,3,26,0,0,7,1,0
3,1,1,35,1,0,53,1,0
4,0,3,35,0,0,8,0,0


In [66]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Gender,Embarked_code
0,892,3,34,0,0,7,0,1
1,893,3,47,1,0,7,1,0
2,894,2,62,0,0,9,0,1
3,895,3,27,0,0,8,0,0
4,896,3,22,1,1,12,1,0


In [67]:
X_train = train_df.drop("Survived",axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop("PassengerId",axis=1).copy()

In [68]:
# Logistic Regression

logreg = LogisticRegression()

logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

In [70]:
Y_pred

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [71]:
len(Y_pred)

418

In [72]:
test_df.shape[0]

418

In [75]:
test_df["Survived"] = Series(Y_pred)

In [76]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Gender,Embarked_code,Survived
0,892,3,34,0,0,7,0,1,0
1,893,3,47,1,0,7,1,0,0
2,894,2,62,0,0,9,0,1,0
3,895,3,27,0,0,8,0,0,0
4,896,3,22,1,1,12,1,0,1


In [77]:
result_df = test_df.drop(["Pclass", "Age", "SibSp", "Parch", "Fare", "Gender", "Embarked_code"], axis=1)

In [78]:
result_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [84]:
result_df.to_csv("./result.csv", index=False)