In [2]:
import numpy as np
import pandas as pd
import csv as csv

In [3]:
def preprocess(filename):
    data_df = pd.read_csv(filename, header=0)
    data_df['Gender'] = data_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    # All missing Embarked -> just make them embark from most common place
    if len(data_df.Embarked[ data_df.Embarked.isnull() ]) > 0:
        data_df.Embarked[ data_df.Embarked.isnull() ] = data_df.Embarked.dropna().mode().values

    Ports = list(enumerate(np.unique(data_df['Embarked'])))    # determine all values of Embarked,
    Ports_dict = { name : i for i, name in Ports }              # set up a dictionary in the form  Ports : index
    data_df.Embarked = data_df.Embarked.map( lambda x: Ports_dict[x]).astype(int)     # Convert all Embark strings to int
    dummy_embarked = pd.get_dummies(data_df['Embarked'],prefix='Embarked',drop_first=False)
    data_df = pd.concat([data_df,dummy_embarked], axis=1)
    data_df = data_df.drop(['Embarked'],axis=1)
    # All the ages with no data -> make the median of all Ages
    median_age = data_df['Age'].dropna().median()
    if len(data_df.Age[ data_df.Age.isnull() ]) > 0:
        data_df.loc[ (data_df.Age.isnull()), 'Age'] = median_age
    data_df['Age'] *= 0.1

    # Remove the Name column, Cabin, Ticket, and Sex (since I copied and filled it to Gender)
    data_df = data_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1)
    return data_df

In [4]:
train_df = preprocess('train.csv')
test_df = preprocess('test.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
train_df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Gender,Embarked_0,Embarked_1,Embarked_2
0,0,3,2.2,1,0,-0.048707,1,0,0,1
1,1,1,3.8,1,0,0.076277,0,1,0,0
2,1,3,2.6,0,0,-0.047390,0,0,0,1
3,1,1,3.5,1,0,0.040786,0,0,0,1
4,0,3,3.5,0,0,-0.047146,1,0,0,1
5,0,3,2.8,0,0,-0.046349,1,0,1,0
6,0,1,5.4,0,0,0.038370,1,0,0,1
7,0,3,0.2,3,1,-0.021723,1,0,0,1
8,1,3,2.7,0,2,-0.041128,0,0,0,1
9,1,2,1.4,1,0,-0.004164,0,1,0,0


In [6]:
Fmax = train_df['Fare'].max()
Fmin = train_df['Fare'].min()
Fmean = train_df['Fare'].mean()
Fmax-Fmin

512.32920000000001

In [7]:
train_df['Fare'] = (train_df['Fare']-Fmean)/(Fmax-Fmin)
train_df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Gender,Embarked_0,Embarked_1,Embarked_2
0,0,3,2.2,1,0,-0.048707,1,0,0,1
1,1,1,3.8,1,0,0.076277,0,1,0,0
2,1,3,2.6,0,0,-0.04739,0,0,0,1
3,1,1,3.5,1,0,0.040786,0,0,0,1
4,0,3,3.5,0,0,-0.047146,1,0,0,1


下面来看一下测试集里的情况

In [1]:
test_df


NameError: name 'test_df' is not defined

In [11]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
Pclass        418 non-null int64
Age           418 non-null float64
SibSp         418 non-null int64
Parch         418 non-null int64
Fare          417 non-null float64
Gender        418 non-null int64
Embarked_0    418 non-null uint8
Embarked_1    418 non-null uint8
Embarked_2    418 non-null uint8
dtypes: float64(2), int64(4), uint8(3)
memory usage: 20.9 KB


In [12]:
test_df

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Gender,Embarked_0,Embarked_1,Embarked_2
0,3,3.45,0,0,7.8292,1,0,1,0
1,3,4.70,1,0,7.0000,0,0,0,1
2,2,6.20,0,0,9.6875,1,0,1,0
3,3,2.70,0,0,8.6625,1,0,0,1
4,3,2.20,1,1,12.2875,0,0,0,1
5,3,1.40,0,0,9.2250,1,0,0,1
6,3,3.00,0,0,7.6292,0,0,1,0
7,2,2.60,1,1,29.0000,1,0,0,1
8,3,1.80,0,0,7.2292,0,1,0,0
9,3,2.10,2,0,24.1500,1,0,0,1
