In [1]:
import pandas as pd

# Read the Kaggle hold-out (test) and training CSV files into DataFrames.
test = pd.read_csv("KaggleData/test.csv")
train = pd.read_csv("KaggleData/train.csv")

# Report the (rows, columns) shape of each frame, hold-out set first.
test_shape, train_shape = test.shape, train.shape
print(test_shape)
print(train_shape)

(418, 11)
(891, 12)


In [2]:
# 4. Preparing our Data for Machine Learning
# Count how many training rows carry each distinct value of the Pclass column.
train["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [3]:
# Preview the indicator (one-hot) columns that pd.get_dummies derives from
# Pclass; each new column is named "<column_name>_<value>".
df, column_name = train, "Pclass"
dummies = pd.get_dummies(df[column_name], prefix=column_name)
dummies.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


In [4]:
def create_dummies(df,column_name):
    """Return ``df`` with indicator (one-hot) columns for ``column_name`` appended.

    The indicator columns are produced by ``pd.get_dummies`` and are named
    ``"<column_name>_<value>"``. The source column itself is kept.

    The function is idempotent: if ``df`` already contains columns with the
    same names as the freshly generated indicators (for example because the
    calling notebook cell was executed twice), those stale columns are
    replaced rather than duplicated.

    Args:
        df: Input DataFrame; it is not modified.
        column_name: Name of the column in ``df`` to encode.

    Returns:
        A new DataFrame holding the columns of ``df`` followed by the
        indicator columns.
    """
    dummies = pd.get_dummies(df[column_name], prefix=column_name)
    # Remove indicators left over from an earlier call so that re-running a
    # cell cannot create duplicate column labels.
    base = df.drop(columns=list(dummies.columns), errors="ignore")
    return pd.concat([base, dummies], axis=1)

# Print the first training rows before and after the Pclass indicator columns
# are added; the hold-out frame receives the same encoding.
print("Before:\n", train.head())
train, test = create_dummies(train, "Pclass"), create_dummies(test, "Pclass")
print("After:\n", train.head())

Before:
    PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN     

In [5]:
import matplotlib.pyplot as plt

def process_age(df,cut_points,label_names):
    """Fill missing ages and bin ``Age`` into labelled categories.

    Unlike a plain column assignment, this does not modify the caller's
    DataFrame; a new frame is returned instead.

    Args:
        df: DataFrame containing an ``Age`` column. Left untouched.
        cut_points: Bin edges handed to ``pd.cut``.
        label_names: Labels for the bins, handed to ``pd.cut`` as ``labels``.

    Returns:
        A new DataFrame in which missing ``Age`` values are replaced by -0.5
        and an ``Age_categories`` column holds the label of the bin each age
        falls into.
    """
    # -0.5 stands in for "age unknown" so that pd.cut can bin those rows too
    # instead of leaving them as NaN.
    ages = df["Age"].fillna(-0.5)
    age_bins = pd.cut(ages, cut_points, labels=label_names)
    # assign() builds a new frame: "Age" keeps its position and
    # "Age_categories" is appended (or overwritten if it already exists).
    return df.assign(Age=ages, Age_categories=age_bins)

# Bin edges and their labels for pd.cut. The first bin, (-1, 0], exists to
# catch the -0.5 value that process_age substitutes for missing ages.
cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
label_names = [
    "Missing",
    "Infant",
    "Child",
    "Teenager",
    "Young Adult",
    "Adult",
    "Senior",
]

# Add the Age_categories column to both frames.
train, test = (
    process_age(train, cut_points, label_names),
    process_age(test, cut_points, label_names),
)

# Bar chart of the pivot_table aggregate (mean by default) of Survived
# for each age category.
age_cat_pivot = train.pivot_table(values="Survived", index="Age_categories")
age_cat_pivot.plot.bar()
plt.show()

# Append indicator columns for Sex, then for Age_categories, to both frames.
train, test = create_dummies(train, "Sex"), create_dummies(test, "Sex")
train, test = (
    create_dummies(train, "Age_categories"),
    create_dummies(test, "Age_categories"),
)

<Figure size 640x480 with 1 Axes>

In [6]:
# 5. Creating our first machine learning model
from sklearn.linear_model import LogisticRegression

In [7]:
# Baseline model: fit a logistic regression on three indicator columns only.
columns = ["Pclass_2", "Pclass_3", "Sex_male"]
lr = LogisticRegression()
lr.fit(X=train[columns], y=train["Survived"])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
# Refit the same estimator on every Pclass, Sex and Age_categories indicator.
columns = [
    "Pclass_1", "Pclass_2", "Pclass_3",
    "Sex_female", "Sex_male",
    "Age_categories_Missing",
    "Age_categories_Infant",
    "Age_categories_Child",
    "Age_categories_Teenager",
    "Age_categories_Young Adult",
    "Age_categories_Adult",
    "Age_categories_Senior",
]
lr.fit(X=train[columns], y=train["Survived"])
# Display the per-row decision scores the fitted model gives the training data.
lr.decision_function(X=train[columns])



array([-2.17450607,  1.99980507,  0.42457562,  2.58110369, -2.17450607,
       -2.38724499, -0.59927661, -0.70326203,  0.42457562,  1.36980989,
        1.89581965,  1.99980507, -2.17450607, -2.75580468,  0.28987178,
        0.92321511, -0.70326203, -1.30730689,  0.42457562,  0.21183669,
       -1.09456796, -1.09456796,  0.28987178, -0.017978  ,  0.35500575,
       -0.156723  , -2.38724499, -0.017978  ,  0.21183669, -2.38724499,
       -0.59927661,  2.36836476,  0.21183669, -2.04167134, -0.017978  ,
       -0.59927661, -2.38724499, -2.17450607,  0.28987178,  0.28987178,
       -0.156723  ,  1.50451373, -2.38724499,  2.97575776,  0.42457562,
       -2.38724499, -2.38724499,  0.21183669, -2.38724499,  0.28987178,
       -2.24407593, -2.17450607,  1.99980507,  1.50451373, -0.96508138,
       -0.23071693,  1.50451373, -2.17450607,  2.97575776, -2.24407593,
       -2.17450607,  1.99980507, -0.59927661, -0.70326203, -0.23071693,
       -2.38724499,  1.50451373, -2.17450607,  0.28987178, -2.17