In [2]:
import torch
import pandas as pd

In [3]:
# Read the test and training sets from the local "input" directory, then record
# each frame's (rows, columns) shape exactly as loaded.
test = pd.read_csv("input/test.csv")
train = pd.read_csv("input/train.csv")

test_shape, train_shape = test.shape, train.shape

In [16]:
# We add a new column named "Age_categories" that labels each person according to their age range.
# For example, an age between 5 and 12 gets the label "Child", and an age between 60 and 100 gets the label "Senior".

def process_age(df, cut_points, label_names):
    """Return a copy of ``df`` with missing ages filled and an age-band column added.

    Missing values in ``df["Age"]`` are replaced with the sentinel -0.5, so that
    a bin covering that value (e.g. one between -1 and 0) can label them as
    "missing" instead of leaving them unbinned. The filled ages are then binned
    with ``pd.cut`` into a new categorical column, ``"Age_categories"``.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``"Age"`` column.
    cut_points : sequence of numbers
        Bin edges passed to ``pd.cut``. With its default settings each bin
        includes its right edge and excludes its left edge.
    label_names : sequence of str
        One label per bin, i.e. ``len(cut_points) - 1`` labels.

    Returns
    -------
    pandas.DataFrame
        ``df``'s columns with ``"Age"`` filled, plus ``"Age_categories"``.
        Ages that fall outside every bin get NaN as their category.
    """
    filled_age = df["Age"].fillna(-0.5)
    # assign() builds a new frame, so the caller's DataFrame is left untouched
    # (the previous version wrote both columns straight into the argument).
    return df.assign(
        Age=filled_age,
        Age_categories=pd.cut(filled_age, cut_points, labels=label_names),
    )

# Age-band edges and their labels (one label per interval between consecutive
# edges). The first band, between -1 and 0, exists to catch the -0.5 sentinel
# that process_age substitutes for missing ages.
cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
label_names = [
    "Missing",
    "Infant",
    "Child",
    "Teenager",
    "Young Adult",
    "Adult",
    "Senior",
]

# Apply the same binning to both frames so they end up with matching categories.
train, test = (
    process_age(frame, cut_points, label_names) for frame in (train, test)
)

In [20]:
# We add dummy (indicator) columns, which hold 1 (or True) where a row has a given value and 0 (or False) where it does not, for two main reasons:
# 1. For the "Pclass" column, the dummies remove the implied numeric relationship between 1, 2 and 3, which has no meaning.
# 2. For the "Sex" and "Age_categories" columns, most machine learning algorithms can't understand
# text labels, so we have to convert these values into numbers.

def create_dummies(df, column_name):
    """Return ``df`` with one indicator column per distinct value of ``column_name``.

    The indicator columns come from ``pd.get_dummies`` and are named
    ``"<column_name>_<value>"``. They are appended to the right of the existing
    columns; the source column itself is kept. ``df`` is not modified.

    The call is safe to repeat: if ``df`` already holds columns with the same
    names as the new indicators (for instance because the notebook cell was
    run twice), those stale columns are replaced rather than duplicated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``.
    column_name : str
        Column to encode; also used as the prefix of the new column names.

    Returns
    -------
    pandas.DataFrame
        A new frame with the indicator columns appended.
    """
    dummies = pd.get_dummies(df[column_name], prefix=column_name)
    # Without this, a second call would append a second set of identically
    # named columns, and df[[...]] selections would then return duplicates.
    stale = [name for name in dummies.columns if name in df.columns]
    return pd.concat([df.drop(columns=stale), dummies], axis=1)

# Encode each categorical feature on both frames, one feature at a time
# (train first, then test), so the two frames receive the same treatment.
for dummy_source in ("Pclass", "Sex", "Age_categories"):
    train = create_dummies(train, dummy_source)
    test = create_dummies(test, dummy_source)



In [1]:
from sklearn.linear_model import LogisticRegression

# Indicator columns used as model features. They only exist once the
# dummy-encoding cells have run, and `train` must already be loaded: running
# this cell first on a fresh kernel fails with a NameError on `train`.
columns = [
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Sex_female', 'Sex_male',
    'Age_categories_Missing', 'Age_categories_Infant',
    'Age_categories_Child', 'Age_categories_Teenager',
    'Age_categories_Young Adult', 'Age_categories_Adult',
    'Age_categories_Senior',
]

# Fit a logistic regression (default settings) on every training row,
# with "Survived" as the target.
lr = LogisticRegression()
lr.fit(X=train[columns], y=train['Survived'])

NameError: name 'train' is not defined

In [None]:
# train_test_split lets us divide the labelled training data into two parts:
#   1. one part to train the model on (often 80% of the observations);
#   2. one part to make predictions with and test the model
#      (often 20% of the observations).
from sklearn.model_selection import train_test_split

# From now on the unlabelled test frame is referred to as the holdout data.
# Note this is a second name for the same DataFrame object, not a copy.
holdout = test

columns = [
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Sex_female', 'Sex_male',
    'Age_categories_Missing', 'Age_categories_Infant',
    'Age_categories_Child', 'Age_categories_Teenager',
    'Age_categories_Young Adult', 'Age_categories_Adult',
    'Age_categories_Senior',
]

all_X, all_y = train[columns], train['Survived']

# Keep 20% of the rows aside for evaluation; random_state is fixed so the
# same split is produced on every run.
train_X, test_X, train_y, test_y = train_test_split(
    all_X,
    all_y,
    test_size=0.2,
    random_state=0,
)

# IMPORTANT: We do not evaluate the model on the same data it was trained on, because that would hide OVERFITTING:
# the model would score well on rows it has already seen, but could then perform much worse on new, unseen data.