In [1]:
import pandas as pd
import numpy as np
import sys

In [3]:
# Load the training and test splits from CSV files under the relative `source/` directory.
train = pd.read_csv('source/train.csv')
test = pd.read_csv('source/test.csv')

#### data 구조 파악

In [5]:
# Preview the first rows of the training set (includes the Survived label).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Preview the first rows of the test set (same columns as train, minus Survived).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Preprocessing

In [8]:
# Count missing values per column in the training set.
print(train.isnull().sum()) # Features with null values: Age, Cabin, Embarked

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [9]:
# Count missing values per column in the test set.
print(test.isnull().sum()) # Features with null values: Age, Fare, Cabin

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


#### Parch, SibSp feature

하나의 변수 Family로 압축

In [11]:
# Collapse SibSp and Parch into a single integer Family column
# (SibSp + Parch + 1), then discard the two source columns.
train = (
    train
    .assign(Family=(train['SibSp'] + train['Parch'] + 1).astype(int))
    .drop(columns=['SibSp', 'Parch'])
)

test = (
    test
    .assign(Family=(test['SibSp'] + test['Parch'] + 1).astype(int))
    .drop(columns=['SibSp', 'Parch'])
)

#### Ticket feature

In [12]:
# The ticket number is treated as having no influence on the outcome, so drop it.
train = train.drop(columns=['Ticket'])
test = test.drop(columns=['Ticket'])

#### Name feature

이름은 영향X but 이름 중간의 title은 possible! --> extract

In [14]:
# Distinct raw titles found in the training names: the text between the first
# comma and the following period (e.g. "Braund, Mr. Owen Harris" -> "Mr").
title = list({
    full_name.split(',')[1].split('.')[0].strip()
    for full_name in train['Name']
})

In [15]:
# Replace the free-text Name column with a Title column holding the honorific
# found between the first comma and the following period of each name.
train = (
    train
    .assign(Title=train['Name'].map(
        lambda full_name: full_name.split(',')[1].split('.')[0].strip()))
    .drop(columns=['Name'])
)
test = (
    test
    .assign(Title=test['Name'].map(
        lambda full_name: full_name.split(',')[1].split('.')[0].strip()))
    .drop(columns=['Name'])
)

In [16]:
## Map each raw title onto one of six coarse categories.
## Written as category -> raw titles and inverted into the raw -> category
## lookup that Series.map needs.
title_mapping = {
    raw_title: category
    for category, raw_titles in {
        "Noble":   ["Don", "Dona", "Jonkheer", "Lady", "Sir", "the Countess"],
        "Mr":      ["Mr"],
        "Miss":    ["Ms", "Mlle", "Miss"],
        "Mrs":     ["Mrs", "Mme"],
        "Officer": ["Dr", "Rev", "Major", "Capt", "Col"],
        "Master":  ["Master"],
    }.items()
    for raw_title in raw_titles
}
# Titles that are not keys of the mapping become NaN.
train['Title'] = train['Title'].map(title_mapping)
test['Title'] = test['Title'].map(title_mapping)

In [17]:
# Fill missing ages with the median age of passengers that share the same
# (Sex, Pclass, Title) combination, computed separately for train and test.
#
# `transform('median')` returns a Series aligned with the original row index.
# The previous `groupby(...).Age.apply(lambda x: x.fillna(...))` gets the group
# keys prepended to its index on pandas >= 2.0, so assigning it back to the
# frame no longer lines up with the rows.
#
# Fallback: if a group has no observed age at all (its median is NaN), use the
# overall training median, captured before any imputation.
overall_age_median = train['Age'].median()

group_train = train.groupby(['Sex', 'Pclass', 'Title'])
train['Age'] = (
    train['Age']
    .fillna(group_train['Age'].transform('median'))
    .fillna(overall_age_median)
)

group_test = test.groupby(['Sex', 'Pclass', 'Title'])
test['Age'] = (
    test['Age']
    .fillna(group_test['Age'].transform('median'))
    .fillna(overall_age_median)
)

In [18]:
## Encode the six title categories as integer codes 0..5.
## The position in this list is the code assigned to the category.
title_mapping2 = {
    category: code
    for code, category in enumerate(
        ["Mr", "Miss", "Mrs", "Noble", "Officer", "Master"]
    )
}
train['Title'] = train['Title'].map(title_mapping2)
test['Title'] = test['Title'].map(title_mapping2)

#### Embarked feature

S가 가장 많으므로 결측값은 'S'로 채움: fillna('S')

In [20]:
# Missing embarkation ports are filled with 'S' in both splits.
train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')

In [21]:
## Encode the embarkation port letter as an integer code.
embarked_mapping = dict(S=0, C=1, Q=2)
train['Embarked'] = train['Embarked'].map(embarked_mapping)
test['Embarked'] = test['Embarked'].map(embarked_mapping)

#### Cabin feature

In [22]:
# Keep only the first character of each Cabin value (NaN stays NaN).
train['Cabin'] = train['Cabin'].str.slice(stop=1)
test['Cabin'] = test['Cabin'].str.slice(stop=1)

# Per-class counts of that first character in the training set (exploratory).
Pclass1 = train.loc[train['Pclass'] == 1, 'Cabin'].value_counts()
Pclass2 = train.loc[train['Pclass'] == 2, 'Cabin'].value_counts()
Pclass3 = train.loc[train['Pclass'] == 3, 'Cabin'].value_counts()

In [23]:
## Map the cabin letter to an evenly spaced numeric scale (step 0.4).
cabin_mapping = {
    "A": 0,
    "B": 0.4,
    "C": 0.8,
    "D": 1.2,
    "E": 1.6,
    "F": 2,
    "G": 2.4,
    "T": 2.8
}
train['Cabin'] = train['Cabin'].map(cabin_mapping)
test['Cabin'] = test['Cabin'].map(cabin_mapping)

# Fill missing cabin values with the median cabin value of the passenger's
# Pclass, computed within each split.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on
# `df["Cabin"]`: that is a chained in-place update on a temporary Series, which
# warns on pandas 2.x and leaves the frame untouched under Copy-on-Write.
train['Cabin'] = train['Cabin'].fillna(
    train.groupby('Pclass')['Cabin'].transform('median')
)
test['Cabin'] = test['Cabin'].fillna(
    test.groupby('Pclass')['Cabin'].transform('median')
)


#### Fare feature

In [24]:
# Discretize Fare into four ordinal bins:
#   (-inf, 17] -> 0, (17, 30] -> 1, (30, 100] -> 2, (100, inf) -> 3
# The bin code is stored as float, matching what the previous in-place `.loc`
# assignments left in the column.
fare_bin_edges = [-np.inf, 17, 30, 100, np.inf]

# Take the fill value from the raw training fares BEFORE binning.
# The test split has one missing Fare (see the null counts); it was previously
# never imputed and reached the model as NaN.
fare_fill_value = train['Fare'].median()

train['Fare'] = pd.cut(
    train['Fare'].astype(float), bins=fare_bin_edges, labels=False
).astype(float)

test['Fare'] = pd.cut(
    test['Fare'].astype(float).fillna(fare_fill_value),
    bins=fare_bin_edges,
    labels=False,
).astype(float)

# NOTE: like the original, this cell is not idempotent. Re-running it on
# already-binned values maps every row to bin 0.

#### PassengerId feature

영향을 미치지 않는 feature --> just drop it!

In [25]:
# Keep the test PassengerId values for the submission file, then remove the
# identifier column from both splits so it is not used as a feature.
PassengerId = test['PassengerId']

train = train.drop(columns=['PassengerId'])
test = test.drop(columns=['PassengerId'])

#### Sex feature

In [26]:
## Encode Sex as a binary integer: male -> 0, female -> 1.
sex_mapping = dict(male=0, female=1)
train['Sex'] = train['Sex'].map(sex_mapping)
test['Sex'] = test['Sex'].map(sex_mapping)

#### Age feature

In [27]:
# Discretize Age into five ordinal bins:
#   (-inf, 16] -> 0, (16, 26] -> 1, (26, 36] -> 2, (36, 50] -> 3, (50, inf) -> 4
# The bare `train['Age'].astype(float)` calls were no-ops because the result was
# discarded; the cast is now applied. `pd.cut` replaces the chains of in-place
# `.loc` assignments with the same right-closed bin edges.
age_bin_edges = [-np.inf, 16, 26, 36, 50, np.inf]

train['Age'] = pd.cut(
    train['Age'].astype(float), bins=age_bin_edges, labels=False
).astype(float)

test['Age'] = pd.cut(
    test['Age'].astype(float), bins=age_bin_edges, labels=False
).astype(float)

# NOTE: like the original, re-running this cell on already-binned values maps
# every row to bin 0.


# Modeling ( Logistic Regression Classifier )

In [29]:
## Building blocks used to fit the weight vector w.
## Sigmoid (logistic) function.
def sigmoid(X, w):
    """Return the logistic function applied to the affine score ``X @ w[1:] + w[0]``.

    Parameters
    ----------
    X : array-like
        Feature values; its last axis must match ``len(w) - 1``.
    w : sequence of float
        ``[w0, w1, w2, ...]`` where ``w0`` is the intercept and the remaining
        entries are the per-feature coefficients.

    Returns
    -------
    numpy.ndarray or numpy.float64
        ``1 / (1 + exp(-z))`` evaluated for each score ``z``.
    """
    intercept, coefficients = w[0], w[1:]
    score = np.dot(X, coefficients) + intercept
    return 1 / (1.0 + np.exp(-score))

In [30]:
def cost_function(y, hx, eps=np.finfo(float).eps):
    """Mean binary cross-entropy between labels ``y`` and probabilities ``hx``.

    Computes ``-(1/m) * sum(y*log(hx) + (1-y)*log(1-hx))`` with ``m = len(y)``.

    Parameters
    ----------
    y : array-like of {0, 1}
        True labels.
    hx : array-like of float
        Predicted probabilities, same length as ``y``.
    eps : float, optional
        Probabilities are clipped to ``[eps, 1 - eps]`` before taking logs, so
        a saturated prediction (exactly 0 or 1) gives a large finite cost
        instead of ``inf``/``nan``.

    Returns
    -------
    numpy.float64
        The average cost.

    Raises
    ------
    ZeroDivisionError
        If ``y`` is empty.
    """
    y = np.asarray(y, dtype=float)
    # Clipping leaves ordinary probabilities untouched; it only guards log(0).
    hx = np.clip(np.asarray(hx, dtype=float), eps, 1.0 - eps)
    m = len(y)
    # np.sum is vectorized; the builtin sum() iterated element by element in
    # Python on every call of the training loop.
    J = (-1 / m) * np.sum(y * np.log(hx) + (1 - y) * np.log(1 - hx))
    return J

In [31]:
## Gradient descent routine.
def gradient(X, y, w, alpha, iteration):
    """Run batch gradient descent for logistic regression, updating ``w`` in place.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Labels, one per row of ``X``.
    w : mutable sequence of float
        ``[w0, w1, ...]`` (intercept first). It is modified in place, so the
        caller's object holds the fitted weights afterwards.
    alpha : float
        Learning rate. The step uses the summed (not averaged) gradient.
    iteration : int
        Number of update steps.

    Returns
    -------
    list
        Cost per step, each measured with the weights as they were before
        that step's update.
    """
    history = []
    for _ in range(iteration):
        # Predictions and residuals for the current weights.
        predicted = sigmoid(X, w)
        residual = predicted - y

        # Record the cost of the current (pre-update) weights.
        history.append(cost_function(y, predicted))

        # Partial derivatives w.r.t. the feature weights; the intercept's
        # partial derivative is the plain sum of residuals.
        slope = X.T.dot(residual)

        w[0] = w[0] - alpha * residual.sum()
        w[1:] = w[1:] - alpha * slope

    return history

In [33]:
# Feature matrix = columns 1..8 of the preprocessed training frame;
# label vector = column 0.
# NOTE(review): positional selection assumes the label is the first column and
# that exactly eight feature columns follow it. Confirm after any change to the
# preprocessing cells.
X = train.iloc[0:, [1,2,3,4,5,6,7,8]].values
y = train.iloc[0:, 0].values

m, n = X.shape

## Draw the initial weights (intercept + one per feature) uniformly from [-1, 1).
## The result depends strongly on the initial w, so a dedicated seeded generator
## makes "Restart & Run All" reproducible without touching numpy's global RNG.
## The stray `i+=1` inside the old `for` loop had no effect and is gone.
w_init_seed = 42
w = np.random.RandomState(w_init_seed).uniform(-1, 1, size=n + 1).tolist()

alpha = 0.000001 ## a smaller alpha gives higher accuracy but a longer run time (pick a moderate value)
iteration = 100000

# Prediction

In [34]:
def LinearRegressionPredict(X):
    """Classify each row of ``X`` as 1 or 0 using the module-level weights ``w``.

    Despite the name, this thresholds the logistic (sigmoid) output: a row is
    labelled 1 when its predicted probability is at least 0.5, otherwise 0.
    """
    probability = sigmoid(X, w)
    return np.where(probability >= 0.5, 1, 0)

# Fit the weights: `gradient` updates `w` in place and returns the per-step cost.
cost = gradient(X, y, w, alpha, iteration)


# Score the first eight columns of the preprocessed test frame and write the
# predictions next to the saved passenger ids.
test_features = test.iloc[0:, list(range(8))].values
prediction_result = LinearRegressionPredict(X=test_features)
submission = pd.DataFrame({"PassengerId": PassengerId, "Survived": prediction_result})
submission.to_csv('titanic_submission.csv', index=False)

# NOTE(review): `sys.exit` is only referenced here, not called, so nothing exits;
# as the last expression of the cell it just displays the function's repr.
sys.exit

  


<function sys.exit>