In [1]:
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
next_campaign = False

In [2]:
# FEATURES
#df  <- data frame
#col <- column
#cat <- category


# add column to data frame
def add_col (df, col, cat):
    new_col = col + "_" + str(cat)
    #print("ADD COL: " + new_col)
    if str(cat) != 'nan':
        df[new_col] = df[col].map(lambda a: a == cat,na_action=True).map({True: 1.0,False: 0.0})
    return df


#split column with mamy string categories col[A,B,C...] to many binary columns col_A[0,1] col_B[0,1] ...
def split_col (df, col):
    categories = df[col].unique()
    for cat in categories :
        df = add_col(df, col, cat)
    del df[col]
    return df


#scale column values to <0, 1> interval
def scale_col (df, col):
    
    max_val = float(df[col].max())
    min_val = float(df[col].min())
    print ( col,min_val, max_val  )
    df[col] = df[col].map(lambda a: min_val if pd.isnull(a) else a)
    df[col] = df[col].map(lambda a: abs(float(a) - min_val) / abs(max_val - min_val) )
    print(df[col].unique())
    return df


def add_nan_col (df, col):
    new_col = col + "_nan"
    df[new_col] = df[col].map (lambda a: 1.0 if pd.isnull(a) else 0.0)
    return df


def print_data(df):
    for col in df.columns:
        print (format (col, '20'), format(len(df[col].unique()), '10'), df[col].dtype)
        if len(df[col].unique()) < 20:
            print (df[col].unique())

            
def remove_col (df, columns):
    for col in columns:
        if col in df.columns:
            del df[col]
    return df


def grup_age (age, i):
    if pd.isnull(age):
        return False
    return int(int(age)/10) == i


def add_new_cols (df):
    for i in range(12):
        new_col = "Age_grup_" + str(i)
        df[new_col] = df['Age'].map(lambda age: grup_age (age, i)).map({True: 1.0,False: 0.0})
    return df


def split_cols(df, l):
    for col in l :
        if df[col].isnull().values.any():
            #df = add_nan_col(df, col)
            print("NAN ", col, " : ", df[col].isnull().sum())
        df = split_col(df, col)
    return df


def scale_cols(df, l):
    for col in l :
        if df[col].isnull().values.any():
            #df = add_nan_col(df, col)
            print("NAN ", col, " : ", df[col].isnull().sum())
        df = scale_col(df, col)
    return df
            
    
#preprocessing
def preprocessing (df):
    id = df['PassengerId']
    df = remove_col (df, ["PassengerId","Name", "Ticket", "Cabin"])
    df = df.rename(columns={"Survived": "y"})
    print(df.columns)
    print("NEW COLUMNS")
    df = add_new_cols (df)
    df = split_cols (df,["Pclass", "Sex", "Embarked"])
    df = scale_cols (df,["SibSp", "Parch", "Fare", "Age"])
    return (id, df)



In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
(train_id, train) = preprocessing(train)
(test_id, test) = preprocessing(test)

Index(['y', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')
NEW COLUMNS
NAN  Embarked  :  2
SibSp 0.0 8.0
[0.125 0.    0.375 0.5   0.25  0.625 1.   ]
Parch 0.0 6.0
[0.         0.16666667 0.33333333 0.83333333 0.5        0.66666667
 1.        ]
Fare 0.0 512.3292
[0.01415106 0.13913574 0.01546857 0.1036443  0.01571255 0.0165095
 0.10122886 0.04113566 0.02173075 0.05869429 0.03259623 0.05182215
 0.06104473 0.01533038 0.03122992 0.05684821 0.02537431 0.03513366
 0.01410226 0.05074862 0.01567195 0.06929139 0.06126432 0.51334181
 0.01537917 0.01541158 0.0541074  0.28598956 0.01512699 0.02049464
 0.16038672 0.10149724 0.01411046 0.02194234 0.01849397 0.04098927
 0.08115719 0.03025399 0.04231498 0.03474329 0.07746484 0.01522459
 0.14976542 0.12097534 0.05416439 0.0915427  0.1561496  0.16293235
 0.05445717 0.02975782 0.01592394 0.01690807 0.14346245 0.02821272
 0.11027246 0.01493181 0.05660423 0.02434958 0.01756683 0.01854277
 0.01520019 0.09193308 0.03093714 0.06

In [4]:
print_data(train)

y                             2 int64
[0 1]
Age                          88 float64
SibSp                         7 float64
[0.125 0.    0.375 0.5   0.25  0.625 1.   ]
Parch                         7 float64
[0.         0.16666667 0.33333333 0.83333333 0.5        0.66666667
 1.        ]
Fare                        248 float64
Age_grup_0                    2 float64
[0. 1.]
Age_grup_1                    2 float64
[0. 1.]
Age_grup_2                    2 float64
[1. 0.]
Age_grup_3                    2 float64
[0. 1.]
Age_grup_4                    2 float64
[0. 1.]
Age_grup_5                    2 float64
[0. 1.]
Age_grup_6                    2 float64
[0. 1.]
Age_grup_7                    2 float64
[0. 1.]
Age_grup_8                    2 float64
[0. 1.]
Age_grup_9                    1 float64
[0.]
Age_grup_10                   1 float64
[0.]
Age_grup_11                   1 float64
[0.]
Pclass_3                      2 float64
[1. 0.]
Pclass_1                      2 float64
[0. 1.]
Pclass_2 

In [5]:
float#MODEL
def train_model (df):
    X = (df.loc[:, df.columns != 'y']).to_numpy()
    y = df['y'].to_numpy()

    clf = LogisticRegression(C=0.5, solver='sag', max_iter=10000)
    clf.fit(X, y)

    y_pred = clf.predict(X)

    y_pred_proba_train = clf.predict_proba(X)
    
    print('accuracy: {}'.format(accuracy_score(y, y_pred)))
    return clf
          
          
def get_ans (clf, df):
    X = (df.loc[:, df.columns != 'y']).to_numpy()
    y_pred = clf.predict(X)
    return y_pred


print(test.columns)
print(train.columns)
clf = train_model(train)
y_pred = get_ans (clf, test)
pass_id = test_id.to_numpy()
result = pd.DataFrame({'PassengerId': pass_id, 'Survived': y_pred})

result.to_csv('titanic.csv', index=False)  

Index(['Age', 'SibSp', 'Parch', 'Fare', 'Age_grup_0', 'Age_grup_1',
       'Age_grup_2', 'Age_grup_3', 'Age_grup_4', 'Age_grup_5', 'Age_grup_6',
       'Age_grup_7', 'Age_grup_8', 'Age_grup_9', 'Age_grup_10', 'Age_grup_11',
       'Pclass_3', 'Pclass_2', 'Pclass_1', 'Sex_male', 'Sex_female',
       'Embarked_Q', 'Embarked_S', 'Embarked_C'],
      dtype='object')
Index(['y', 'Age', 'SibSp', 'Parch', 'Fare', 'Age_grup_0', 'Age_grup_1',
       'Age_grup_2', 'Age_grup_3', 'Age_grup_4', 'Age_grup_5', 'Age_grup_6',
       'Age_grup_7', 'Age_grup_8', 'Age_grup_9', 'Age_grup_10', 'Age_grup_11',
       'Pclass_3', 'Pclass_1', 'Pclass_2', 'Sex_male', 'Sex_female',
       'Embarked_S', 'Embarked_C', 'Embarked_Q'],
      dtype='object')
accuracy: 0.8080808080808081
