In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv("data/train.csv")
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test = pd.read_csv("data/test.csv")
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Data Dictionary


| Variable | Definition    | Key |
|------    |------         |     |
|survival  |	Survival   |	0 = No, 1 = Yes|
|pclass	   |Ticket class   |1 = 1st, 2 = 2nd, 3 = 3rd|
|sex       |Sex|           |	
|Age	   |Age in years   ||
|sibsp	   |# of siblings / spouses aboard the Titanic||
|parch	   |# of parents / children aboard the Titanic||
|ticket	   |Ticket number	||
|fare	   |Passenger fare	||
|cabin	   |Cabin number	||
|embarked  |Port of Embarkation	|C = Cherbourg, Q = Queenstown, S = Southampton|

### Variable Notes

pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


### Visualize data

In [9]:
columns = [col for col in dataset.columns if dataset[col].isnull().sum()>0]

sns.barplot(x = missing_data.index, y = percent)
plt.title("Percent missing data by feature")
plt.xlabel("Features")
plt.ylabel("Percent of missing values")
plt.xticks(rotation = '90')

NameError: name 'dataset' is not defined

### Feature Engineering

In [None]:
import string

def substrings_in_string(big_string, title_list):
    for title in title_list:
        if big_string.find(title)!=-1:
            return title
    print(big_string)
    return np.nan
    

In [None]:
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

In [None]:
dataset['Title']=dataset['Name'].map(lambda x: substrings_in_string(x, title_list))

In [None]:
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

In [None]:
dataset['Title']=dataset.apply(replace_titles, axis=1)

In [None]:
dataset.head()

In [None]:
def cabin_transformer(ds):
    ds["Cabin"] = ds["Cabin"].isnull()
    array = []
    for cabin in ds["Cabin"]:
        if cabin == True:
            array.append(0)
        else:
            array.append(1)
            
    return array

In [None]:
pn = cabin_transformer(dataset)

In [None]:
dataset["Cabin"] = pn

In [None]:
dataset.head()

In [None]:
dataset["Age"] = dataset["Age"].fillna(dataset["Age"].mean())

In [None]:
dataset["Age * Pclass"] = dataset["Age"] * dataset["Pclass"]

In [None]:
dataset["Family Size"] = dataset["SibSp"] + dataset["Parch"]

In [None]:
dataset["Family Size"].idxmax()

In [None]:
dataset.head()

In [None]:
dataset["Fare"] = dataset["Fare"].fillna(dataset["Fare"].sort_values(ascending = False).median())

In [None]:
dataset["Embarked"] = dataset["Embarked"].fillna(dataset["Embarked"].mode()[0])

In [None]:
dataset.drop(["Name", "Ticket"], axis = 1, inplace  = True)

In [None]:
dataset.head()

In [None]:
dataset.info()

## Perform One Hot Encoding Vector

In [None]:
columns = dataset.select_dtypes(include = ["object"]).columns

In [None]:
from sklearn.preprocessing import OneHotEncoder

for col in columns:
    encoded = pd.get_dummies(dataset[col])
    dataset.drop([col], axis = 1, inplace = True)
    dataset = pd.concat([dataset, encoded], axis = 1)

In [None]:
dataset.head()

## Train a model

In [None]:
dataset.shape

In [None]:
X_train = dataset[:len(train)]
y_train = y
X_test = dataset[len(train):]

In [None]:
##Does not work well

#from sklearn.linear_model import SGDClassifier

#sgd_clf = SGDClassifier()
#sgd_clf.fit(X_train, y_train)

In [None]:
#Gives accuracy of 0.66

#from sklearn.neighbors import KNeighborsClassifier

#knn_clf = KNeighborsClassifier(n_neighbors = 4, weights = 'distance')
#knn_clf.fit(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(C=2.8000100000000003, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [None]:
log_reg.fit(X_train, y_train)

In [None]:
y_pred = log_reg.predict(X_test)
len(y_pred)

In [None]:
pred = pd.DataFrame(y_pred)
sub_df = pd.read_csv("data/gender_submission.csv")
ds = pd.concat([sub_df["PassengerId"].astype(int), pd.Series(y_pred)], axis = 1)

In [None]:
ds.columns = ["PassengerId", "Survived"]
ds.head()

In [None]:
ds.to_csv("ludjii.csv", index = False)

In [None]:
1dataset.drop(["Cabin"], axis = 1, inplace = True)

In [None]:
dataset.info()

In [None]:
dataset["Age"] = dataset["Age"].fillna(dataset["Age"].mean())

In [None]:
dataset["Fare"] = dataset["Fare"].fillna(dataset["Fare"].mean())

In [None]:
dataset.drop(["Name"], axis = 1, inplace = True)

In [None]:
columns = [column for column in dataset.columns if np.dtype(dataset[column]) == np.dtype('O')]    

In [None]:
for col in columns:
    print(col, ":", len(dataset[col].unique()))

In [None]:
columns

In [None]:
def category_onehot(columns):
    df = dataset
    i = 0
    
    for col in columns:
        
        print(col)
        df1 = pd.get_dummies(dataset[col], drop_first = True)
        dataset.drop([col], axis = 1, inplace = True)
        
        if i == 0:
            df = df1.copy()
        else:
            df = pd.concat([df, df1], axis = 1)
        i +=1
    
    df = pd.concat([dataset, df], axis = 1)
    
    return df

In [None]:
dataset = category_onehot(columns)

In [None]:
dataset = dataset.loc[:,~dataset.columns.duplicated()]

In [None]:
dataset.shape

In [None]:
dataset.head()

In [None]:
X_train = dataset[:train_objs_num]

In [None]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier()
sgd_clf.fit(X_train, y)


In [None]:
y_pred = sgd_clf.predict(dataset[train_objs_num:])

In [None]:
y_pred

In [None]:
len(y_pred)

In [None]:
pred = pd.DataFrame(y_pred)
sub_df = pd.read_csv("data/gender_submission.csv")
ds = pd.concat([sub_df["PassengerId"].astype(int), pd.Series(y_pred)], axis = 1)

In [None]:
ds.columns = ["PassengerId", "Survived"]
ds["PassengerId"].dropna()
ds.to_csv("up.csv", index = False)