In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier  

# ----- import data -----
train = pd.read_csv('../input/titanic/train.csv') # 891 entries
test = pd.read_csv('../input/titanic/test.csv') # 418 entries

# merge training and testing data (for now)
dataframes = [train, test]
df = pd.concat(dataframes)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [2]:
# Function to calculate missing values by column
def missing_values_table(df):
   
    # Total missing values
    mis_val = df.isnull().sum()
    
    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    
    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    
    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    
    # Sort the table by percentage of missing descending
    # .iloc[:, 1]!= 0: filter on missing missing values not equal to zero
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(2)  # round(2), keep 2 digits
    
    # Print some summary information
    print("Your selected dataframe has {} columns.".format(df.shape[1]) + '\n' + 
    "There are {} columns that have missing values.".format(mis_val_table_ren_columns.shape[0]))
    
    # Return the dataframe with missing information
    return mis_val_table_ren_columns

In [3]:
#missing values table
missing_values_table(df)

Your selected dataframe has 12 columns.
There are 5 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
Cabin,1014,77.46
Survived,418,31.93
Age,263,20.09
Embarked,2,0.15
Fare,1,0.08


In [4]:
# ----- deal with missing & categorical values -----

# drop Cabin since it has many missing values
# drop Name and Ticket since they're categorical and we won't use them (for now)
df = df.drop(columns=['Cabin', 'Name', 'Ticket'])

# convert to numerical values (0,1)
df['Sex'] = (df['Sex'] == 'male').astype(int)

# convert to numerical values (0,1,2)
df.loc[df['Embarked'] == 'C', 'Embarked'] = 0
df.loc[df['Embarked'] == 'Q', 'Embarked'] = 1
df.loc[df['Embarked'] == 'S', 'Embarked'] = 2

# round float numbers to two decimals
df.round(2)

# ----- impute missing values -----
num_cols = ['Age', 'Fare']
cat_cols = ['Embarked']

# use mean value for the numerical columns
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer=imputer.fit(df[num_cols])
df[num_cols] = imputer.transform(df[num_cols]) 

# most frequent value for the categorical columns
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer=imputer.fit(df[cat_cols])
df[cat_cols] = imputer.transform(df[cat_cols]) 

In [5]:
#missing values table
missing_values_table(df)

Your selected dataframe has 9 columns.
There are 1 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
Survived,418,31.93


In [6]:
# unmerge
train_df = df.iloc[:891,]
test_df = df.iloc[891:,]

In [7]:
# ----- training model -----
# separate features and target
y = train_df['Survived']
X = train_df.drop('Survived', axis=1)

In [8]:
# perform smote
sm = SMOTE(random_state = 2)
X_smoted, y_smoted = sm.fit_sample(X, y)

In [9]:
GBI = GradientBoostingClassifier(learning_rate=0.05, max_depth=3,
                                 max_features=0.5,
                                 random_state=42)

GBI.fit(X_smoted, y_smoted)

GradientBoostingClassifier(learning_rate=0.05, max_features=0.5,
                           random_state=42)

In [10]:
# ----- testing model -------
X_test = pd.get_dummies(test_df).drop('Survived', axis=1)

In [11]:
# aaaaaand predict
predictions = GBI.predict(X_test)

In [None]:
# output file 
result = pd.DataFrame({'PassengerId': test.PassengerId,
                       'Survived': predictions.astype(int)})
result.to_csv('titanic_attempt_1.csv', index=False)