In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import zipfile
import pickle
from os.path import join as path, dirname
try:
    from IPython.core.display import HTML

    def pprint(df):
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            display(HTML(pd.DataFrame(df).to_html()))
except:
    def pprint(df):
        print(df)

In [7]:
print('Training data:')
df_train = pd.read_csv(path('data', 'train.csv'), index_col='PassengerId')
print(df_train.shape)
pprint(df_train.head())


print('Test data:')
df_test = pd.read_csv(path('data', 'test.csv'), index_col='PassengerId')
print(df_test.shape)
pprint(df_test.head())

Training data:
(891, 11)


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Test data:
(418, 10)


Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [9]:
features_train = ["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]
features_test = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]
X_full = df_train[features_train]
X_test_full = df_test[features_test]

# Remove rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset=['Survived'], inplace=True)
y = X_full.Survived
X_full.drop(['Survived'], axis=1, inplace=True)

# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, 
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()


# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_full.dropna(axis=0, subset=['Survived'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [10]:
def crear_model(xtrain, ytrain, prep, n_estimators=100):
    # Define model
    model = RandomForestRegressor(n_estimators, random_state=0)

    # Bundle preprocessing and modeling code in a pipeline
    clf = Pipeline(steps=[('preprocessor', prep),
                          ('model', model)
                         ])

    # Preprocessing of training data, fit model 
    clf.fit(X_train, y_train)
    
    return clf


def avaluar_model(xvalid, yvalid, c, accept, validacio=True):
    # Preprocessing of validation data, get predictions
    preds = c.predict(xvalid)
    preds = [0 if x<accept else 1 for x in preds]
    if (validacio):
        #Check model
        encerts = 0
        total = 0
        for i in range(len(preds)):
            if preds[i] == yvalid.iloc[i]:
                encerts+=1
            total+=1

        prob = encerts/total
        return (preds, prob)
    
    return preds

In [12]:
best = 0
best_n = 0
best_accept = 0
for i in range(40, 71, 1):
    c = crear_model(X_train, y_train, preprocessor, i)
    for j in range(0, 100, 1):
        preds, prob = avaluar_model(X_valid, y_valid, c, j/100.)
        if(prob>best):
            best = prob
            best_n = i
            best_accept = j
    if (i%10 == 0):
        print ("Iteracio: {}, millor prob: {}, millor n: {}, millor accept: {}".format(i, best, best_n, best_accept))
print(best)
print(best_n)

Iteracio: 40, millor prob: 0.8547486033519553, millor n: 40, millor accept: 43
Iteracio: 50, millor prob: 0.8603351955307262, millor n: 47, millor accept: 45
Iteracio: 60, millor prob: 0.8603351955307262, millor n: 47, millor accept: 45
Iteracio: 70, millor prob: 0.8603351955307262, millor n: 47, millor accept: 45
0.8603351955307262
47


In [13]:
#millor: 0.8715083798882681
#48

In [16]:
#Predict test
c = crear_model(X_train, y_train, preprocessor, best_n)
preds = avaluar_model(X_test_full, None, c, best_accept/100., False)
results = pd.Series(preds)

results.index.name = 'PassengerId'
results.index = X_test_full.index
results.name = 'Survived'
results.to_frame().to_csv('submission.csv')



In [18]:
best_submission = pd.read_csv("best_submission.csv", index_col='PassengerId')
print(type(best_submission["Survived"]))
for item, value in best_submission["Survived"].iteritems():
    if (item != "Survived"):
        if(value != results.loc[item]):
            print("Diferencia")

<class 'pandas.core.series.Series'>
