In [35]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn import metrics

In [36]:
# Load both CSV inputs from the working directory in one pass:
# 'train.csv' becomes X_full and 'test.csv' becomes X_test_full.
X_full, X_test_full = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [37]:
# Split the target off the feature table: pop() returns the 'Survived'
# column and removes it from X_full in the same step.
y = X_full.pop('Survived')

# Remove the Name, Ticket and Cabin columns from the feature table (in place,
# so X_full keeps referring to the same frame object).
X_full.drop(columns=["Name", "Ticket", "Cabin"], inplace=True)

In [38]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# partition repeatable between runs.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    train_size=0.80,
    test_size=0.20,
    random_state=3,
)

In [39]:
# Every column that pandas loaded with object dtype is treated as categorical.
categorical_features = [fname for fname in X_full.columns
                        if X_full[fname].dtype == 'object']

# No longer used for the train/validation encoding below, but still created
# here because a later cell refers to `encoder`.
encoder = LabelEncoder()

# Take explicit copies of the split frames so the column assignments below
# write to independent DataFrames and do not raise SettingWithCopyWarning.
X_train_full = X_train_full.copy()
X_valid_full = X_valid_full.copy()

# Cast to str so that missing entries become the literal label 'nan' and all
# labels in a column are mutually sortable.
X_train_full[categorical_features] = X_train_full[categorical_features].astype(str)
X_valid_full[categorical_features] = X_valid_full[categorical_features].astype(str)

# Learn the label -> integer mapping from the TRAINING rows only. The position
# of a label in the sorted list of unique training labels is its code, which is
# the same numbering LabelEncoder produces when fitted on the training column.
# Must be computed before X_train_full is replaced by its encoded version.
train_categories = {fname: sorted(X_train_full[fname].unique())
                    for fname in categorical_features}


def _encode_with_train_categories(frame):
    """Integer-encode the categorical columns of `frame` using the training mapping.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain every column named in `categorical_features`, holding str labels.

    Returns
    -------
    pandas.DataFrame
        One int64 column per categorical feature, indexed like `frame`.
        A label that never occurred in the training rows is encoded as -1
        rather than silently receiving a code from a re-fitted encoder.
    """
    return pd.DataFrame(
        {fname: pd.Categorical(frame[fname],
                               categories=train_categories[fname]).codes.astype('int64')
         for fname in categorical_features},
        index=frame.index,
    )


# Resulting layout for both frames: numerical columns first, then the encoded
# categorical columns in `categorical_features` order.
encoded_X_train_full = _encode_with_train_categories(X_train_full)
numerical_X_train_full = X_train_full.drop(categorical_features, axis=1)
X_train_full = pd.concat([numerical_X_train_full, encoded_X_train_full], axis=1)

# Validation rows reuse the training mapping; nothing is fitted on them.
encoded_X_valid_full = _encode_with_train_categories(X_valid_full)
numerical_X_valid_full = X_valid_full.drop(categorical_features, axis=1)
X_valid_full = pd.concat([numerical_X_valid_full, encoded_X_valid_full], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [40]:
# Fill in missing values for every feature. The imputer is fitted on the
# training frame only and then applied unchanged to the validation frame.
# SimpleImputer() is used with its default settings (mean strategy per the
# scikit-learn documentation).
imputer = SimpleImputer()

# The imputer returns bare arrays; restore the column names AND the original
# row index at construction time so these frames stay label-aligned with
# y_train / y_valid instead of being silently renumbered from 0.
imputed_X_train_full = pd.DataFrame(imputer.fit_transform(X_train_full),
                                    columns=X_train_full.columns,
                                    index=X_train_full.index)
imputed_X_valid_full = pd.DataFrame(imputer.transform(X_valid_full),
                                    columns=X_valid_full.columns,
                                    index=X_valid_full.index)

# Columns actually fed to the model (PassengerId is deliberately not in this list).
features = ['Pclass', 'Sex', 'Embarked', 'Age', 'SibSp', 'Parch', 'Fare']
X_train = imputed_X_train_full[features].copy()
X_valid = imputed_X_valid_full[features].copy()

In [41]:
# Wrap the training and validation feature frames, with their labels, in
# LightGBM Dataset objects.
X_train_dataset = lgb.Dataset(X_train[features], label=y_train)
X_valid_dataset = lgb.Dataset(X_valid[features], label=y_valid)

# Binary objective, evaluated with AUC, 64 leaves per tree.
param = {
    'num_leaves': 64,
    'objective': 'binary',
    'metric': 'auc',
}

# Request 1000 boosting rounds. The validation set is supplied for evaluation,
# but no early-stopping argument is passed here; per-iteration evaluation
# output is switched off.
lgb_model = lgb.train(
    param,
    X_train_dataset,
    num_boost_round=1000,
    valid_sets=[X_valid_dataset],
    verbose_eval=False,
)

In [42]:
# Apply the same column preparation to the test table as was applied to the
# training table: remove Name, Ticket and Cabin in place, then cast every
# categorical column to str.
X_test_full.drop(columns=["Name", "Ticket", "Cabin"], inplace=True)
X_test_full = X_test_full.astype({fname: str for fname in categorical_features})

In [43]:
# Score the held-out validation rows and report the ROC AUC of those scores.
y_pred = lgb_model.predict(X_valid)
valid_auc = metrics.roc_auc_score(y_valid, y_pred)
print(valid_auc)

0.8477064220183486


In [44]:
# Encode the test set's categorical columns with the mapping the model was
# TRAINED on, instead of re-fitting an encoder on the test rows (which would
# derive the label -> integer numbering from test data and could disagree with
# the training numbering).
#
# X_full still holds the raw labels, and X_train_full keeps the row labels it
# was given by the split, so the training-time labels can be recovered here.
# The code of a label is its position among the sorted unique training labels,
# i.e. the numbering LabelEncoder assigns when fitted on the training column.
train_labels = X_full.loc[X_train_full.index, categorical_features].astype(str)
encoded_X_test_full = pd.DataFrame(
    {fname: pd.Categorical(X_test_full[fname].astype(str),
                           categories=sorted(train_labels[fname].unique())).codes.astype('int64')
     for fname in categorical_features},
    index=X_test_full.index,
)  # a label never seen in training is encoded as -1

# Same layout as the training frame: numerical columns first, then the
# encoded categorical columns.
numerical_X_test_full = X_test_full.drop(categorical_features, axis=1)
X_test_full = pd.concat([numerical_X_test_full, encoded_X_test_full], axis=1)

print(len(X_test_full))

# Reuse the imputer that was fitted on the training frame (transform only),
# keeping column names and row index on the result.
imputed_X_test_full = pd.DataFrame(imputer.transform(X_test_full),
                                   columns=X_test_full.columns,
                                   index=X_test_full.index)
X_test = imputed_X_test_full[features].copy()

print(len(X_test))

418
418


In [45]:
# Predict on the test rows. `num_iteration` is passed by keyword: given
# positionally, the second argument of Booster.predict is `start_iteration`
# in LightGBM >= 3.0, which is not what is intended here.
test_proba = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

# Turn scores into 0/1 labels. For scores in [0, 1] (what the binary objective
# is documented to return), `> 0.5` gives exactly the labels np.round() would:
# np.round rounds halves to even, so 0.5 itself maps to 0. It also avoids
# depending on numpy, which the visible import cell does not import.
test_pred = (test_proba > 0.5).astype(int)

# PassengerId went through the imputer and is therefore float; cast both
# columns back to int before writing the file.
submission = pd.DataFrame({'PassengerId': imputed_X_test_full.PassengerId,
                       'Survived': test_pred})
submission = submission.astype({'PassengerId':int, 'Survived':int})
submission.to_csv('submission.csv', index=False)
print(submission)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         1
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]
