In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score

from lightgbm import LGBMClassifier
#!pip3 install -U lazypredict
import lazypredict
from lazypredict.Supervised import LazyClassifier

import time
import warnings
warnings.filterwarnings('ignore')
from IPython.display import clear_output


ModuleNotFoundError: No module named 'lazypredict'

In [None]:
# Read the training and test splits from the Kaggle input directory,
# then show the (rows, columns) shape of each.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/spaceship-titanic/train.csv',
        '/kaggle/input/spaceship-titanic/test.csv',
    )
)
train.shape, test.shape

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics of the training frame (pandas `describe` defaults).
train.describe()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Number of missing values in each training column.
train.isna().sum()

# **EDA**

In [None]:
# Class balance of the target column.
# The column is named via `x=` because seaborn 0.12+ interprets the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='Transported', data=train);

In [None]:
# Row count for every (HomePlanet, Transported) combination.
HomeTransported = (
    train
    .groupby(['HomePlanet', 'Transported'])
    .size()
)
HomeTransported

In [None]:
# Row count for every (Transported, Destination) combination.
DestinationTransported = (
    train
    .groupby(['Transported', 'Destination'])
    .size()
)
DestinationTransported

In [3]:
# Age distribution of the train and test splits, overlaid on the same axes.
# `histplot(kde=True, stat='density')` draws the same histogram + KDE that the
# deprecated `distplot` produced, and it skips missing ages on its own.
sns.histplot(data=train, x='Age', color='red', label='train', kde=True, stat='density')
sns.histplot(data=test, x='Age', color='green', label='test', kde=True, stat='density')
plt.legend();

NameError: name 'train' is not defined

In [None]:
# Count plots for the low-cardinality categorical features, one per panel.
cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
fig, ax = plt.subplots(2, 2, figsize=(15, 12))
# Draw directly on the axes created by `plt.subplots` instead of re-creating
# them with `plt.subplot`, and name the column via `x=`: seaborn 0.12+ reads
# the first positional argument as `data`, not as the x variable.
for axis, col in zip(ax.ravel(), cols):
    sns.countplot(x=col, data=train, ax=axis)
    axis.set_title(col)

In [None]:
# Count plots of the categorical features, split by the target (Transported).
cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
fig, ax = plt.subplots(2, 2, figsize=(15, 12))
# Draw directly on the axes created by `plt.subplots` instead of re-creating
# them with `plt.subplot`, and pass column names by keyword: seaborn 0.12+
# reads the first positional argument as `data`, not as the x variable.
for axis, col in zip(ax.ravel(), cols):
    sns.countplot(x=col, hue='Transported', data=train, ax=axis)
    axis.set_title(col)

In [None]:
# Pairwise correlations of the numeric and boolean columns.
# The columns are selected explicitly because pandas 2.x no longer drops
# string columns inside `DataFrame.corr()` and raises on them instead.
sns.heatmap(train.select_dtypes(include=['number', 'bool']).corr(), annot=True);

In [None]:
# Fill gaps in the numeric columns with the per-column median.
# The medians are learned from the training split only and then reused,
# unchanged, for the test split.
imputer_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
imputer = SimpleImputer(strategy='median')
train[imputer_cols] = imputer.fit_transform(train[imputer_cols])
test[imputer_cols] = imputer.transform(test[imputer_cols])

In [None]:
# Mark missing home planets with the sentinel category 'Z'.
# The filled column is assigned back to the frame: `fillna(inplace=True)` on a
# column selection is a chained in-place update, which pandas warns about and
# which does not modify the parent frame under copy-on-write.
train['HomePlanet'] = train['HomePlanet'].fillna('Z')
test['HomePlanet'] = test['HomePlanet'].fillna('Z')
train.isnull().sum()

In [None]:
# Categorical columns that get converted to integer codes.
label_cols = [
    'HomePlanet',
    'CryoSleep',
    'Cabin',
    'Destination',
    'VIP',
]
def label_encoder(train, test, columns):
    """Integer-encode categorical columns with one mapping shared by both frames.

    Each listed column is cast to ``str``; its distinct labels are then
    numbered 0..n-1 in sorted order over the union of the values found in
    ``train`` and ``test``. Building the mapping once per column guarantees
    that a given label receives the same code in both frames. Encoding each
    frame independently does not: the codes then depend on which labels
    happen to be present in that frame.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain every column named in ``columns``.
        They are modified in place.
    columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``train`` and ``test`` objects, with the columns encoded.
    """
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        # One sorted label list per column, covering both frames.
        labels = sorted(set(train[col]) | set(test[col]))
        codes = {label: code for code, label in enumerate(labels)}
        train[col] = train[col].map(codes)
        test[col] = test[col].map(codes)
    return train, test
# Encode every column listed in `label_cols` in both frames; the names are
# rebound to the frames returned by `label_encoder`.
train, test = label_encoder(train, test, label_cols)

In [None]:
# Remove the columns that are not used as model inputs. `PassengerId` is
# dropped from `train` only: the submission cell still reads it from `test`.
# Reassigning with errors='ignore' (rather than dropping in place) keeps this
# cell re-runnable; a second in-place drop would raise KeyError.
train = train.drop(columns=['PassengerId', 'Name', 'Cabin'], errors='ignore')
test = test.drop(columns=['Name', 'Cabin'], errors='ignore')

# Feature matrix / target, then a 67/33 hold-out split with a fixed seed.
X = train.drop(columns='Transported')
y = train['Transported']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12, test_size=0.33)

# **Modeling**

In [None]:
# Fit lazypredict's classifier collection on the training part of the
# hold-out split and score it on the held-out part.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    predictions=False,
    random_state=12,
    classifiers='all',
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

In [None]:
# First 15 entries of the LazyClassifier results
# (presumably a score table with one row per model -- TODO confirm).
models[:15]

# **5-Fold LGBM Classifier**

In [None]:
# Hyper-parameters shared by the model trained in every fold.
lgb_params = {
    'objective': 'binary',
    'n_estimators': 50,
    'learning_rate': 0.08
}
# Running per-row average of the fold models' test predictions.
lgb_predictions = 0
lgb_scores = []   # validation accuracy of each fold
lgb_fimp = []     # one single-column feature-importance frame per fold
# Select the features by name rather than by position, so the list stays
# correct even if the target is not the last column.
LGBM_FEATURES = [col for col in train.columns if col != 'Transported']
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)
for fold, (train_idx, valid_idx) in enumerate(skf.split(train[LGBM_FEATURES], train['Transported'])):
    print('')
    start_time = time.time()

    X_train, X_valid = train.iloc[train_idx][LGBM_FEATURES], train.iloc[valid_idx][LGBM_FEATURES]
    y_train, y_valid = train['Transported'].iloc[train_idx], train['Transported'].iloc[valid_idx]

    model = LGBMClassifier(**lgb_params)
    # No `verbose=` here: LightGBM 4 removed that keyword from `fit`, and it
    # only controlled evaluation logging, which needs an eval_set anyway.
    model.fit(X_train, y_train)

    preds_valid = model.predict(X_valid)
    acc = accuracy_score(y_valid, preds_valid)
    lgb_scores.append(acc)
    run_time = time.time() - start_time

    fim = pd.DataFrame(index=LGBM_FEATURES, data=model.feature_importances_,
                       columns=[f'{fold}_importance'])
    lgb_fimp.append(fim)

    # `acc` is a fraction in [0, 1]; the `%` format spec scales it to a
    # percentage so the number matches the percent sign.
    print(f'Fold={fold+1}, Accuracy score: {acc:.2%}, Run Time: {run_time:.2f}s')
    test_preds = model.predict(test[LGBM_FEATURES])
    # Divide by the splitter's fold count instead of a hard-coded 5, so the
    # average stays correct if `n_splits` is changed.
    lgb_predictions += test_preds / skf.get_n_splits()
    print('')
print('Mean Accuracy: ', np.mean(lgb_scores))

In [None]:
# Put the per-fold importance columns side by side (first 15 rows) and draw
# them as horizontal bars, ordered by the '1_importance' column.
lgbm_fis_df = pd.concat(lgb_fimp, axis='columns').head(15)
(
    lgbm_fis_df
    .sort_values(by='1_importance')
    .plot.barh(figsize=(15, 10), title='Feature Importance Across Folds')
)
plt.show()

In [None]:
# `lgb_predictions` holds the per-row average of the fold predictions.
# Threshold it at 0.5 (majority of folds) to get the final label; casting the
# average straight to bool would mark a row True as soon as one fold said so.
output = pd.DataFrame({
    'PassengerId': test.PassengerId,
    'Transported': np.asarray(lgb_predictions) > 0.5,
})
output.to_csv('submission.csv', index=False)
output.head()