In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the training split from the competition input directory
train_df = pd.read_csv(filepath_or_buffer='/kaggle/input/spaceship-titanic/train.csv')
# Preview the first five rows
train_df.head(n=5)

**Check for missing values in the training data**

In [3]:
# Boolean frame shaped like train_df: True wherever a value is missing
missing_data = train_df.isna()
missing_data.head()

In [4]:
# Per column: print its name, the counts of missing (True) vs present (False)
# cells, and an empty line as separator
for column in missing_data.columns:
    print(column, missing_data[column].value_counts(), "", sep="\n")

In [5]:
# Fill the missing values of each categorical column with that column's most
# frequent value.
#
# The filled frame is assigned back to `train_df`. The earlier form,
# `train_df[col].replace(np.nan, value, inplace=True)`, modifies an intermediate
# Series selected from the frame (chained assignment); pandas deprecates this
# and, with Copy-on-Write enabled, it leaves `train_df` unchanged.

max_hom_p = train_df['HomePlanet'].value_counts().idxmax()
max_cry = train_df['CryoSleep'].value_counts().idxmax()
max_cab = train_df['Cabin'].value_counts().idxmax()
max_dest = train_df['Destination'].value_counts().idxmax()
max_vip = train_df['VIP'].value_counts().idxmax()

# Only the columns named in the mapping are filled; all others are untouched
train_df = train_df.fillna({
    'HomePlanet': max_hom_p,
    'CryoSleep': max_cry,
    'Cabin': max_cab,
    'Destination': max_dest,
    'VIP': max_vip,
})


In [6]:
# Fill the missing values of each continuous column with that column's mean.
#
# The result is assigned back to the frame instead of calling
# `train_df[col].replace(..., inplace=True)`: that form acts on an intermediate
# Series (chained assignment), which pandas deprecates and which does not
# update `train_df` when Copy-on-Write is enabled.

mean_filled_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# DataFrame.fillna with a Series fills each column with the value stored under
# that column's label, i.e. its own mean
train_df[mean_filled_columns] = train_df[mean_filled_columns].fillna(
    train_df[mean_filled_columns].mean()
)


In [7]:
# Inspect the dtype of every column to check that each one has the expected type
train_df.dtypes

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the numerical columns of the training frame into [0, 1]
numerical = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn the per-column minimum/maximum first, then overwrite the columns with
# their scaled values
scaler.fit(train_df[numerical])
train_df[numerical] = scaler.transform(train_df[numerical])

# Preview the frame with scaling applied
train_df.head()

In [9]:
from sklearn import preprocessing

# Integer-encode the categorical columns and the target with a LabelEncoder
# (this is label encoding, not one-hot encoding).
categorical = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
le = preprocessing.LabelEncoder()

# train_df_transform is a second name for train_df, not a copy: the encoded
# values are written into train_df as well.
train_df_transform = train_df

# The single encoder is re-fitted for every column, so after the loop it only
# remembers the classes of the last one ('Transported').
for encoded_col in ('HomePlanet', 'Destination', 'VIP', 'CryoSleep', 'Cabin', 'Transported'):
    train_df_transform[encoded_col] = le.fit_transform(train_df_transform[encoded_col])


train_df_transform.head()

**Explore the variable distributions and the relationship between each variable and the `Transported` target (EDA)**

In [10]:
import matplotlib.pyplot as plt
# Render matplotlib figures inside the notebook output
%matplotlib inline
# Draw pandas' default histogram grid for the encoded and scaled training frame
train_df_transform.hist()
plt.show()


In [11]:
import seaborn as sns
# Box plot of Age, one box per value of the Transported target
sns.boxplot(data=train_df_transform, x='Transported', y='Age')


In [12]:
# Box plot of Spa, one box per value of the Transported target
sns.boxplot(data=train_df_transform, x='Transported', y='Spa')

In [13]:
# Box plot of FoodCourt, one box per value of the Transported target
sns.boxplot(data=train_df_transform, x='Transported', y='FoodCourt')


In [14]:
# Box plot of RoomService, one box per value of the Transported target
sns.boxplot(data=train_df_transform, x='Transported', y='RoomService')


In [15]:
# Box plot of ShoppingMall, one box per value of the Transported target
sns.boxplot(data=train_df_transform, x='Transported', y='ShoppingMall')


In [16]:
# Box plot of VRDeck, one box per value of the Transported target
sns.boxplot(data=train_df_transform, x='Transported', y='VRDeck')

In [17]:
# Bar per HomePlanet code showing the aggregated Transported value, with the
# error bars switched off.
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of `errorbar=None` -
# confirm the installed version before switching.
sns.barplot(data=train_df_transform, x='HomePlanet', y='Transported', ci=None)


In [18]:
# Bar per Destination code showing the aggregated Transported value, with the
# error bars switched off
sns.barplot(data=train_df_transform, x='Destination', y='Transported', ci=None)


In [19]:
# Bar per CryoSleep code showing the aggregated Transported value, with the
# error bars switched off
sns.barplot(data=train_df_transform, x='CryoSleep', y='Transported', ci=None)


In [20]:
# Bar per VIP code showing the aggregated Transported value, with the error
# bars switched off
sns.barplot(data=train_df_transform, x='VIP', y='Transported', ci=None)


In [21]:
import scipy
from scipy.stats import chi2_contingency

# Chi-square test of independence between each categorical variable and the
# Transported target.
#
# `chi2_contingency` is imported by name: a bare `import scipy` does not by
# itself guarantee that the `scipy.stats` submodule is loaded, so
# `scipy.stats.chi2_contingency` could fail depending on the SciPy version and
# on what other cells happened to import beforehand.


def _transported_crosstab(column):
    """Return the contingency table of `column` against 'Transported'."""
    return pd.crosstab(train_df_transform[column], train_df_transform["Transported"])


ct_table_home = _transported_crosstab("HomePlanet")
chi2_stat_h, p_h, dof_h, expected_h = chi2_contingency(ct_table_home)

ct_table_dest = _transported_crosstab("Destination")
chi2_stat_d, p_d, dof_d, expected_d = chi2_contingency(ct_table_dest)

ct_table_cryo = _transported_crosstab("CryoSleep")
chi2_stat_c, p_c, dof_c, expected_c = chi2_contingency(ct_table_cryo)

ct_table_vip = _transported_crosstab("VIP")
chi2_stat_v, p_v, dof_v, expected_v = chi2_contingency(ct_table_vip)

# p-values for all four variables
print(f"p-value for HomePlanet:            {p_h:.5g}")
print(f"p-value for Destination:            {p_d:.5g}")
print(f"p-value for CryoSleep:            {p_c:.5g}")
print(f"p-value for VIP:            {p_v:.5g}")
# Full detail for CryoSleep only
print(f"chi2 statistic_cryo:     {chi2_stat_c:.5g}")
print(f"degrees of freedom_cryo: {dof_c}")
print("expected frequencies cryo:\n",expected_c)



In [22]:
from scipy.stats import shapiro

# Shapiro-Wilk normality check of every numerical column; the outcome is used
# to pick the test for the comparison against the target.
# NOTE(review): SciPy documents that the Shapiro p-value may be inaccurate for
# N > 5000 - confirm the number of rows.
shapiro_by_column = {
    column_name: shapiro(train_df_transform[column_name])
    for column_name in ("Age", "Spa", "VRDeck", "FoodCourt", "RoomService", "ShoppingMall")
}
n_age = shapiro_by_column["Age"]
n_spa = shapiro_by_column["Spa"]
n_vrdeck = shapiro_by_column["VRDeck"]
n_fcourt = shapiro_by_column["FoodCourt"]
n_rserv = shapiro_by_column["RoomService"]
n_shmall = shapiro_by_column["ShoppingMall"]

# Each line shows the whole result object returned by shapiro
for label, outcome in (
    ("n_age_p           ", n_age),
    ("n_spa_p:            ", n_spa),
    ("n_vrdeck_p:            ", n_vrdeck),
    ("n_fcourt_p:            ", n_fcourt),
    ("n_rserv_p:            ", n_rserv),
    ("n_shmall_p:            ", n_shmall),
):
    print(f"{label}{outcome}")


In [23]:
# Keep the numerical columns plus the target, then split the rows by target value
train_df2 = train_df_transform.loc[
    :, ['Age', 'Spa', 'VRDeck', 'FoodCourt', 'RoomService', 'ShoppingMall', 'Transported']
]
train_df0 = train_df2.loc[train_df2['Transported'].eq(0)]
train_df1 = train_df2.loc[train_df2['Transported'].eq(1)]

In [24]:
from scipy.stats import mannwhitneyu

# Compare every numerical variable between the two target groups
# (train_df0: Transported == 0, train_df1: Transported == 1) with the
# Mann-Whitney U test, chosen because the numerical variables were found to be
# non-normally distributed.


def _mann_whitney(column):
    """Two-sided Mann-Whitney U test of `column` between the two target groups.

    `alternative` is passed explicitly because its default differs between
    SciPy releases, which would otherwise change the reported p-value.
    Returns the (U statistic, p-value) pair.
    """
    return mannwhitneyu(train_df0[column], train_df1[column], alternative='two-sided')


U_age, p_age = _mann_whitney('Age')
U_spa, p_spa = _mann_whitney('Spa')
U_vrd, p_vrd = _mann_whitney('VRDeck')
U_fc, p_fc = _mann_whitney('FoodCourt')
U_rs, p_rs = _mann_whitney('RoomService')
U_sm, p_sm = _mann_whitney('ShoppingMall')

# Labels name the test that produced the value (they previously reused the
# "n_*_p" labels of the normality cell)
print(f"Mann-Whitney p-value for Age:            {p_age}")
print(f"Mann-Whitney p-value for Spa:            {p_spa}")
print(f"Mann-Whitney p-value for VRDeck:            {p_vrd}")
print(f"Mann-Whitney p-value for FoodCourt:            {p_fc}")
print(f"Mann-Whitney p-value for RoomService:            {p_rs}")
print(f"Mann-Whitney p-value for ShoppingMall:            {p_sm}")

In [25]:

# Remove the Name and Cabin columns, then separate the inputs from the target
train_f = train_df_transform.drop(columns=['Name', 'Cabin'])
# Label-based slice: every column from 'PassengerId' through 'VRDeck', both included.
# NOTE(review): this keeps the PassengerId column in the model inputs - confirm
# that is intended.
xtrain = train_f.loc[:, 'PassengerId':'VRDeck']
ytrain = train_f['Transported']

In [26]:
# Load the test split from the competition input directory
test_df = pd.read_csv(filepath_or_buffer='/kaggle/input/spaceship-titanic/test.csv')
# Preview the first five rows
test_df.head(n=5)

In [27]:
# Boolean frame shaped like test_df: True wherever a value is missing
missing_data2 = test_df.isna()
# Per column: print its name, the counts of missing (True) vs present (False)
# cells, and an empty line as separator
for column in missing_data2.columns:
    print(column, missing_data2[column].value_counts(), "", sep="\n")

**Apply the same preprocessing steps to the test data as were applied to the training data**

In [28]:
# Fill the missing values of the test data: most frequent value for the
# categorical columns, column mean for the continuous columns.
#
# The filled frame is assigned back to `test_df`. The earlier form,
# `test_df[col].replace(np.nan, value, inplace=True)`, modifies an intermediate
# Series selected from the frame (chained assignment); pandas deprecates this
# and, with Copy-on-Write enabled, it leaves `test_df` unchanged.
#
# NOTE(review): the fill values are computed from the test data itself rather
# than reused from the training data - confirm that is intended.

max_hom_p = test_df['HomePlanet'].value_counts().idxmax()
max_cry = test_df['CryoSleep'].value_counts().idxmax()
max_cab = test_df['Cabin'].value_counts().idxmax()
max_dest = test_df['Destination'].value_counts().idxmax()
max_vip = test_df['VIP'].value_counts().idxmax()

mean_filled_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# One mapping of column -> fill value; columns not listed are left untouched
fill_values = {
    'HomePlanet': max_hom_p,
    'CryoSleep': max_cry,
    'Cabin': max_cab,
    'Destination': max_dest,
    'VIP': max_vip,
    **test_df[mean_filled_columns].mean().to_dict(),
}
test_df = test_df.fillna(fill_values)
test_df.head()


In [29]:
from sklearn.preprocessing import MinMaxScaler

# Scale the numerical test columns with the `scaler` that was fitted on the
# training data in the training normalisation cell.
#
# A new MinMaxScaler fitted on the test data would use the test minimum and
# maximum, so the same raw value would map to different scaled values in the
# training and test frames and the model would see inconsistently scaled
# inputs. Only `transform` is called here; the scaler is not re-fitted.
#
# Run this cell once per kernel session: it overwrites the columns in place,
# so running it again would scale already-scaled values.
numerical = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

test_df[numerical] = scaler.transform(test_df[numerical])

# Show an example of a record with scaling applied
test_df.head()

In [30]:
from sklearn import preprocessing

# Integer-encode the categorical test columns with a LabelEncoder that is
# fitted on the test values themselves.
# NOTE(review): the codes only line up with the training encoding if both
# splits contain the same set of values in each column - confirm.
categorical = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
le = preprocessing.LabelEncoder()

# test_df_transform is a second name for test_df, not a copy
test_df_transform = test_df
for encoded_col in ('HomePlanet', 'Destination', 'VIP', 'CryoSleep'):
    test_df_transform[encoded_col] = le.fit_transform(test_df_transform[encoded_col])


test_df_transform.head()

In [31]:
# Remove the Name and Cabin columns from the test data as well
test_f = test_df_transform.drop(columns=['Name', 'Cabin'])


**Apply several models and choose the best one according to the evaluation**

In [32]:
from sklearn import model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# Cross-validation settings
num_folds = 10
num_instances = len(train_f)
seed = 7

# PassengerId is the raw row identifier from the CSV (no cell encodes or scales
# it), so it is kept out of the features used for model comparison.
# errors='ignore' keeps the cell working if the column is already absent.
cv_features = xtrain.drop(columns=['PassengerId'], errors='ignore')

# Candidate models; the stochastic ones receive the seed so that the reported
# scores are reproducible from run to run
models = [
    ('LDA', LinearDiscriminantAnalysis()),
    ('ADB', AdaBoostClassifier(random_state=seed)),
    ('RF', RandomForestClassifier(random_state=seed)),
]

# Model evaluation: per-fold accuracies and the matching model names
results = []
names = []

# The splitter does not depend on the model, so it is built once
kfold = model_selection.KFold(n_splits=num_folds)

for name, model in models:
    cv_results = model_selection.cross_val_score(
        model, cv_features, ytrain, cv=kfold, scoring='accuracy'
    )
    results.append(cv_results)
    names.append(name)
    # name: mean accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)



In [33]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline

# Gradient boosting model wrapped in a pipeline whose first step drops
# PassengerId. That column is the raw row identifier from the CSV (no cell
# encodes or scales it) and should not act as a predictor. Dropping it inside
# the pipeline means fit, score and predict can still be called with xtrain and
# test_f exactly as before.
Model2 = make_pipeline(
    ColumnTransformer([('drop_id', 'drop', ['PassengerId'])], remainder='passthrough'),
    GradientBoostingClassifier(n_estimators=850, learning_rate=1, max_depth=2, random_state=0),
)

# Train the model. The training score is not computed here: as a bare
# expression in the middle of a cell its value was discarded.
Model2.fit(xtrain, ytrain)

# Predictions for the test data
ypredgb = Model2.predict(test_f)

In [34]:
# Accuracy of Model2 on xtrain/ytrain, as a percentage rounded to two decimals
acc_log = round(100 * Model2.score(xtrain, ytrain), 2)
acc_log

**Based on the previous steps, we choose the gradient boosting model for prediction**

In [35]:
# Convert the predicted labels to booleans
Transported = ypredgb.astype(bool)
# Preview the first five predictions
Transported[:5]

In [36]:
# Two-column submission frame: each test PassengerId next to its predicted label
submission = test_f[["PassengerId"]].assign(Transported=Transported)
submission.shape

In [37]:
# Uncomment the next line to write the submission file to the current directory
# submission.to_csv('.//submission.csv', index=False)