In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column when a wide DataFrame is displayed (None removes the limit).
# Call the public pd.set_option directly instead of going through `pd.pandas`.
pd.set_option('display.max_columns',None)

In [None]:
# Load train.csv from the spaceship-titanic input directory.
train=pd.read_csv('../input/spaceship-titanic/train.csv')


In [None]:
# Load test.csv from the spaceship-titanic input directory.
test=pd.read_csv('../input/spaceship-titanic/test.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Report the (rows, columns) of both splits.
print(f'Shape of train data {train.shape} and shape of test data {test.shape}')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train.describe()

We can see that the maximum age is 79, i.e. there is no passenger older than 79. We can also see
that the median age is 27, i.e. about 50% of the passengers are 27 or younger.

In [None]:
# Column dtypes and non-null counts of the training data.
train.info()

In [None]:
# Count how many columns there are of each dtype.
# The values are passed by keyword (`x=`) because newer seaborn releases treat
# the first positional argument as `data`; dtypes are converted to strings so
# they are plotted as ordinary category labels.
sns.countplot(x=train.dtypes.astype(str))
plt.show()

We can see that most features in the train data are of object type.

**Extracting names of features that contain numeric data, i.e. int or float**

In [None]:
# Names of the train columns whose dtype is neither object nor bool.
numeric_features_train=[
    name
    for name,dtype in train.dtypes.items()
    if not (dtype=='O' or dtype=='bool')
]
numeric_features_train

In [None]:
# Pairwise scatter plots (and per-feature distributions) of the numeric features.
sns.pairplot(data=train[numeric_features_train])

**Plotting histogram to see the distribution**

In [None]:
# One histogram (30 bins, with a KDE curve) per numeric feature, each on its own figure.
for feature in numeric_features_train:
  fig,ax=plt.subplots(figsize=(8,5))
  sns.histplot(data=train[feature],kde=True,bins=30,ax=ax)
  plt.show()


We can see that the data is not normally distributed; we will handle this later.

In [None]:
# Remove the 'PassengerId' and 'Name' columns from the training data.
# Note: re-running this cell without reloading `train` raises a KeyError.
train=train.drop(columns=['PassengerId','Name'])

In [None]:
# Names of the train columns stored with object dtype.
categorical_features_train=[
    name for name,dtype in train.dtypes.items() if dtype=='O'
]
categorical_features_train

In [None]:
# One count plot of the target per categorical feature, split by that feature.
# 'Cabin' is skipped (presumably because it has too many distinct values to
# plot as a hue - TODO confirm). The figure is only created for features that
# are actually plotted, so no empty figure is left behind for 'Cabin'.
# `x` is passed by keyword: newer seaborn releases no longer accept it
# positionally together with `data=`.
for feature in categorical_features_train:
  if feature=='Cabin':
    continue
  plt.figure(figsize=(8,5))
  sns.countplot(x='Transported',hue=feature,data=train)
  plt.show()

In [None]:
# Class balance of the target. `x` is passed by keyword because newer seaborn
# releases no longer accept it positionally together with `data=`.
sns.countplot(x='Transported',data=train)

We can see that the numbers of transported and not-transported passengers are almost equal.

In [None]:
# Number of transported passengers per VIP value, shown as each group's share.
transported_by_vip=train.groupby('VIP')['Transported'].sum()
transported_by_vip.plot(kind='pie',autopct="%.0f%%",colors=['red','orange'])

# **Handling Missing Values**

In [None]:
# Heatmap of the boolean missing-value mask: one row per passenger, one column
# per feature; row labels are hidden.
sns.heatmap(train.isnull(),yticklabels=False)
plt.show()

We can see there are null values in almost every column; now let's handle them.

In [None]:
# Number of missing values per column.
train.isnull().sum()



> **Function to handle missing values for numeric features.**





In [None]:
def remove_numeric_null(dataset,col):
  """Fill missing values of numeric columns with the column mean.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Frame to impute. It is modified in place, so pass a copy to keep the original.
  col : iterable of str
      Names of the columns to impute.

  Returns
  -------
  pandas.DataFrame
      The same `dataset` object, with the listed columns imputed.
  """
  for name in col:
    # The mean is computed from the non-missing values of this column only.
    column_mean=dataset[name].mean()
    dataset[name]=dataset[name].fillna(column_mean)
  return dataset



> **Function to handle missing values for categorical features.**



In [None]:
def remove_categorical_null(dataset,col):
  """Fill missing values of categorical columns with the most frequent value.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Frame to impute. It is modified in place, so pass a copy to keep the original.
  col : iterable of str
      Names of the columns to impute.

  Returns
  -------
  pandas.DataFrame
      The same `dataset` object, with the listed columns imputed.

  Notes
  -----
  When several values are tied for most frequent, the first one returned by
  `Series.mode()` is used. A column with no non-missing value has no mode and
  is left unchanged instead of raising.
  """
  for f in col:
    modes=dataset[f].mode()
    if modes.empty:
      # Entirely missing column: there is nothing to impute with.
      continue
    dataset[f]=dataset[f].fillna(modes.iloc[0])
  return dataset

In [None]:
# Missing values per numeric feature, before imputation.
train[numeric_features_train].isnull().sum()

We can see that almost all numeric features have null values. 😢

In [None]:
# Mean-impute the numeric features. A copy is passed because the helper
# writes into the frame it receives.
train=remove_numeric_null(train.copy(),numeric_features_train)
# Re-count the missing values in the numeric features after imputation.
train[numeric_features_train].isnull().sum()

We have removed missing values for numeric data 😀


> Lets Handle missing values for categorical features.



In [None]:
# Missing values per categorical feature, before imputation.
train[categorical_features_train].isnull().sum()

We can see every categorical feature has null values. Let's Handle them now!

In [None]:
# Mode-impute the categorical features. A copy is passed because the helper
# writes into the frame it receives.
train=remove_categorical_null(train.copy(),categorical_features_train)
# Re-count the missing values in the categorical features after imputation.
train[categorical_features_train].isnull().sum()

Boom 💣 We have handled missing values both for numeric and categorical features. 😎



> **Doing same work for test data now.**



In [None]:
# Column dtypes and non-null counts of the test data.
test.info()

In [None]:
# Names of the test columns whose dtype is neither object nor bool.
numeric_features_test=[
    name
    for name,dtype in test.dtypes.items()
    if not (dtype=='O' or dtype=='bool')
]
print(f'Numeric Features: {numeric_features_test}')

Saving passenger id for submission purpose.

In [None]:
# Keep the ids before the column is dropped; they are needed for the submission file.
test_id=test['PassengerId']

In [None]:
# Remove the 'PassengerId' and 'Name' columns from the test data.
# Note: re-running this cell without reloading `test` raises a KeyError.
test=test.drop(columns=['PassengerId','Name'])

In [None]:

# Names of the test columns stored with object dtype.
categorical_features_test=[
    name for name,dtype in test.dtypes.items() if dtype=='O'
]
print(f'Categorical Features : {categorical_features_test}')

In [None]:
# Missing values per numeric feature of the test data, before imputation.
test[numeric_features_test].isnull().sum()

We can see in test all numeric features have null values as well. Lets handle them!

In [None]:
# Mean-impute the numeric test features. The means are computed from the test
# data itself, not from the training data.
# NOTE(review): train and test are therefore imputed with different values - confirm this is intended.
test=remove_numeric_null(test.copy(),numeric_features_test)
# Re-count the missing values in the numeric features after imputation.
test[numeric_features_test].isnull().sum()

Let's handle missing values for categorical features 😎

In [None]:
# Missing values per categorical feature of the test data, before imputation.
test[categorical_features_test].isnull().sum()

All categorical features have null values.

In [None]:
# Mode-impute the categorical test features, using the modes of the test data itself.
test=remove_categorical_null(test.copy(),categorical_features_test)
# Re-count the missing values in the categorical features after imputation.
test[categorical_features_test].isnull().sum()

So far we have handled missing values for both train and test data. The data contains categorical features as well as numeric features with a large range, so let's label the categorical features and rescale the numeric features.

In [None]:
# Split each 'Cabin' string on '/' into three new columns: 'Deck', 'Num' and 'Side'.
# This runs after imputation, so every row has a cabin string to split.
train[['Deck','Num','Side']] = train['Cabin'].str.split('/',expand=True)
train.head(2)

In [None]:
# Same split for the test data: 'Cabin' -> 'Deck', 'Num', 'Side'.
test[['Deck','Num','Side']] = test['Cabin'].str.split('/',expand=True)
test.head(2)

In [None]:
# 'Cabin' has been split into 'Deck', 'Num' and 'Side', so the raw column is removed.
train=train.drop(columns='Cabin')
test=test.drop(columns='Cabin')

As we saw earlier, the data is not normally distributed. We will bring it closer to a normal distribution with a log transform, log(value + c), where c is a constant. I took c = 0.00001, very close to zero, because the data contains zero values as well and log(0) is not defined.

In [None]:
def normalize(dataset,col,c=0.00001):
    """Replace a column with its natural log after adding a small offset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to transform. It is modified in place, so pass a copy to keep the original.
    col : str or list of str
        Column (or columns) to transform.
    c : float, default 0.00001
        Offset added before taking the log so that zero values stay finite.

    Returns
    -------
    pandas.DataFrame
        The same `dataset` object with `col` replaced by log(col + c).
    """
    dataset[col]=np.log(dataset[col]+c)
    return dataset

In [None]:
# Log-transform every numeric feature in both frames.
# Each frame is copied once up front (instead of once per column) because
# normalize() writes into the frame it receives.
# Note: this cell is not idempotent - re-running it applies the log a second time.
train=train.copy()
test=test.copy()
for col in numeric_features_train:
    train=normalize(train,col)
    test=normalize(test,col)

# **Labeling Data**

Labeling categorical features that have only 2 categories. For example, VIP contains True or False, so True is replaced with 1 and False with 0.

In [None]:
# Encode 'CryoSleep' as integers in both frames: False -> 0, anything else -> 1.
for frame in (test,train):
    frame['CryoSleep']=np.where(frame['CryoSleep']==False,0,1)


Trying a different way to label, using a lambda function.

In [None]:
# Encode 'VIP' as integers in both frames: True -> 1, anything else -> 0.
test['VIP']=test['VIP'].apply(lambda val: 1 if val==True else 0)
train['VIP']=train['VIP'].apply(lambda val: 1 if val==True else 0)

In [None]:
# Check the encoded columns on the first two training rows.
train.head(2)

In [None]:
# Check the encoded columns on the first two test rows.
test.head(2)

In [None]:
# 'Num' comes out of the string split of 'Cabin', so it holds strings; convert it to integers.
train['Num']=train.Num.astype('int64')
test['Num']=test.Num.astype('int64')

In [None]:
# Re-check dtypes and non-null counts of the training data after the transformations.
train.info()

In [None]:
# Re-check dtypes and non-null counts of the test data after the transformations.
test.info()



> **Function to create dummy values**: If a column contains three values, e.g. 'a', 'b' and 'c', it creates 3 columns (column 'a', column 'b' and column 'c') and marks each row with 1 in the column of the value it contains and 0 in the others. `pd.get_dummies` also has a `drop_first` option that drops the first column, so that 3 values give only 2 columns; it is not enabled in this notebook.



In [None]:
def dummy_values(dataset,col,drop_first=False):
  """Replace a categorical column by one indicator column per category.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Input frame; it is not modified, a new frame is returned.
  col : str
      Name of the column to encode.
  drop_first : bool, default False
      Passed to `pd.get_dummies`; when True the first category's column is
      omitted, so k categories produce k-1 columns.

  Returns
  -------
  pandas.DataFrame
      `dataset` without `col`, followed by the indicator columns. The new
      columns are named after the category values (no prefix is added).
  """
  dummy_col=pd.get_dummies(dataset[col],drop_first=drop_first)
  dataset=dataset.drop(col,axis=1)
  dataset=pd.concat([dataset,dummy_col],axis=1)
  return dataset

In [None]:
# One-hot encode the multi-category features in both frames.
# NOTE(review): train and test are encoded independently, so their columns only
# match if both contain the same categories - confirm before predicting on test.
cat=['HomePlanet','Destination','Deck','Side']
for f in cat:
  train=dummy_values(train.copy(),f)
  test=dummy_values(test.copy(),f)


In [None]:
# Check the new indicator columns on the first two training rows.
train.head(2)

In [None]:
# Check the new indicator columns on the first two test rows.
test.head(2)

We have labelled our columns now lets label the target value that is **'Transported'**

In [None]:
from sklearn.preprocessing import LabelEncoder
# LabelEncoder maps the sorted distinct target values to 0, 1, ...
encoder=LabelEncoder()
train['Transported']=encoder.fit_transform(train['Transported'])
train.head(2)

# **Modeling**

We have done the necessary preprocessing steps let's move on to modelling part now.



> **I will be using multiple models like Logistic Regression, SVM and DecisionTree. To improve on DecisionTree I will use RandomForest, and I will also use GridSearchCV/RandomizedSearchCV to choose the best parameters for the model (hyperparameter tuning). To improve on RandomForest I will use GradientBoosting, and I will also use other ensemble methods like Stacking, XGBoost etc**.



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,StackingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier




In [None]:
# Target vector and feature matrix (every column except the target).
y=train['Transported']
X=train.drop(columns='Transported')

In [None]:
# Hold out 30% of the labelled data for validation. The split is seeded so that
# every score below is reproducible across Restart & Run All.
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.3,random_state=42)

# **Logistic Regression**

In [None]:
# Baseline: logistic regression with default settings.
# NOTE(review): the features are not standardised, so the default solver may stop
# at its iteration limit with a convergence warning - confirm.
logreg=LogisticRegression()
logreg.fit(xtrain,ytrain)

In [None]:
# Accuracy on the held-out split.
logreg.score(xtest,ytest)

# **SVM**



In [None]:
# Support vector classifier with default settings.
# NOTE(review): SVC is sensitive to feature scale and 'Num' is not rescaled - confirm this is intended.
svm=SVC()
svm.fit(xtrain,ytrain)


In [None]:
# Accuracy on the held-out split.
svm.score(xtest,ytest)

I haven't set any parameters yet. Let's tune the hyperparameters with GridSearchCV and check the accuracy.

# **GridSearchCV**

In [None]:
# Candidate values for the SVC regularisation parameter C.
param={'C':[0.01,0.1,1,5,10,26]}

# Exhaustive search over `param` with 6-fold cross-validation on the training split.
svm1=SVC()
cv=GridSearchCV(estimator=svm1,param_grid=param,cv=6)
cv.fit(xtrain,ytrain)

In [None]:
# Mean cross-validated score of the best parameter setting.
cv.best_score_

In [None]:
# The SVC refitted with the best parameters found by the search.
cv.best_estimator_

In [None]:
# Accuracy of the best estimator on the held-out split.
cv.score(xtest,ytest)

**As we can see it improved our accuracy. 😀**

# **KNeighborsClassifier**

In [None]:
# k-nearest-neighbours classifier voting over the 15 closest training rows.
# NOTE(review): distances are computed on unscaled features - confirm this is intended.
knn=KNeighborsClassifier(n_neighbors=15)
knn.fit(xtrain,ytrain)

In [None]:
# Accuracy on the held-out split.
knn.score(xtest,ytest)

# **DecisionTreeClassifier**

In [None]:
# Decision tree limited to depth 8; seeded like the random forest below so the
# fitted tree (and its score) is reproducible.
dt=DecisionTreeClassifier(max_depth=8,random_state=42)
dt.fit(xtrain,ytrain)

In [None]:
# Accuracy on the held-out split.
dt.score(xtest,ytest)

You can also try other parameters, which may increase the accuracy; I have used just the max_depth parameter for now. Now let's improve the accuracy by using RandomForestClassifier.

# **RandomForestClassifier**

In [None]:
# Random forest of 600 trees (depth <= 15, at least 20 samples to split a node), seeded for reproducibility.
rf=RandomForestClassifier(n_estimators=600, max_depth=15,min_samples_split=20, random_state=42)
rf.fit(xtrain,ytrain)


In [None]:
# Accuracy on the held-out split.
rf.score(xtest,ytest)

We have improved accuracy by using RandomForestClassifier. Let's try GradientBoostingClassifier now.

In [None]:
# Gradient boosting: 450 trees of depth <= 9 with a small learning rate.
# Seeded like the random forest above so the score is reproducible.
gd=GradientBoostingClassifier(learning_rate=0.03,n_estimators=450,max_depth=9,min_samples_split=8,random_state=42)
gd.fit(xtrain,ytrain)

In [None]:
# Accuracy on the held-out split.
gd.score(xtest,ytest)

# **XGBOOST**

In [None]:
# XGBoost with 670 boosting rounds; seeded like the random forest above for reproducibility.
xgb=XGBClassifier(learning_rate=0.07,n_estimators=670,random_state=42)
xgb.fit(xtrain,ytrain)

In [None]:
# Accuracy on the held-out split.
xgb.score(xtest,ytest)

We have improved accuracy.

# **LightGBM**

In [None]:
# LightGBM (gradient-boosted trees, binary objective) with 800 rounds of depth <= 10;
# seeded like the random forest above for reproducibility.
lgb=LGBMClassifier(learning_rate=0.06,n_estimators=800,max_depth=10,boosting_type='gbdt',objective='binary',random_state=42)
lgb.fit(xtrain,ytrain)

In [None]:
# Accuracy on the held-out split.
lgb.score(xtest,ytest)

Let's check on stacking.

# **Stacking**

In [None]:
# Base learners of the stack. All four are seeded with the same value (originally
# only the random forest was), so the ensemble is reproducible across runs.
model1=LGBMClassifier(learning_rate=0.06,n_estimators=800,max_depth=10,boosting_type='gbdt',objective='binary',random_state=42)
model2 = RandomForestClassifier(n_estimators=600, max_depth=15,min_samples_split=20, random_state=42)
model3=GradientBoostingClassifier(learning_rate=0.05,n_estimators=250,max_depth=7,min_samples_split=8,random_state=42)
model4 = XGBClassifier(learning_rate=0.07,n_estimators=670,random_state=42)
estimator = [('model1',model1),('model2',model2),('model3',model3),('model4',model4)]
# Meta-learner: a strongly regularised logistic regression that combines the
# predictions of the base learners.
lr = LogisticRegression(C=0.01)
model = StackingClassifier(estimators=estimator,final_estimator=lr)
model.fit(xtrain,ytrain)

In [None]:
# Accuracy of the stacked model on the held-out split.
model.score(xtest,ytest)

In [None]:
# Predict on the competition test set. The test frame is first aligned to the
# exact columns (and column order) the model was trained on: train and test were
# one-hot encoded separately, so an indicator column that never occurs in test
# is added filled with 0 and a column unknown to the model is dropped.
ypred=model.predict(test.reindex(columns=X.columns,fill_value=0))

In [None]:
# Convert the 0/1 predictions to booleans (0 -> False, everything else -> True).
y_pred=ypred.astype(bool)
y_pred[:5]

In [None]:
# Build the submission: one row per test passenger id with its boolean prediction.
df=pd.DataFrame({'PassengerId':test_id,
                 'Transported':y_pred})
# Written without the index column to 'sub13.csv' in the working directory.
df.to_csv('sub13.csv',index=False)
df.head()