In [7]:
import pandas as pd 
import numpy as np 

import sklearn
from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV

import random 
from collections import Counter
import matplotlib.pyplot as plt

import seaborn as sns
import os 
%matplotlib inline
%config InlineBackend.figure_format='retina'
plt.style.use('ggplot')

In [8]:

def set_seed(seed=123):
    """Seed NumPy's and the stdlib's global random generators.

    Parameters
    ----------
    seed : int, default 123
        Value passed to ``np.random.seed`` and ``random.seed``; its string
        form is also written to ``os.environ['PYTHONHASHSEED']``.
    """
    # Import locally: a later cell rebinds the global name `random` to an
    # int, so relying on the module-level name fails when this cell is re-run.
    import random as _stdlib_random

    np.random.seed(seed)
    _stdlib_random.seed(seed)
    # NOTE(review): presumably this only influences interpreters started
    # afterwards (e.g. worker processes), not the running kernel - confirm.
    os.environ['PYTHONHASHSEED']=str(seed)
set_seed(342)

In [9]:
train_ds=pd.read_csv('../input/spaceship-titanic/train.csv')
test_ds=pd.read_csv('../input/spaceship-titanic/test.csv')
train_ds.shape

In [117]:
test_ds.shape

In [10]:
train_ds.head()

In [11]:
train_ds.columns

In [12]:
train_ds.dtypes

In [13]:
plt.hist(train_ds.select_dtypes(include=['float64']),bins =4)

In [14]:
train_ds.isna().sum()

In [15]:
#eda -outliers (IQR)

In [16]:
def detect_outliers(df,n,features):
    """Return the index labels of rows that are IQR outliers in more than ``n`` features.

    For every column in ``features`` a value is an outlier when it lies
    below ``Q1 - 1.5*IQR`` or above ``Q3 + 1.5*IQR``. Quartiles ignore
    missing values, and a missing value is never flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    n : int
        A row is returned only when it is an outlier in strictly more than
        ``n`` of the inspected columns.
    features : iterable of str
        Names of the numeric columns to inspect.

    Returns
    -------
    list
        Index labels of the offending rows (empty when ``features`` is empty).
    """
    # One tally for ALL columns: it must live outside the loop, otherwise a
    # row can never be counted for more than one feature.
    outlier_counters=Counter()
    for cols in features:
        # nanpercentile: plain percentile returns NaN as soon as the column
        # contains a missing value, which would silently flag nothing.
        q1=np.nanpercentile(df[cols],25)
        q3=np.nanpercentile(df[cols],75)
        iqr_step=1.5*(q3-q1)

        outlier_list_cols=df[(df[cols]<q1 - iqr_step) | (df[cols]>q3 + iqr_step)].index
        outlier_counters.update(outlier_list_cols)
    return [k for k,v in outlier_counters.items() if v>n]
    

In [17]:
# Ask for rows that are IQR outliers in more than 2 of the five spending columns.
# NOTE(review): the result is only displayed in the next cell; no visible cell
# uses it to drop or cap rows.
outliers= detect_outliers(train_ds,2,['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])

In [18]:
outliers

In [19]:
train_ds.head()

In [20]:
#EDA HomePlanet:
# Passenger counts per home planet, split by outcome. A count plot is used
# because a bar plot with the row position as `y` would only show the mean
# row number of each group, which carries no meaning.
sns.catplot(x='HomePlanet',kind='count',hue='Transported',data=train_ds)

# Mean of each feature per home planet, split by outcome (one figure each).
for feature in ['CryoSleep','Age','VIP',
                'RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']:
    sns.catplot(x='HomePlanet',y=feature,kind='bar',hue='Transported',data=train_ds)

In [21]:
# Histogram with a KDE overlay, normalised to a density.
# `histplot` replaces the deprecated `sns.distplot`.
sns.histplot(train_ds.RoomService,kde=True,stat='density')

In [22]:
# Histogram with a KDE overlay, normalised to a density.
# `histplot` replaces the deprecated `sns.distplot`.
sns.histplot(train_ds.ShoppingMall,kde=True,stat='density')


In [23]:
# Histogram with a KDE overlay, normalised to a density.
# `histplot` replaces the deprecated `sns.distplot`.
sns.histplot(train_ds.Age,kde=True,stat='density')



In [24]:
train_ds.HomePlanet.mode()[0]

In [25]:
def fill_na(df,feature):
    """Fill missing values of the given columns with each column's mode, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the listed columns are overwritten on this object.
    feature : iterable of str
        Names of the columns to impute. A column with no non-missing value
        has no mode and is left unchanged.

    Returns
    -------
    None
    """
    for i in feature:
        modes=df[i].mode()
        if modes.empty:
            # Entirely missing column: nothing sensible to fill with.
            continue
        # Assign the filled column back instead of calling
        # `df[i].fillna(..., inplace=True)`: that form operates on a
        # temporary Series and does not update `df` under Copy-on-Write.
        df[i]=df[i].fillna(modes.iloc[0])
# Mode-impute every listed column of the training frame. `Age` is not in the
# list; it is filled with the median in the next cell.
fill_na(train_ds,['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'])

In [26]:
# Median-impute Age. The filled column is assigned back because
# `train_ds.Age.fillna(..., inplace=True)` acts on a temporary Series and
# does not update the frame under pandas Copy-on-Write.
train_ds['Age']=train_ds['Age'].fillna(train_ds['Age'].median())

In [27]:
train_ds.isna().sum()

In [28]:
# NOTE(review): DataFrame.value_counts() counts distinct full rows across all
# columns. Presumably every row is unique here (PassengerId), so this is
# unlikely to be informative - per-column counts may be what was intended.
train_ds.value_counts()

In [29]:
train_ds.describe()

In [30]:
sns.heatmap(train_ds[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].corr(),annot=True,cbar=True)

In [31]:
unique_values=[train_ds[x].unique() for x in train_ds.columns]
unique_values

In [32]:
train_ds.columns

In [33]:
def label_map(df,columns):
    """Recode the given columns of ``df`` to 0/1 flags, in place.

    A value becomes 1 when it compares equal to ``True`` and 0 in every
    other case (``False``, missing values, any other value).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten.
    columns : iterable of str
        Names of the columns to recode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    def _as_flag(value):
        if value==True:
            return 1
        return 0

    for name in columns:
        df[name]=df[name].map(_as_flag)
    return df
label_map(train_ds,['CryoSleep','VIP','Transported'])

In [34]:
#eda cabin
# Keep only the last '/'-separated component of the cabin string
# (e.g. 'a/b/c' -> 'c'). Taking the last element rather than index 2 makes
# the cell safe to re-run: an already-reduced value maps to itself instead
# of raising IndexError.
train_ds['Cabin']=train_ds['Cabin'].astype(str).str.split('/').str[-1]

In [35]:
# Remove the `Name` column from the training frame (modifies it in place).
train_ds.drop(columns='Name',inplace=True)

In [36]:
train_ds.head()

In [40]:
# test data 
test_ds.head()

In [41]:
# `travel_group` is the second '_'-separated component of PassengerId.
# It is derived for BOTH frames: no other visible cell creates it on the
# training frame, and a feature present only in the test rows becomes
# all-zero dummy columns for every training row after the later
# concat + get_dummies step.
for frame in (train_ds,test_ds):
    frame['travel_group']=frame['PassengerId'].astype(str).str.split('_').str[1]

In [42]:
# Remove the `Name` column from the test frame (modifies it in place).
test_ds.drop(columns='Name',inplace=True)

In [43]:
fill_na(test_ds,['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])

In [44]:
# Median-impute Age on the test frame. The filled column is assigned back
# because `test_ds.Age.fillna(..., inplace=True)` acts on a temporary Series
# and does not update the frame under pandas Copy-on-Write.
test_ds['Age']=test_ds['Age'].fillna(test_ds['Age'].median())

In [45]:
test_ds.isna().sum()

In [46]:
label_map(test_ds,['CryoSleep','VIP'])

In [47]:
# Keep only the last '/'-separated component of the cabin string
# (e.g. 'a/b/c' -> 'c'). Using the last element rather than index 2 makes
# the cell idempotent: re-running it no longer raises IndexError.
test_ds['Cabin']=test_ds['Cabin'].astype(str).str.split('/').str[-1]

In [48]:
test_ds.head()

In [49]:
train_ds.head()

In [50]:
# Split the target off the training frame: `pop` returns the column and
# removes it from `train_ds` in one step.
y=train_ds.pop('Transported')

In [51]:
train_ds.head()

In [52]:
test_ds.head()

In [53]:
train_len=len(train_ds)
train_len

In [54]:
#onehot Encode
# Stack the training rows on top of the test rows so both are encoded with
# the same dummy columns. The original row indexes are kept (no
# ignore_index), which is why later cells split the frame by position.
data=pd.concat([train_ds,test_ds],axis=0)


In [55]:
data_preprocessed=pd.get_dummies(data,columns=['HomePlanet','Cabin','Destination','travel_group'])

In [56]:
# Undo the stacking by position: the first `train_len` rows are the
# training data, everything after them is the test data.
train_df=data_preprocessed.iloc[:train_len]
test_df=data_preprocessed.iloc[train_len:]

In [57]:
train_df.shape

In [58]:
test_df.shape

In [59]:
train_df.head()

In [60]:
test_df.head()

In [61]:
y

In [62]:
plt.rcParams["figure.figsize"] = (20,10)
sns.heatmap(train_df.corr(),annot=True)

In [63]:
from sklearn.preprocessing import MinMaxScaler
# Learn each column's min/max from the training rows only.
scaler=MinMaxScaler()
train_scaled=scaler.fit_transform(train_df)
train_scaled=pd.DataFrame(train_scaled,columns=train_df.columns)

# Apply the ranges learned on the training data. Re-fitting on the test
# rows would scale the two sets differently, so a model trained on
# `train_scaled` would see shifted feature values at prediction time.
test_scaled=scaler.transform(test_df)
test_scaled=pd.DataFrame(test_scaled,columns=test_df.columns)


In [64]:
# The identifier is not a feature: remove it from both scaled frames (in place).
for scaled_frame in (train_scaled,test_scaled):
    scaled_frame.drop(columns='PassengerId',inplace=True)

In [65]:
test_scaled.head()

In [66]:
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,GridSearchCV
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,VotingClassifier
import xgboost as xgb
from catboost import CatBoostClassifier

In [67]:
# NOTE(review): this rebinds the name `random`, shadowing the stdlib
# `random` module imported in the first cell. From here on `random` is the
# integer passed as `random_state` to the estimators below.
random=123
# Number of leading rows used for fitting; the remaining rows form the
# hold-out split. Presumably about 80% of the training rows - TODO confirm
# against `train_len`.
train_split=6954

In [68]:
# Positional hold-out split: the first `train_split` rows are used for
# fitting, the remaining rows for evaluation (no shuffling).
train,test=train_scaled.iloc[:train_split],train_scaled.iloc[train_split:]
y_train,y_test=y.iloc[:train_split],y.iloc[train_split:]


In [69]:
# Shapes of the fitting split and the hold-out split, features before labels.
for part in (train,y_train,test,y_test):
    print(part.shape)

In [70]:
# Fix the shuffle seed (`random` is the integer seed set in an earlier cell)
# so every model and grid search is scored on the same 10 folds and the
# scores are repeatable from run to run.
cv=StratifiedKFold(n_splits=10,shuffle=True,random_state=random)

In [71]:
# Candidate models at their default hyper-parameters; every estimator that
# accepts a seed gets the shared `random` value.
classifier=[
    SGDClassifier(random_state=random),
    LogisticRegression(random_state=random),
    KNeighborsClassifier(),
    SVC(random_state=random),
    DecisionTreeClassifier(random_state=random),
    GaussianNB(),
    MLPClassifier(random_state=random),
    RandomForestClassifier(random_state=random),
    GradientBoostingClassifier(random_state=random),
    AdaBoostClassifier(random_state=random),
    xgb.XGBClassifier(random_state=random),
    CatBoostClassifier(random_state=random),
]

# One array of per-fold accuracies per model, in the same order as `classifier`.
cv_results=[
    cross_val_score(model,train,y=y_train,scoring='accuracy',cv=cv,n_jobs=4)
    for model in classifier
]
    





In [122]:
# Mean accuracy over the CV folds for each model, in the order of `classifier`.
cv_mean=[scores.mean() for scores in cv_results]
# Model names are read from the estimators themselves so the labels cannot
# drift out of step with the `classifier` list.
cv_df=pd.DataFrame({'Model':[type(model).__name__ for model in classifier],
                   'score':cv_mean})

In [123]:
sns.catplot(x='score',y='Model',kind='bar',data=cv_df)

In [93]:
# Grid search over a small gradient-boosting grid, scored by accuracy on the
# shared CV splitter.
GBC=GradientBoostingClassifier(random_state=random)
# `loss` is left at its default: the literal "deviance" has been removed
# from recent scikit-learn releases, and the default is that same loss under
# its current name, so omitting the key works on old and new versions alike.
params={'n_estimators' : [200,250],
              'learning_rate': [ 0.05, 0.0001],
              'max_depth': [4],
              'min_samples_leaf': [150,300],
              'max_features': [0.3,0.06]
              }
gbcgscv=GridSearchCV(GBC,param_grid = params, cv=cv, scoring="accuracy", n_jobs= 4, verbose = 1)

gbcgscv.fit(train,y_train)

print('best score:', gbcgscv.best_score_)
print('best params:', gbcgscv.best_estimator_)

In [91]:
# Seeded like the other estimators so the search is repeatable.
cbc=CatBoostClassifier(random_state=random)
# CatBoost accepts tree depths up to 16 only, so the grid stays inside that
# limit; larger values make every fit for that candidate fail and yield no
# usable score.
param={'iterations':[500,800],'learning_rate':[0.05,0.001],
                             'depth':[6,8,10],
                             'eval_metric':['CrossEntropy','Logloss']
                             }
cbcgscv=GridSearchCV(cbc,param_grid=param,cv=cv,scoring='accuracy',n_jobs=5)
cbcgscv.fit(train,y_train)


In [94]:
print('best score:', cbcgscv.best_score_)
print('best params:',cbcgscv.best_estimator_)

In [98]:
# Soft-voting ensemble (voting='soft') of the best estimator from each of the
# two grid searches, fitted on the training split.
votingc=VotingClassifier(estimators=[('gbc',gbcgscv.best_estimator_),('cbc',cbcgscv.best_estimator_)],voting='soft')
votingc.fit(train,y_train)

In [99]:
y_pred=votingc.predict(test)

In [100]:
from sklearn import metrics
metrics.confusion_matrix(y_test,y_pred)

In [101]:
print(metrics.classification_report(y_test,y_pred))

In [82]:
# NOTE(review): same statement as the previous cell, with an older execution
# count (In [82]); it looks like a leftover and is a candidate for removal.
print(metrics.classification_report(y_test,y_pred))

In [102]:
# ROC AUC is a ranking metric, so score it on the predicted probability of
# the positive class. Hard 0/1 predictions collapse the curve to a single
# operating point and understate the AUC.
metrics.roc_auc_score(y_test,votingc.predict_proba(test)[:,1])

In [103]:
# ROC curve on the hold-out split for `votingc`, the fitted ensemble that the
# preceding evaluation cells score. (`gbc` is not defined by any visible
# cell, so using it fails on a fresh kernel.)
t_prob=votingc.predict_proba(test)
# Column 1 holds the probability of the positive class.
fpr,tpr,_=metrics.roc_curve(y_test,t_prob[:,1])
roc_auc=metrics.auc(fpr,tpr)
roc_ds=pd.DataFrame({'fpr':fpr,'tpr':tpr})
ax = roc_ds.plot.line(x='fpr', y='tpr', title='ROC Curve', legend=False, marker = '.')
# Diagonal = performance of a random classifier, for reference.
plt.plot([0, 1], [0, 1], '--')
ax.set_xlabel("False Postive Rate (FPR)")
ax.set_ylabel("True Positive Rate (TPR)")
plt.show();

In [116]:
predictions=votingc.predict(test_scaled)
len(predictions)

In [114]:
# Submission table: one row per test passenger, with the 0/1 prediction
# rendered as the strings 'True' / 'False'.
submission=pd.DataFrame({'PassengerId':test_ds.PassengerId,'Transported':predictions})
submission['Transported']=submission['Transported'].map(lambda label: 'True' if label==1 else 'False')

In [115]:
submission

In [118]:
submission.to_csv('submission.csv',index=False)