In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
sns.set_style('darkgrid')

In [4]:
df_train = pd.read_csv('train.csv')

In [5]:
train = df_train.copy()

## EDA

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [7]:
train.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

#### Get Categorical Values

In [8]:
print('Categorical Values:')
print('')
for col in train.select_dtypes(include=['object']).columns:
    print(col)

Categorical Values:

PassengerId
HomePlanet
CryoSleep
Cabin
Destination
VIP
Name


In [9]:
categorical_vars = list(train.select_dtypes(include=['object']).columns)
categorical_vars

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Cabin',
 'Destination',
 'VIP',
 'Name']

#### Replace categorical values with most common value
    set column equal to itself transformed
    call fillnam() to fill NaN values with...
    call the column itself.mode() - this returns a series with one item
    use index [0] to grab that item as str

In [10]:

#for cat_col in categorical_vars:
#    if cat_col != 'Name':
#        df[cat_col] = df[cat_col].fillna(df[cat_col].mode()[0])
    

#### Get Numerical Values

In [11]:
print('Numerical Values:')
print('')
for col in train._get_numeric_data().columns:
    print(col)

Numerical Values:

Age
RoomService
FoodCourt
ShoppingMall
Spa
VRDeck
Transported


In [12]:
numerical_vars = list(train._get_numeric_data().columns)
numerical_vars

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Transported']

#### Replace NaN in numerical with mean
    doesnt need index - mean() returns the ave as a float

In [13]:
#for num_col in numerical_vars:
#    df[num_col] = df[num_col].fillna(df[num_col].mean())

In [14]:
train['Age'].mean()

28.82793046746535

#### Show missing values

In [15]:
for col in train.columns:
    print(col, train[col].isnull().sum())

PassengerId 0
HomePlanet 201
CryoSleep 217
Cabin 199
Destination 182
Age 179
VIP 203
RoomService 181
FoodCourt 183
ShoppingMall 208
Spa 183
VRDeck 188
Name 200
Transported 0


#### Show missing values as a percentage of rows

In [16]:
for col in train.columns:
    print(col, train[col].isnull().sum()/len(train))

PassengerId 0.0
HomePlanet 0.023122052225928908
CryoSleep 0.02496261359714713
Cabin 0.02289198205452663
Destination 0.02093638559760727
Age 0.020591280340503854
VIP 0.023352122397331185
RoomService 0.02082135051190613
FoodCourt 0.021051420683308408
ShoppingMall 0.02392729782583688
Spa 0.021051420683308408
VRDeck 0.021626596111814105
Name 0.023007017140227768
Transported 0.0


In [17]:
len(train)

8693

In [18]:
train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


#### describe for categorical data, O=object?

In [19]:
train.describe(include=['O'])

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,VIP,Name
count,8693,8492,8476,8494,8511,8490,8493
unique,8693,3,2,6560,3,2,8473
top,0001_01,Earth,False,G/734/S,TRAPPIST-1e,False,Gollux Reedall
freq,1,4602,5439,8,5915,8291,2


## Preprocessing, Cleaning, Feat Engineering

In [20]:
def preprocessing(df):
    
    #Fill HomePlanet NaN values with 'Missing'
    df['HomePlanet'].fillna('Missing',inplace=True)
    
    #Fill CryoSleep NaN values with 'Missing'
    df['CryoSleep'].fillna('Missing',inplace=True)
    
    #Fill Cabin Missing values:
    df['Cabin'].fillna('Missing/Missing/Missing',inplace=True)
    #Separate Deck from Cabin column:
    df['Deck'] = df['Cabin'].apply(lambda x: x.split('/')[0])
    #Separate Side from Cabin column:
    df['Side'] = df['Cabin'].apply(lambda x: x.split('/')[2])
    #Drop Cabin column
    df = df.drop('Cabin',axis=1)
    
    #Fill Destination Missing values:
    df['Destination'].fillna('Missing',inplace=True)
    
    #Impute age w average age
    df['Age'].fillna(df['Age'].mean(),inplace=True)
    
    #Fill VIP missing values with  Missing:
    df['VIP'].fillna('Missing',inplace=True)
    
    #Fill in 0 for Missing info of monetary columns:
    for cat in ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']:
        df[cat].fillna(0,inplace=True)
            
    #drop Name columns:
    df = df.drop('Name',axis=1)
    
    #drop PassengerId:
    df = df.drop('PassengerId',axis=1)
    
    #get dummies for categorical columns:
    cat_list = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP','Deck','Side']
    df = pd.get_dummies(data=df,columns=cat_list,drop_first=True)
    
    return df

In [21]:
abt = preprocessing(train)

In [22]:
abt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        8693 non-null   float64
 1   RoomService                8693 non-null   float64
 2   FoodCourt                  8693 non-null   float64
 3   ShoppingMall               8693 non-null   float64
 4   Spa                        8693 non-null   float64
 5   VRDeck                     8693 non-null   float64
 6   Transported                8693 non-null   bool   
 7   HomePlanet_Europa          8693 non-null   uint8  
 8   HomePlanet_Mars            8693 non-null   uint8  
 9   HomePlanet_Missing         8693 non-null   uint8  
 10  CryoSleep_True             8693 non-null   uint8  
 11  CryoSleep_Missing          8693 non-null   uint8  
 12  Destination_Missing        8693 non-null   uint8  
 13  Destination_PSO J318.5-22  8693 non-null   uint8

## Train Test Split and Modeling

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [24]:
X = abt.drop('Transported',axis=1)
y = abt['Transported']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [27]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [28]:
svc_model = SVC(probability=True)
svc_model.fit(X_train,y_train)

svc_predictions = svc_model.predict(X_test)
print('SVC Results')
print('Accuracy:')
print(accuracy_score(y_test,svc_predictions))
print('Precision:')
print(precision_score(y_test,svc_predictions))
print('Recall:')
print(recall_score(y_test,svc_predictions))
print('')
print('')
lr_model = LogisticRegression(max_iter=1000,warm_start=True)
lr_model.fit(X_train,y_train)

lr_predictions = lr_model.predict(X_test)
print('LogReg Results')
print('Accuracy:')
print(accuracy_score(y_test,lr_predictions))
print('Precision:')
print(precision_score(y_test,lr_predictions))
print('Recall:')
print(recall_score(y_test,lr_predictions))

SVC Results
Accuracy:
0.7956288343558282
Precision:
0.7509702457956016
Recall:
0.8869365928189458


LogReg Results
Accuracy:
0.7967791411042945
Precision:
0.7866077998528329
Recall:
0.8166539343009931


In [34]:
from sklearn.metrics import accuracy_score, precision_score,recall_score,confusion_matrix

In [30]:
sgd = SGDClassifier(loss='modified_huber',penalty='l1')

In [31]:
sgd.fit(X_train,y_train)

SGDClassifier(loss='modified_huber', penalty='l1')

In [32]:
sgd_predictions = sgd.predict(X_test)

In [35]:
acc = accuracy_score(y_test,sgd_predictions)
pre = precision_score(y_test,sgd_predictions)
rec = recall_score(y_test,sgd_predictions)
print('Results: \n')
print(f'Accuracy: {acc}')
print(f'Precision: {pre}')
print(f'Recall: {rec}')
print('\n')
print('Confusion Matrix:')
print(confusion_matrix(sgd_predictions,y_test))

Results: 

Accuracy: 0.7664877300613497
Precision: 0.7684049079754601
Recall: 0.7654698242933538


Confusion Matrix:
[[ 997  307]
 [ 302 1002]]


In [36]:
print(confusion_matrix(sgd_predictions,y_test))

[[ 997  307]
 [ 302 1002]]


### SGDClassifier
    X_train.shape = (6085, 26) so index [0] = 6085
    

In [37]:
max_iter = np.ceil(10**6/X_train.shape[0])

clf = make_pipeline(StandardScaler(),
                   SGDClassifier(max_iter=max_iter,tol=1e-3))

clf.fit(X_train,y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdclassifier', SGDClassifier(max_iter=165.0))])

In [39]:
y_pred = clf.predict(X_test)

In [40]:
from sklearn.metrics import classification_report, confusion_matrix

In [41]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.77      0.77      0.77      1299
        True       0.77      0.77      0.77      1309

    accuracy                           0.77      2608
   macro avg       0.77      0.77      0.77      2608
weighted avg       0.77      0.77      0.77      2608



In [42]:
print(confusion_matrix(y_test,y_pred))

[[ 994  305]
 [ 298 1011]]


In [43]:
y_pred

array([ True, False,  True, ..., False,  True, False])

In [44]:
print('acc:')
print(accuracy_score(y_test,y_pred)) 
print('prec:')
print(precision_score(y_test,y_pred))
print('rec:')
print(recall_score(y_test,y_pred))

acc:
0.7687883435582822
prec:
0.7682370820668692
rec:
0.7723453017570665


In [45]:
abt.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_Missing,...,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_Missing,Deck_T,Side_P,Side_S
0,39.0,0.0,0.0,0.0,0.0,0.0,False,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,24.0,109.0,9.0,25.0,549.0,44.0,True,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,58.0,43.0,3576.0,0.0,6715.0,49.0,False,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,33.0,0.0,1283.0,371.0,3329.0,193.0,False,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,16.0,303.0,70.0,151.0,565.0,2.0,True,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [46]:
X = abt.drop('Transported',axis =1)
y = abt['Transported']

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [48]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
rfc = RandomForestClassifier(n_estimators=1000)

In [50]:
rfc.fit(X_train,y_train)

RandomForestClassifier(n_estimators=1000)

In [51]:
rfc_pred = rfc.predict(X_test)

In [52]:
from sklearn.metrics import confusion_matrix,classification_report

In [53]:
confusion_matrix(y_test,rfc_pred)

array([[1128,  296],
       [ 322, 1123]], dtype=int64)

In [54]:
print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

       False       0.78      0.79      0.78      1424
        True       0.79      0.78      0.78      1445

    accuracy                           0.78      2869
   macro avg       0.78      0.78      0.78      2869
weighted avg       0.78      0.78      0.78      2869

