In [98]:
import pandas as pd
import numpy as np
# Load the training split from a path relative to the notebook.
# A forward slash is used as the separator because Python accepts it on Windows
# as well as on Linux/macOS, whereas a backslash is only a separator on Windows.
df = pd.read_csv('spaceship-titanic-data/train.csv')
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


# Description of columns
- **train.csv** - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
  - **PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
  - **HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.
  - **CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
  - **Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
  - **Destination** - The planet the passenger will be debarking to.
  - **Age** - The age of the passenger.
  - **VIP** - Whether the passenger has paid for special VIP service during the voyage.
  - **RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
  - **Name** - The first and last names of the passenger.
  - **Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.
  
- **test.csv** - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.
- **sample_submission.csv** - A submission file in the correct format.
  - **PassengerId** - Id for each passenger in the test set.
  - **Transported** - The target. For each passenger, predict either True or False.

In [99]:
# Per-column dtype and non-null count, to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [100]:
# Partition the columns by dtype: object-typed columns vs. everything else.
# "Everything else" is whatever is not object dtype, so the boolean target
# (Transported) ends up in numeric_cols alongside the float columns.
object_frame = df.select_dtypes(include=['object'])
other_frame = df.select_dtypes(exclude=['object'])

object_cols = list(object_frame.columns)
numeric_cols = list(other_frame.columns)

print(f"Object Columns: {object_cols}")
print(f"Numeric Columns: {numeric_cols}")

Object Columns: ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']
Numeric Columns: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported']


In [101]:
# Summary statistics for the columns collected in numeric_cols.
numeric_view = df[numeric_cols]
numeric_view.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


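Okay, so it looks like all of the spend columns (RoomService, FoodCourt, ShoppingMall, Spa, VRDeck) might have some outliers: for each of them the median is 0 and the 75th percentile is below 100, while the maximum is in the tens of thousands. Age, ranging from 0 to 79, looks reasonable.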

In [102]:
# Frequency table for every object-typed column; NaN is counted as its own value.
divider = "-------------"
for name in object_cols:
    frequencies = df[name].value_counts(dropna=False)
    print(frequencies)
    print(divider)

PassengerId
0001_01    1
0002_01    1
0003_01    1
0003_02    1
0004_01    1
          ..
9276_01    1
9278_01    1
9279_01    1
9280_01    1
9280_02    1
Name: count, Length: 8693, dtype: int64
-------------
HomePlanet
Earth     4602
Europa    2131
Mars      1759
NaN        201
Name: count, dtype: int64
-------------
CryoSleep
False    5439
True     3037
NaN       217
Name: count, dtype: int64
-------------
Cabin
NaN         199
G/734/S       8
G/1368/P      7
G/109/P       7
D/176/S       7
           ... 
E/56/P        1
A/98/P        1
G/1499/S      1
G/1500/S      1
G/1489/S      1
Name: count, Length: 6561, dtype: int64
-------------
Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
NaN               182
Name: count, dtype: int64
-------------
VIP
False    8291
NaN       203
True      199
Name: count, dtype: int64
-------------
Name
NaN                  200
Troya Schwardson       2
Anton Woody            2
Apix Wala              2
Glenna Valezaley     

- PassengerId is useless as a feature since it is unique for every row
- HomePlanet, CryoSleep, Destination and VIP are straightforward to encode
- Name is interesting since we can maybe extract the last name and see if that has any significance
- Cabin is interesting too since it always takes the form deck/num/side, so we can extract deck and side

In [103]:
# Derive Last_Name from Name, and Deck / Side from Cabin (formatted deck/num/side).
name_tokens = df['Name'].str.strip().str.split()
cabin_parts = df['Cabin'].str.strip().str.split('/')

df['Last_Name'] = name_tokens.str[-1]   # last whitespace-separated token
df['Deck'] = cabin_parts.str[0]         # first '/'-separated piece
df['Side'] = cabin_parts.str[-1]        # last '/'-separated piece

# Recompute the object-column list now that the new columns have been added.
object_cols = df.select_dtypes(include=['object']).columns.tolist()
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Last_Name,Deck,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,Ofracculy,B,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,Vines,F,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,Susent,A,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,Susent,A,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,Santantines,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,Noxnuther,A,P
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,Mondalley,G,S
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,Connon,G,S
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,Hontichre,E,S


In [104]:
# Frequency table (NaN included) for each column currently listed in object_cols,
# followed by a divider line.
for column_label in object_cols:
    print(df[column_label].value_counts(dropna=False), "-------------", sep="\n")

PassengerId
0001_01    1
0002_01    1
0003_01    1
0003_02    1
0004_01    1
          ..
9276_01    1
9278_01    1
9279_01    1
9280_01    1
9280_02    1
Name: count, Length: 8693, dtype: int64
-------------
HomePlanet
Earth     4602
Europa    2131
Mars      1759
NaN        201
Name: count, dtype: int64
-------------
CryoSleep
False    5439
True     3037
NaN       217
Name: count, dtype: int64
-------------
Cabin
NaN         199
G/734/S       8
G/1368/P      7
G/109/P       7
D/176/S       7
           ... 
E/56/P        1
A/98/P        1
G/1499/S      1
G/1500/S      1
G/1489/S      1
Name: count, Length: 6561, dtype: int64
-------------
Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
NaN               182
Name: count, dtype: int64
-------------
VIP
False    8291
NaN       203
True      199
Name: count, dtype: int64
-------------
Name
NaN                  200
Troya Schwardson       2
Anton Woody            2
Apix Wala              2
Glenna Valezaley     

In [105]:
# Remove the identifier and raw text columns so only candidate training columns remain.
unused_cols = ['PassengerId', 'Name', 'Cabin']
df = df.drop(columns=unused_cols)
df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Last_Name,Deck,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,Ofracculy,B,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,Vines,F,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,Susent,A,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,Susent,A,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,Santantines,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,Noxnuther,A,P
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,Mondalley,G,S
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,Connon,G,S
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,Hontichre,E,S


For the first run I am going to ignore the spend columns (and Last_Name).

In [106]:
# First pass: leave out the five spend columns and Last_Name.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df = df.drop(columns=spend_cols + ['Last_Name'])
df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Transported,Deck,Side
0,Europa,False,TRAPPIST-1e,39.0,False,False,B,P
1,Earth,False,TRAPPIST-1e,24.0,False,True,F,S
2,Europa,False,TRAPPIST-1e,58.0,True,False,A,S
3,Europa,False,TRAPPIST-1e,33.0,False,False,A,S
4,Earth,False,TRAPPIST-1e,16.0,False,True,F,S
...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,False,A,P
8689,Earth,True,PSO J318.5-22,18.0,False,False,G,S
8690,Earth,False,TRAPPIST-1e,26.0,False,True,G,S
8691,Europa,False,55 Cancri e,32.0,False,False,E,S


In [107]:
# Number of missing values in each remaining column.
df.isna().sum()

HomePlanet     201
CryoSleep      217
Destination    182
Age            179
VIP            203
Transported      0
Deck           199
Side           199
dtype: int64

In [108]:
# First pass: simply discard every row that contains a missing value; the author
# judges that plenty of rows remain afterwards.
complete_rows = df.dropna()
df = complete_rows.copy()  # independent copy, so later column assignments are safe
df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Transported,Deck,Side
0,Europa,False,TRAPPIST-1e,39.0,False,False,B,P
1,Earth,False,TRAPPIST-1e,24.0,False,True,F,S
2,Europa,False,TRAPPIST-1e,58.0,True,False,A,S
3,Europa,False,TRAPPIST-1e,33.0,False,False,A,S
4,Earth,False,TRAPPIST-1e,16.0,False,True,F,S
...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,False,A,P
8689,Earth,True,PSO J318.5-22,18.0,False,False,G,S
8690,Earth,False,TRAPPIST-1e,26.0,False,True,G,S
8691,Europa,False,55 Cancri e,32.0,False,False,E,S


# Simple Logistic Regression

In [114]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Cast the True/False flag columns and the target to 0/1 integers.
for flag_col in ('CryoSleep', 'VIP', 'Transported'):
    df[flag_col] = df[flag_col].astype(int)

# Column groups. categorical_cols is one-hot encoded below. numeric_cols is not
# referenced by the transformer: those columns reach the model unchanged through
# remainder='passthrough'.
categorical_cols = ['HomePlanet', 'Destination', 'Deck', 'Side']
numeric_cols = ['Age', 'CryoSleep', 'VIP']

# Target vector and feature matrix.
y = df['Transported']
X = df.drop(columns=['Transported'])

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# One-hot encode the categorical columns and pass every other column through as-is.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', encoder, categorical_cols)],
    remainder='passthrough',
)

# Learn the encoding on the training split only, then apply it to both splits.
X_train_final = preprocessor.fit_transform(X_train)
X_test_final = preprocessor.transform(X_test)

# Fit the classifier; fit() returns the estimator itself.
model = LogisticRegression(max_iter=1000).fit(X_train_final, y_train)

# Hard class predictions and the predicted probability of class 1.
y_pred = model.predict(X_test_final)
y_proba = model.predict_proba(X_test_final)[:, 1]

# Evaluate on the held-out split.
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)  # computed from probabilities, not labels

# Report the results.
print(f"Accuracy: {acc}")
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", report)
print(f"ROC-AUC: {roc_auc}")


Accuracy: 0.7188118811881188
Confusion Matrix:
 [[614 161]
 [265 475]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.79      0.74       775
           1       0.75      0.64      0.69       740

    accuracy                           0.72      1515
   macro avg       0.72      0.72      0.72      1515
weighted avg       0.72      0.72      0.72      1515

ROC-AUC: 0.7688177855274629
