In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
# Load the training and test CSV files from the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Collect the distinct values of every categorical column in both datasets.
# This runs before any imputation, so a missing value shows up as a NaN entry.
categorical_columns = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

train_categories = {name: train_data[name].unique() for name in categorical_columns}
test_categories = {name: test_data[name].unique() for name in categorical_columns}

In [4]:
# Summarise the column dtypes and non-null counts of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
# Count the null entries in each column of the training data.
missing_values_count = train_data.isna().sum()
missing_values_count

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
# Impute missing values in both datasets.
#
# Categorical gaps become an explicit "Unknown" level. Numeric gaps are filled
# with the column median. The medians are computed once, from the training
# data, and reused for the test data so that both sets are imputed with the
# same statistics and preprocessing never depends on the test distribution.
categorical_columns = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
numerical_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Per-column medians of the training data (NaNs are skipped by median()).
train_medians = train_data[numerical_columns].median()

# Training data
train_data[categorical_columns] = train_data[categorical_columns].fillna("Unknown")
train_data[numerical_columns] = train_data[numerical_columns].fillna(train_medians)

# Test data: same "Unknown" level, and the *training* medians for numerics.
test_data[categorical_columns] = test_data[categorical_columns].fillna("Unknown")
test_data[numerical_columns] = test_data[numerical_columns].fillna(train_medians)

In [7]:
# Re-count nulls in the training data after imputation.
missing_values_count = train_data.isna().sum()
missing_values_count

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin             0
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
dtype: int64

In [8]:
# One-hot encode the categorical columns of both datasets as 0/1 integers.
# NOTE(review): 'Cabin' is encoded per distinct cabin string, which appears to
# account for most of the resulting columns -- consider whether that is intended.
train_encoded, test_encoded = (
    pd.get_dummies(frame, columns=categorical_columns, dtype=int)
    for frame in (train_data, test_data)
)

# Give both frames the same set of columns; a column present in only one
# frame is added to the other and filled with 0.
train_encoded, test_encoded = train_encoded.align(test_encoded, axis=1, fill_value=0)


In [9]:
# Preview the first rows of the encoded training frame.
train_encoded.head()

Unnamed: 0,Age,Cabin_A/0/P,Cabin_A/0/S,Cabin_A/1/P,Cabin_A/1/S,Cabin_A/10/P,Cabin_A/10/S,Cabin_A/100/S,Cabin_A/101/S,Cabin_A/102/S,...,Name,PassengerId,RoomService,ShoppingMall,Spa,Transported,VIP_False,VIP_True,VIP_Unknown,VRDeck
0,39.0,0,0,0,0,0,0,0,0,0,...,Maham Ofracculy,0001_01,0.0,0.0,0.0,False,1,0,0,0.0
1,24.0,0,0,0,0,0,0,0,0,0,...,Juanna Vines,0002_01,109.0,25.0,549.0,True,1,0,0,44.0
2,58.0,0,1,0,0,0,0,0,0,0,...,Altark Susent,0003_01,43.0,0.0,6715.0,False,0,1,0,49.0
3,33.0,0,1,0,0,0,0,0,0,0,...,Solam Susent,0003_02,0.0,371.0,3329.0,False,1,0,0,193.0
4,16.0,0,0,0,0,0,0,0,0,0,...,Willy Santantines,0004_01,303.0,151.0,565.0,True,1,0,0,2.0


In [10]:
# Per-column null count of the encoded training frame.
train_encoded.isna().sum()

Age            0
Cabin_A/0/P    0
Cabin_A/0/S    0
Cabin_A/1/P    0
Cabin_A/1/S    0
              ..
Transported    0
VIP_False      0
VIP_True       0
VIP_Unknown    0
VRDeck         0
Length: 9849, dtype: int64

In [11]:
# Show (rows, columns) for the encoded training and test frames.
train_encoded.shape, test_encoded.shape

((8693, 9849), (4277, 9849))

In [12]:
# Separate model inputs from the label: the identifier, name and target
# columns are removed from the features, and the target column becomes y.
X = train_encoded.drop(columns=['PassengerId', 'Transported', 'Name'])
y = train_encoded['Transported']

In [13]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Standardise the numeric features using statistics from the training split only.
numerical_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Always read the unscaled values from X (which this cell never modifies)
# instead of from X_train / X_test. Re-running the cell then refits the scaler
# on raw data rather than on already-standardised columns; otherwise `scaler`
# would become a near-identity transform and no longer match the scale the
# model was trained on when it is applied to other data later.
train_raw = X.loc[X_train.index, numerical_columns]
holdout_raw = X.loc[X_test.index, numerical_columns]

scaler = StandardScaler().fit(train_raw)

# Assign into explicit copies so the writes below cannot touch X.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numerical_columns] = scaler.transform(train_raw)
X_test[numerical_columns] = scaler.transform(holdout_raw)

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [16]:
# Candidate classifiers, left at library defaults apart from the seed.
# The estimators that use randomness during fitting are given a fixed
# random_state (the same value used for the train/validation split) so that
# refitting them yields the same model and the same predictions.
RANDOM_STATE = 42

model_1 = LogisticRegression()
model_2 = DecisionTreeClassifier(random_state=RANDOM_STATE)
model_3 = RandomForestClassifier(random_state=RANDOM_STATE)
model_4 = XGBClassifier(random_state=RANDOM_STATE)
model_5 = LGBMClassifier(random_state=RANDOM_STATE)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Compare several classifiers on the held-out validation split.
# Estimators that use randomness during fitting are seeded so the reported
# accuracies are the same on every run of this cell.
RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(random_state=RANDOM_STATE),
    "LightGBM": LGBMClassifier(random_state=RANDOM_STATE)
}

# Validation accuracy keyed by model name, filled in by the loop below.
model_accuracies = {}

for model_name, model in models.items():
    # Fit on the training split, then score predictions on the validation split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_accuracies[model_name] = accuracy_score(y_test, y_pred)

# Report every model's accuracy to four decimal places.
for model_name, accuracy in model_accuracies.items():
    print(f"{model_name}: Accuracy = {accuracy:.4f}")

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000847 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1384
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
Logistic Regression: Accuracy = 0.7711
Decision Tree: Accuracy = 0.7602
Random Forest: Accuracy = 0.7867
XGBoost: Accuracy = 0.7832
LightGBM: Accuracy = 0.7844


In [18]:
# Fit model_3 on the training split and report its accuracy on the held-out rows.
y_pred = model_3.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.78953421506613


In [20]:
# Build the feature matrix for the test set: drop the non-feature columns,
# apply the already-fitted scaler to the numeric columns, then predict with model_3.
X_test_final = test_encoded.drop(columns=['PassengerId', 'Name', 'Transported'])
X_test_final[numerical_columns] = scaler.transform(X_test_final[numerical_columns])
test_predictions = model_3.predict(X_test_final)

In [22]:
# Pair each test passenger id with its predicted label.
submission = test_data[['PassengerId']].assign(Transported=test_predictions)

# Write the two-column result to disk without the index.
submission.to_csv('submission.csv', index=False)

In [25]:
# Read 'submission.csv' back from disk and print its first rows.
# NOTE(review): `qample_submission` looks like a typo; a rename would also
# need to be applied wherever this variable is used afterwards.
qample_submission = pd.read_csv('submission.csv')
print(qample_submission.head())

  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01        False


In [26]:
# (rows, columns) of the submission file that was read back.
qample_submission.shape

(4277, 2)

In [21]:
# Load 'sample_submission.csv' and print its first rows.
sample_submission = pd.read_csv('sample_submission.csv')
print(sample_submission.head())

  PassengerId  Transported
0     0013_01        False
1     0018_01        False
2     0019_01        False
3     0021_01        False
4     0023_01        False
