In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Purpose of this cell: load the Spaceship Titanic CSV, print a structural
# overview, drop incomplete rows, build a numeric-only feature matrix with
# 'Transported' as the target, create an 80/20 train/test split and
# instantiate (but not yet fit) a default KNN classifier.

# Dataset
url = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv'
df = pd.read_csv(url)

# Shape of the data
print("Shape of the DataFrame:", df.shape)

# Data types
# sep="" keeps print() from inserting its default separator space after the
# label's trailing "\n", which would otherwise indent the first row of the
# printed table by one character.
print("\nData Types of Each Column:\n", df.dtypes, sep="")

# Check for missing values (per-column count of nulls)
missing_values = df.isnull().sum()
print("\nMissing Values in Each Column:\n", missing_values, sep="")

# Remove rows containing any missing value.
# NOTE(review): dropna() with no `subset` also discards rows whose only gaps
# are in columns that are not used as features below; confirm that is intended.
df_cleaned = df.dropna()

# Checking the shape after dropping missing values
print("\nShape of the DataFrame after dropping missing values:", df_cleaned.shape)

# Display column names
print("\nColumn Names:\n", df_cleaned.columns, sep="")

# Select only numerical columns as features
numerical_features = df_cleaned.select_dtypes(include=['number'])

# Define the target variable
target_column = 'Transported'  # Ensure the column name matches
print(f"Looking for target column: '{target_column}'")

# Look the target up defensively: when the column is absent, report it and
# leave `target` as None so the modelling steps below are skipped.
if target_column in df_cleaned.columns:
    target = df_cleaned[target_column]
else:
    print(f"Column '{target_column}' not found in DataFrame.")
    target = None

# Only build the split and the model when a target is available.
if target is not None:
    # Use all numerical features. errors='ignore' makes the drop a no-op when
    # the target is not among the numeric columns, and always yields a new
    # frame instead of an alias of `numerical_features`.
    X = numerical_features.drop(columns=[target_column], errors='ignore')

    # Train-Test Split: hold out 20% of the rows; fixed seed for reproducibility
    X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

    # Model Selection - Initialize a KNN instance with scikit-learn's defaults
    knn = KNeighborsClassifier()

Shape of the DataFrame: (8693, 14)

Data Types of Each Column:
 PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

Missing Values in Each Column:
 PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

Shape of the DataFrame after dropping missing values: (6606, 14)

Column Names:
 Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transpo