<H1><b>GROUP 96: SPACESHIP TITANIC

<h3>Project by: Aryan Kheskani, Aryan Rajput, Gaelle Nehme, Gonzalo Lantero, Lama Abboud


In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [None]:
missing_values = train_data.isnull().sum() / len(train_data) * 100
missing_values 

<h1>How does applying Principal Component Analysis (PCA) to the dataset affect model performance, and what is the trade-off between the number of principal components retained and the model’s predictive accuracy?


In [3]:
train_data.drop(['PassengerId', 'Name', 'Cabin'], axis=1, inplace=True)

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

<h1>Preprocessing and basic feature engineering<h1>

In [None]:
#Handle missing values - Fill numerical columns with median, categorical with mode
for col in train_data.columns:
    if train_data[col].dtype == 'object':  # Categorical columns
        train_data[col].fillna(train_data[col].mode()[0], inplace=True)
    else:  # Numerical columns
        train_data[col].fillna(train_data[col].median(), inplace=True)

# Print dataset after missing value handling
print("\nDataset after handling missing values:")
print(train_data.head())

# Encode categorical variables
# encoder = LabelEncoder()
# for col in train_data.select_dtypes(include=['object']).columns:
#     train_data[col] = encoder.fit_transform(train_data[col])

In [None]:
# Define features (X) and target (y)
X = train_data.drop(columns=['Transported'])  # Features
y = train_data['Transported'].astype(int)  # Target (1 = Transported, 0 = Not Transported)

# Print dataset after encoding
print("\nDataset after encoding categorical variables:")
print(X.head())

In [None]:
# Split dataset into train & test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features for PCA
scaler = StandardScaler() # the scalar object is used to scale the data so that the data has mean = 0 and variance = 1
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Print shape of training and test sets
print("\nShape of training data:", X_train.shape)
print("Shape of test data:", X_test.shape)

In [None]:
from sklearn.decomposition import PCA
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

In [None]:
# Plot cumulative explained variance to determine optimal number of components
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, marker='o', linestyle='--')
plt.axhline(y=0.95, color='r', linestyle='--', label='95% Variance Threshold')
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA - Explained Variance vs Number of Components")
plt.legend()
plt.show()


In [None]:
n_components = np.argmax(explained_variance_ratio >= 0.95) + 1
print(f"\nOptimal number of components to retain 95% variance: {n_components}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

# Apply PCA with optimal components
pca_optimal = PCA(n_components=n_components)
X_train_pca_optimal = pca_optimal.fit_transform(X_train_scaled)
X_test_pca_optimal = pca_optimal.transform(X_test_scaled)

# Train a Random Forest Classifier before and after PCA
rf_before = RandomForestClassifier(n_estimators=100, random_state=42)
rf_before.fit(X_train, y_train)

rf_after = RandomForestClassifier(n_estimators=100, random_state=42)
rf_after.fit(X_train_pca_optimal, y_train)

# Predictions
y_pred_before = rf_before.predict(X_test)
y_pred_after = rf_after.predict(X_test_pca_optimal)

# Evaluate model performance
metrics = {
    "Accuracy": [accuracy_score(y_test, y_pred_before), accuracy_score(y_test, y_pred_after)],
    "ROC AUC": [roc_auc_score(y_test, y_pred_before), roc_auc_score(y_test, y_pred_after)],
    "F1 Score": [f1_score(y_test, y_pred_before), f1_score(y_test, y_pred_after)]
}

# Convert to DataFrame
metrics_df = pd.DataFrame(metrics, index=["Before PCA", "After PCA"])

# Print performance metrics
print("\nModel Performance Comparison:")
print(metrics_df)

<h1>apply advanced feature engineering<h1>

<h3>Apply one-hot encoding<h3>

In [5]:
# Identify categorical columns
categorical_cols = train_data.select_dtypes(include=['object']).columns
print("\nCategorical columns:")
print(categorical_cols)

# Identify numerical columns
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns
print("\nNumerical columns:")
print(numerical_cols)



Categorical columns:
Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP'], dtype='object')

Numerical columns:
Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')


<h3>handle missing values<h3>

In [6]:
# Fill missing values for categorical features with the mode
for col in categorical_cols:
    train_data[col].fillna(train_data[col].mode()[0], inplace=True)
    test_data[col].fillna(test_data[col].mode()[0], inplace=True)

for col in numerical_cols:
    train_data[col].fillna(train_data[col].median(), inplace=True)
    test_data[col].fillna(test_data[col].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[col].fillna(train_data[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[col].fillna(test_data[col].mode()[0], inplace=True)
  train_data[col].fillna(train_data[col].mode()[0], inplace=True)
  test_data[col].fillna(test_data[col].m

In [9]:
from sklearn.preprocessing import OneHotEncoder

# initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

#fit and transform the data
encoded_train = encoder.fit_transform(train_data[categorical_cols])
encoded_test = encoder.transform(test_data[categorical_cols])

# Convert to DataFrame
encoded_train_df = pd.DataFrame(encoded_train, columns=encoder.get_feature_names_out(categorical_cols))
encoded_test_df = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out(categorical_cols))

# Drop original categorical columns and merge encoded features
train_df = train_data.drop(columns=categorical_cols).reset_index(drop=True)
test_df = test_data.drop(columns=categorical_cols).reset_index(drop=True)

train_df = pd.concat([train_df, encoded_train_df], axis=1)
test_df = pd.concat([test_df, encoded_test_df], axis=1)

# Print dataset after encoding
print("\nDataset after encoding categorical variables:")
print(train_df.head())


Dataset after encoding categorical variables:
    Age  RoomService  FoodCourt  ShoppingMall     Spa  VRDeck  Transported  \
0  39.0          0.0        0.0           0.0     0.0     0.0        False   
1  24.0        109.0        9.0          25.0   549.0    44.0         True   
2  58.0         43.0     3576.0           0.0  6715.0    49.0        False   
3  33.0          0.0     1283.0         371.0  3329.0   193.0        False   
4  16.0        303.0       70.0         151.0   565.0     2.0         True   

   HomePlanet_Earth  HomePlanet_Europa  HomePlanet_Mars  CryoSleep_False  \
0               0.0                1.0              0.0              1.0   
1               1.0                0.0              0.0              1.0   
2               0.0                1.0              0.0              1.0   
3               0.0                1.0              0.0              1.0   
4               1.0                0.0              0.0              1.0   

   CryoSleep_True  Destinat

<h3>Apply scaling<h3>

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Initialize scalers
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()


# Apply Standardization
X_standardized = train_df.copy()
X_standardized[numerical_cols] = standard_scaler.fit_transform(train_df[numerical_cols])

# Apply Normalization
X_normalized = train_df.copy()
X_normalized[numerical_cols] = minmax_scaler.fit_transform(train_df[numerical_cols])

# Print dataset after scaling
print("\nDataset after scaling:")
print(X_standardized.head())



Dataset after scaling:
        Age  RoomService  FoodCourt  ShoppingMall       Spa    VRDeck  \
0  0.711945    -0.333105  -0.281027     -0.283579 -0.270626 -0.263003   
1 -0.334037    -0.168073  -0.275387     -0.241771  0.217158 -0.224205   
2  2.036857    -0.268001   1.959998     -0.283579  5.695623 -0.219796   
3  0.293552    -0.333105   0.523010      0.336851  2.687176 -0.092818   
4 -0.891895     0.125652  -0.237159     -0.031059  0.231374 -0.261240   

   Transported  HomePlanet_Earth  HomePlanet_Europa  HomePlanet_Mars  \
0        False               0.0                1.0              0.0   
1         True               1.0                0.0              0.0   
2        False               0.0                1.0              0.0   
3        False               0.0                1.0              0.0   
4         True               1.0                0.0              0.0   

   CryoSleep_False  CryoSleep_True  Destination_55 Cancri e  \
0              1.0             0.0       

<h1>testing on different modelling approaches<h1>

In [11]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

#target variable = what we are trying to predict
target = 'Transported'
X = train_df.drop(columns=[target])
y = train_df[target]

# Split dataset into train & test sets (80/20 split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier()
}

# Train & evaluate each model
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    results.append({"Model": name, "Validation Accuracy": accuracy})

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print("\nModel Performance on Validation Set:")
print(results_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Model Performance on Validation Set:
                    Model  Validation Accuracy
0     Logistic Regression             0.772858
1           Random Forest             0.772858
2  Support Vector Machine             0.774008
3     K-Nearest Neighbors             0.773433
4             Naive Bayes             0.733755
5       Gradient Boosting             0.780909
