<a href="https://colab.research.google.com/github/Debayan2004/CODSOFT/blob/main/SpaceshipTitanicPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install kaggle

# Make a directory for the Kaggle API key
!mkdir -p ~/.kaggle

# Copy the kaggle.json file to the correct location
!cp kaggle.json ~/.kaggle/

# Set permissions for the key
!chmod 600 ~/.kaggle/kaggle.json




In [None]:
# Fetch the Spaceship Titanic competition archive with the Kaggle CLI.
# Needs the API key installed under ~/.kaggle/ beforehand.
!kaggle competitions download -c spaceship-titanic

Downloading spaceship-titanic.zip to /content
100% 299k/299k [00:00<00:00, 557kB/s]
100% 299k/299k [00:00<00:00, 557kB/s]


In [None]:
import zipfile
import os

def unzip_file(zip_path, extract_to='.'):
    """
    Extract every member of a zip archive into a target directory.

    Parameters:
    zip_path (str): Location of the zip archive to read.
    extract_to (str): Destination directory. Defaults to the current directory.

    Returns:
    None. A confirmation line is printed once extraction has finished.
    """
    archive = zipfile.ZipFile(zip_path, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Always release the file handle, even if extraction fails
        archive.close()
    print(f"Extracted all files to {extract_to}")

# Unpack the downloaded competition archive into /content/
zip_path = '/content/spaceship-titanic.zip'
extract_to = '/content/'

unzip_file(zip_path=zip_path, extract_to=extract_to)


Extracted all files to /content/


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import VotingClassifier

In [None]:
# Read the three competition CSV files from /content into DataFrames
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/train.csv',
        '/content/test.csv',
        '/content/sample_submission.csv',
    )
)

In [None]:
def preprocess_data(df, encoders=None):
    """
    Turn a raw passenger frame into an all-numeric feature frame.

    Steps: fill missing values, split 'Cabin' ("deck/num/side") into three
    columns, add two interaction features, drop 'Name', 'Cabin' and
    'PassengerId', and integer-encode the categorical columns.

    Parameters:
    df (pd.DataFrame): Raw data containing at least 'Age', 'CryoSleep', 'VIP',
        'HomePlanet', 'Destination', 'Cabin', 'RoomService', 'FoodCourt',
        'Name' and 'PassengerId'. The frame is NOT modified; a processed copy
        is returned.
    encoders (dict | None): Optional mapping ``column -> ordered list of
        categories`` shared between calls. Columns missing from the mapping
        are learned from ``df`` (sorted unique values) and stored in it;
        columns already present reuse the stored categories, so the same
        label gets the same integer code in every frame. Values absent from
        the stored categories are encoded as -1. With the default ``None``
        every call derives its codes from its own frame only, which means two
        frames get matching codes only if they contain the same set of labels.
        Typical use: ``enc = {}; preprocess_data(train, enc);
        preprocess_data(test, enc)``.

    Returns:
    pd.DataFrame: Processed copy of ``df``. 'CryoSleep' and 'VIP' are 0/1,
        'CabinNum' is numeric, the other categorical columns hold integer
        codes.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    # Fill missing values. The result is assigned back instead of calling
    # fillna(inplace=True) on a column selection: that chained in-place form
    # operates on a temporary under pandas Copy-on-Write and leaves df unchanged.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    for flag_col in ('CryoSleep', 'VIP'):
        # The nullable boolean dtype lets missing flags be filled with False
        # without relying on object-dtype downcasting.
        df[flag_col] = df[flag_col].astype('boolean').fillna(False).astype(bool)
    df['HomePlanet'] = df['HomePlanet'].fillna('Earth')
    df['Destination'] = df['Destination'].fillna('TRAPPIST-1e')

    # Feature engineering: missing cabins use a placeholder that keeps the
    # "deck/num/side" shape, so every row can be split the same way.
    cabin_parts = df['Cabin'].fillna('Unknown/0/Unknown').astype(str).str.split('/')
    df['CabinDeck'] = cabin_parts.str[0]
    # Store the cabin number as a real number rather than a numeric string;
    # unparsable values become NaN.
    df['CabinNum'] = pd.to_numeric(cabin_parts.str[1], errors='coerce')
    df['CabinSide'] = cabin_parts.str[2]

    # Interaction features
    df['Age*VIP'] = df['Age'] * df['VIP']
    df['RoomService*FoodCourt'] = df['RoomService'] * df['FoodCourt']

    # Drop identifier / already-decomposed columns
    df = df.drop(columns=['Name', 'Cabin', 'PassengerId'])

    # Encode boolean flags as 0/1 independent of which values are present
    for flag_col in ('CryoSleep', 'VIP'):
        df[flag_col] = df[flag_col].astype('int64')

    # Encode string categories as integer codes (position in the sorted
    # category list), optionally sharing the category lists across calls.
    for col in ('HomePlanet', 'Destination', 'CabinDeck', 'CabinSide'):
        if encoders is not None and col in encoders:
            categories = list(encoders[col])
        else:
            categories = sorted(df[col].dropna().unique())
            if encoders is not None:
                encoders[col] = categories
        df[col] = pd.Categorical(df[col], categories=categories).codes.astype('int64')

    return df

In [None]:
# Run the shared preprocessing on copies of both raw frames
train_processed, test_processed = (
    preprocess_data(raw_frame.copy()) for raw_frame in (train, test)
)

In [None]:
# Pull out the target column, then keep every other column as features
y = train_processed['Transported']
X = train_processed.drop('Transported', axis=1)

In [None]:
# Hold out 20% of the training data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute remaining missing values with the column means learned on X_train
imputer = SimpleImputer(strategy='mean')  # or other strategies like 'median', 'most_frequent'
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

# Scale the features with statistics learned on X_train
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Transform the test set from its untransformed DataFrame. This cell rebinds
# `test_processed` to the transformed array, so the original frame is kept
# under its own name: re-running the cell then starts from the raw features
# again instead of imputing/scaling data that was already scaled.
if isinstance(test_processed, pd.DataFrame):
    test_features = test_processed
test_processed = scaler.transform(imputer.transform(test_features))


In [None]:
# Randomized hyperparameter search for the gradient-boosting classifier
param_dist = dict(
    n_estimators=[100, 200, 300, 400],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[3, 4, 5, 6],
    subsample=[0.8, 0.9, 1.0],
)

gbc = GradientBoostingClassifier(random_state=42)
random_search = RandomizedSearchCV(
    estimator=gbc,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Keep the best estimator found by the search
best_gbc = random_search.best_estimator_

In [None]:
# Randomized hyperparameter search for the random-forest classifier
param_dist_rf = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rfc = RandomForestClassifier(random_state=42)
random_search_rf = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=param_dist_rf,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    random_state=42,
)
random_search_rf.fit(X_train, y_train)

# Keep the best estimator found by the search
best_rfc = random_search_rf.best_estimator_

In [None]:
# Combine both tuned models in a soft-voting ensemble and fit it on the training split
voting_clf = VotingClassifier(
    estimators=[('gbc', best_gbc), ('rfc', best_rfc)],
    voting='soft',
)

voting_clf.fit(X_train, y_train)

In [None]:
# Score the ensemble on the held-out validation split
y_pred = voting_clf.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Validation Accuracy: {accuracy:.2f}')

Validation Accuracy: 0.80


In [None]:
# Run the fitted ensemble on the transformed test features
test_predictions = voting_clf.predict(test_processed)

# Put the predictions into the sample-submission frame and write it to disk
submission['Transported'] = test_predictions
submission.to_csv(path_or_buf='submission1.csv', index=False)
print('Submission file saved.')

Submission file saved.
