In [23]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC


In [4]:
# Read the labelled training CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='C:/Users/gerald zhao/Desktop/train.csv')

In [5]:
# Preview the first five rows as a quick sanity check of the load
data.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [12]:
# Columns routed through the categorical branch (impute + one-hot encode)
categorical_cols = [
    'HomePlanet',
    'CryoSleep',
    'Cabin',
    'Destination',
    'VIP',
]

# Columns routed through the numerical branch (impute + scale)
numerical_cols = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

In [13]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (categories unseen during fit are ignored at transform time)
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_encoder = OneHotEncoder(handle_unknown='ignore')

# Numerical branch: fill gaps with the median, then standardize
num_imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

In [15]:
# Per-type preprocessing chains built from the transformers defined above
cat_pipeline = Pipeline(steps=[
    ('imputer', cat_imputer),
    ('onehot', cat_encoder),
])
num_pipeline = Pipeline(steps=[
    ('imputer', num_imputer),
    ('scaler', scaler),
])

# Route each column group through its own chain; columns not listed in
# either group are not passed to a transformer here
column_routes = [
    ('cat', cat_pipeline, categorical_cols),
    ('num', num_pipeline, numerical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [16]:
# Separate the label column from the feature columns
y = data['Transported']
X = data.drop(columns='Transported')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [17]:
# End-to-end estimator: preprocessing followed by an SVC with default settings
baseline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', SVC()),
]
pipeline = Pipeline(steps=baseline_steps)

In [18]:
# Fit the preprocessing steps and the classifier on the training split
pipeline.fit(X=X_train, y=y_train)

In [19]:
# Predict the held-out rows with the fitted pipeline
predictions = pipeline.predict(X_test)

# Report the fraction of held-out rows classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7866589994249569


In [22]:
# Load the competition test set
test_data = pd.read_csv(filepath_or_buffer='C:/Users/gerald zhao/Desktop/test.csv')

# The fitted pipeline applies the same preprocessing before predicting
test_predictions = pipeline.predict(test_data)

# Two-column submission table: passenger id plus predicted label
submission = pd.DataFrame(
    data={
        'PassengerId': test_data['PassengerId'],
        'Transported': test_predictions,
    }
)

# Cast the prediction column to bool before writing
submission = submission.astype({'Transported': bool})

# Write the submission without the DataFrame index
submission.to_csv(path_or_buf='C:/Users/gerald zhao/Desktop/submission.csv', index=False)

In [50]:
def train_and_create_submission(kernel_type, C=1.0, degree=None, gamma=None):
    """Train an SVC with the given settings, report hold-out metrics, and write a submission CSV.

    Uses the notebook-level names ``preprocessor``, ``X_train``, ``y_train``,
    ``X_test``, ``y_test`` and ``test_data``; the cells defining them must have
    been run first.

    Parameters
    ----------
    kernel_type : str
        Forwarded to ``SVC(kernel=...)``.
    C : float, default 1.0
        Forwarded to ``SVC(C=...)``.
    degree : int or None, default None
        Forwarded to ``SVC(degree=...)`` only when ``kernel_type == 'poly'``
        and a value is given; otherwise SVC's own default applies.
    gamma : str, float or None, default None
        Forwarded to ``SVC(gamma=...)`` when given; otherwise SVC's own
        default applies.

    Returns
    -------
    None
        Prints the accuracy and classification report for the hold-out split
        and writes ``submission_<kernel>_C<C>_degree<degree>_gamma<gamma>.csv``
        using a relative path.
    """
    # Only forward optional hyperparameters that were actually supplied
    classifier_options = {
        'kernel': kernel_type,
        'C': C  # Regularization parameter
    }
    if degree is not None and kernel_type == 'poly':
        classifier_options['degree'] = degree
    if gamma is not None:
        classifier_options['gamma'] = gamma

    # Pipeline does not copy its steps, so fitting would refit the shared
    # notebook-level `preprocessor` in place (the same object other pipelines
    # hold). Cloning gives each run its own unfitted copy.
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', SVC(**classifier_options))
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Predict and evaluate on the hold-out split
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f'Kernel: {kernel_type}, C: {C}, Degree: {degree if degree is not None else "Default"}, Gamma: {gamma if gamma is not None else "Default"}')
    print(f'Accuracy: {accuracy}')
    print(classification_report(y_test, predictions))

    # Predict on the actual test set
    test_predictions = pipeline.predict(test_data)

    # Create submission DataFrame
    submission = pd.DataFrame({
        'PassengerId': test_data['PassengerId'],
        'Transported': test_predictions.astype(bool)
    })

    # Save the submission file; the name encodes the settings so that
    # successive runs do not overwrite each other
    submission_filename = f'submission_{kernel_type}_C{C}_degree{degree}_gamma{gamma}.csv'
    submission.to_csv(submission_filename, index=False)
    print(f'Submission file created for {kernel_type} kernel with C {C}, degree {degree}, and gamma {gamma}: {submission_filename}')



In [None]:
# RBF kernel: three (C, gamma) settings, one submission file each
for rbf_C, rbf_gamma in ((0.1, 0.01), (1, 0.1), (10, 1)):
    train_and_create_submission('rbf', C=rbf_C, gamma=rbf_gamma)

In [None]:
# Polynomial kernel: three (C, degree, gamma) settings, one submission file each
for poly_C, poly_degree, poly_gamma in ((0.1, 2, 'scale'), (1, 3, 'auto'), (10, 4, 0.5)):
    train_and_create_submission('poly', C=poly_C, degree=poly_degree, gamma=poly_gamma)


In [None]:
# Sigmoid kernel: three (C, gamma) settings, one submission file each
for sigmoid_C, sigmoid_gamma in ((0.1, 0.01), (1, 0.1), (10, 1)):
    train_and_create_submission('sigmoid', C=sigmoid_C, gamma=sigmoid_gamma)


The cells below train one SVM for every combination of the following hyperparameters and collect all of the training results in a single table:
- kernels: ['rbf', 'poly', 'sigmoid']
- C_values: [0.1, 1, 10]
- gamma_values: ['scale', 'auto']
- degrees: [2, 3] (only used for the poly kernel)

In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# Reload the labelled training data for the grid comparison
data = pd.read_csv(filepath_or_buffer='C:/Users/gerald zhao/Desktop/train.csv')

# Feature groups, split by how each column is preprocessed
categorical_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Categorical branch: most-frequent imputation, then one-hot encoding
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_encoder = OneHotEncoder(handle_unknown='ignore')
cat_pipeline = Pipeline(steps=[
    ('imputer', cat_imputer),
    ('onehot', cat_encoder),
])

# Numerical branch: median imputation, then standardization
num_imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
num_pipeline = Pipeline(steps=[
    ('imputer', num_imputer),
    ('scaler', scaler),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_pipeline, categorical_cols),
    ('num', num_pipeline, numerical_cols),
])

# Separate the label column from the feature columns
y = data['Transported']
X = data.drop(columns='Transported')

# Hold out 20% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)





In [61]:
kernels = ['rbf', 'poly', 'sigmoid']
C_values = [0.1, 1, 10]
gamma_values = ['scale', 'auto']
degrees = [2, 3]  # Only used for poly


def _fit_and_score(kernel, C, gamma, degree=None):
    """Fit one SVC configuration and return its metrics as a result row (dict).

    ``degree`` is forwarded to SVC only when given; otherwise the row records
    ``'N/A'`` in the ``Degree`` column.
    """
    svc_kwargs = {'kernel': kernel, 'C': C, 'gamma': gamma}
    if degree is not None:
        svc_kwargs['degree'] = degree

    # Clone so each configuration fits its own preprocessor instead of
    # refitting the shared notebook-level object in place.
    model = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', SVC(**svc_kwargs))
    ])
    model.fit(X_train, y_train)

    train_preds = model.predict(X_train)
    # The hold-out split produced by train_test_split in this notebook is
    # named X_test / y_test; there is no X_val / y_val.
    val_preds = model.predict(X_test)

    return {
        'Kernel': kernel,
        'C': C,
        'Gamma': gamma,
        'Degree': degree if degree is not None else 'N/A',
        'Training Accuracy': accuracy_score(y_train, train_preds),
        'Validation Accuracy': accuracy_score(y_test, val_preds),
        # zero_division=0 yields the same value as the default ("warn") but
        # without a warning when a configuration predicts only one class
        'Precision': precision_score(y_test, val_preds, average='macro', zero_division=0),
        'Recall': recall_score(y_test, val_preds, average='macro', zero_division=0)
    }


# Function to evaluate models
def evaluate_models():
    """Fit an SVC for every configuration in the grid and tabulate the metrics.

    Iterates over ``kernels`` x ``C_values`` x ``gamma_values``; for the
    ``'poly'`` kernel each combination is additionally repeated for every
    value in ``degrees``. Uses the notebook-level ``preprocessor``,
    ``X_train``/``y_train`` for fitting and ``X_test``/``y_test`` for the
    validation metrics.

    Returns
    -------
    pandas.DataFrame
        One row per configuration with the columns Kernel, C, Gamma, Degree,
        Training Accuracy, Validation Accuracy, Precision and Recall.
    """
    results = []
    for kernel in kernels:
        # Only the polynomial kernel is swept over `degrees`
        degree_options = degrees if kernel == 'poly' else [None]
        for C in C_values:
            for gamma in gamma_values:
                for degree in degree_options:
                    results.append(_fit_and_score(kernel, C, gamma, degree))

    # Creating DataFrame from results
    df_results_to_return = pd.DataFrame(results)
    return df_results_to_return

# Run the evaluation
df_results = evaluate_models()
print(df_results)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


     Kernel     C  Gamma Degree  Training Accuracy  Validation Accuracy  Precision    Recall
0       rbf   0.1  scale    N/A           0.788755             0.772283   0.772614  0.772441
1       rbf   0.1   auto    N/A           0.504602             0.506038   0.752735  0.501161
2       rbf   1.0  scale    N/A           0.827006             0.786659   0.787378  0.786397
3       rbf   1.0   auto    N/A           0.774374             0.760782   0.762252  0.761130
4       rbf  10.0  scale    N/A           0.985476             0.779183   0.780928  0.778780
5       rbf  10.0   auto    N/A           0.757406             0.753306   0.761121  0.754132
6      poly   0.1  scale      2           0.779695             0.768258   0.770784  0.768713
7      poly   0.1  scale      3           0.781996             0.769408   0.771589  0.769830
8      poly   0.1   auto      2           0.503307             0.504888   0.252444  0.500000
9      poly   0.1   auto      3           0.503307             0.50488