In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Step 1: Read the Data
# Both CSVs are resolved relative to the notebook's working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('training_titanic_x_y_train.csv', 'test_titanic_x_test.csv')
)

# Step 2: Feature Engineering (e.g., extract titles from names, create family size)
def _extract_title(name):
    """Return the text between the first comma and the next period of `name`.

    For 'Braund, Mr. Owen Harris' this yields 'Mr'. Values that are not
    strings (e.g. missing names) or that contain no comma cannot be parsed
    this way, so they map to the placeholder 'Unknown' instead of raising.
    """
    if not isinstance(name, str) or ',' not in name:
        return 'Unknown'
    return name.split(',')[1].split('.')[0].strip()


def feature_engineering(df):
    """Derive the Title and FamilySize features and drop the raw text columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Name', 'SibSp' and 'Parch'. 'Ticket' and
        'Cabin' are dropped when present.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'Title' and 'FamilySize' appended and 'Name',
        'Ticket' and 'Cabin' removed. The input frame is left untouched, so
        the function can safely be re-run on the same raw data.
    """
    engineered = df.assign(
        # Honorific parsed from the name; malformed names become 'Unknown'.
        Title=df['Name'].apply(_extract_title),
        # Siblings/spouses + parents/children + the passenger themself.
        FamilySize=df['SibSp'] + df['Parch'] + 1,
    )

    # errors='ignore' keeps the drop a no-op for columns a file does not ship.
    return engineered.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

# Step 3: Handle missing values and One-Hot Encoding
# Identify categorical columns
categorical_cols = ['Sex', 'Embarked', 'Title']

# Numeric features: every numeric column that is neither the target nor one of
# the categorical columns. select_dtypes keeps a stray text column from
# reaching the mean imputer / scaler, where it would raise.
numeric_features = (
    train_df
    .drop(columns=['Survived'] + categorical_cols)
    .select_dtypes(include='number')
    .columns
)

# Pipeline for numeric features (imputation + scaling)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Pipeline for categorical features (imputation + one-hot encoding)
# No `drop='first'`: combined with handle_unknown='ignore', a category unseen
# during fit is encoded as all zeros, which would be indistinguishable from the
# dropped reference category. Rare titles make unseen categories likely in CV
# folds and in the test set. The model used downstream is L2-regularised, so
# the redundant indicator column is harmless.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Combine transformers; columns not listed in either group are dropped
# (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_cols)])

# Step 4: Model Training with Hyperparameter Tuning
# Split the engineered training frame into the feature matrix and the target.
X_train = train_df.drop(columns='Survived')
y_train = train_df['Survived']

# Full pipeline: preprocessing followed by a logistic-regression classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=1000)),
])

# Search space; the `model__` prefix routes each entry to the 'model' step.
param_grid = {
    'model__C': [0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'model__solver': ['liblinear', 'lbfgs'],
    'model__penalty': ['l2'],
}

# Exhaustive 5-fold cross-validated search over the grid, using all cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Step 5: Make Predictions
# Score the test passengers with the best pipeline found by the grid search.
best_model = grid_search.best_estimator_
X_test = test_df.copy()
predictions = best_model.predict(X_test)

# Save Predictions
# One label per line: neither the row index nor a header row is written.
predictions_df = pd.DataFrame({'Prediction': predictions})
predictions_df.to_csv('predictions.csv', header=False, index=False)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


