In [91]:
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [92]:
# Load the training and test tables from their CSV files
train_path = 'training_titanic_x_y_train.csv'
test_path = 'test_titanic_x_test.csv'
train_df, test_df = (pd.read_csv(path) for path in (train_path, test_path))

In [93]:
# Impute missing values with statistics learned from the training frame only:
# the median for numeric columns, the most frequent value for everything else.
# Forward-fill is not used because it copies whatever value happens to sit in
# the previous row and leaves NaNs behind when a column's first row is missing,
# which would later make scaling/model fitting fail.
# Only columns present in both frames are imputed, so a training-only column
# (presumably the label - confirm) is never filled with a fabricated value.
fill_values = {}
for col in train_df.columns.intersection(test_df.columns):
    observed = train_df[col].dropna()
    if observed.empty:
        # Nothing to learn from an all-missing column; leave it untouched.
        continue
    is_numeric = (
        pd.api.types.is_numeric_dtype(observed)
        and not pd.api.types.is_bool_dtype(observed)
    )
    fill_values[col] = observed.median() if is_numeric else observed.mode().iloc[0]

# Reassign instead of mutating in place so re-running the cell is harmless.
train_df = train_df.fillna(fill_values)
test_df = test_df.fillna(fill_values)

In [94]:
# Columns stored with object dtype are the ones treated as categorical
categorical_cols = train_df.select_dtypes(include='object').columns

In [96]:
# One-hot encoder used for both train and test: dense output, the first level
# of each feature dropped, and categories unseen during fitting ignored when
# transforming.
encoder = OneHotEncoder(
    handle_unknown='ignore',
    drop='first',
    sparse_output=False,
)

In [103]:
# Learn the category levels from the training rows, then encode those rows
train_categoricals = train_df[categorical_cols]
X_train_encoded = encoder.fit(train_categoricals).transform(train_categoricals)

In [104]:
# Encode the test rows with the already-fitted encoder (no refitting)
X_test_encoded = encoder.transform(test_df.loc[:, categorical_cols])



In [105]:
# Build the feature frames by removing the raw categorical columns; their
# one-hot encoding is attached separately.
# The label column must be removed from the training features as well:
# leaving it in means the model is fitted on the very value it is supposed to
# predict, and a test frame without that column can only be given a constant
# placeholder for it. `errors='ignore'` keeps this safe when a frame does not
# contain the label (or it was already dropped as a categorical column).
target_col = 'Survived'
X_train = train_df.drop(columns=categorical_cols).drop(columns=target_col, errors='ignore')
X_test = test_df.drop(columns=categorical_cols).drop(columns=target_col, errors='ignore')

In [106]:
# Wrap the encoded arrays in DataFrames that share one set of column labels,
# taken from the fitted encoder, so train and test columns match by name.
encoded_feature_names = encoder.get_feature_names_out()
X_train_encoded_df, X_test_encoded_df = (
    pd.DataFrame(encoded, columns=encoded_feature_names)
    for encoded in (X_train_encoded, X_test_encoded)
)

In [107]:
def _with_encoded(base_part, encoded_part):
    """Return both frames side by side, each first reset to a fresh positional index."""
    parts = [frame.reset_index(drop=True) for frame in (base_part, encoded_part)]
    return pd.concat(parts, axis=1)


# Append the one-hot columns to the remaining (non-categorical) columns
X_train = _with_encoded(X_train, X_train_encoded_df)
X_test = _with_encoded(X_test, X_test_encoded_df)


In [108]:
# Give the test matrix exactly the training columns, in the same order.
# Columns the test set lacks are zero-filled; report them instead of padding
# silently, because a missing column usually points at a schema problem (for
# example a label column left among the training features) rather than
# something that is safe to replace with a constant.
missing_in_test = X_train.columns.difference(X_test.columns)
if len(missing_in_test) > 0:
    warnings.warn(
        f"Test data lacks training columns {list(missing_in_test)}; filling them with 0."
    )
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [109]:
# Target vector: the 'Survived' column of the training frame
y_train = train_df.loc[:, 'Survived']

In [110]:
# Standardise the features: fit the scaler on the training matrix only, then
# apply that same fitted scaler to both the training and the test matrix.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [111]:
# Fit a logistic-regression classifier on the scaled training features;
# max_iter is set to 1000 rather than left at the library default.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model

In [112]:
# Predict a class label for every row of the scaled test matrix
predictions = model.predict(X_test)

In [113]:
# Write the predictions to CSV as one bare column (no index, no header row)
predictions_df = pd.DataFrame({'Prediction': predictions})
predictions_df.to_csv('predictions.csv', header=False, index=False)