In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Read the training data
train_data = pd.read_csv('src/data/train.csv')
print(f"Training data loaded successfully! Shape: {train_data.shape}\n")
test_data = pd.read_csv('src/data/test.csv')


FileNotFoundError: [Errno 2] No such file or directory: 'src/data/train.csv'

In [2]:
print(train_data.head(), "\n")

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S   



In [3]:
print("Checking missing values...")
print(train_data.isnull().sum(), "\n")

Checking missing values...
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64 



In [4]:
print("Summary statistics:")
print(train_data.describe(), "\n")

Summary statistics:
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200   



In [5]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'
print('Selected features:', features, '\n', 'target:', target, '\n')
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
print('Imputed missing values in age with median, imputed missing values in embarked with mode.\n')
X = train_data[features]
y = train_data[target]
print(f"Feature matrix shape: {X.shape}, Target vector shape: {y.shape}\n")


Selected features: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'] 
 target: Survived 

Imputed missing values in age with median, imputed missing values in embarked with mode.

Feature matrix shape: (891, 7), Target vector shape: (891,)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)


In [6]:
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Define preprocessing pipeline
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

print("Preprocessing pipeline created!\n")

Preprocessing pipeline created!



In [7]:
print("Building logistic regression model...")

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])


Building logistic regression model...


In [8]:
model.fit(X, y)
print("Model training complete!\n")

Model training complete!



In [9]:
y_pred_train = model.predict(X)
train_acc = accuracy_score(y, y_pred_train)
print(f"Training Accuracy: {train_acc:.4f}\n")


Training Accuracy: 0.8058



In [10]:
print("Loading test dataset...")
test_data = pd.read_csv("../data/test.csv")
print(f"Test data loaded successfully! Shape: {test_data.shape}\n")

Loading test dataset...
Test data loaded successfully! Shape: (418, 11)



In [11]:
print("Preprocessing test data...")
test_data['Age'].fillna(train_data['Age'].median(), inplace=True)
test_data['Fare'].fillna(train_data['Fare'].median(), inplace=True)
test_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)

X_test = test_data[features]

Preprocessing test data...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Fare'].fillna(train_data['Fare'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedia

In [12]:
print("Predicting on test data...")
test_predictions = model.predict(X_test)

Predicting on test data...


In [13]:
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': test_predictions
})
output.to_csv("../data/test_predictions.csv", index=False)
print("Test predictions saved to '../data/test_predictions.csv'\n")
print("=== SCRIPT COMPLETE ===")

Test predictions saved to '../data/test_predictions.csv'

=== SCRIPT COMPLETE ===
