In [9]:
import pandas as pd
# Load the Titanic dataset
file_path = "Titanic-Dataset.csv"
df = pd.read_csv(file_path)
# Display basic information and first few rows
df.info(), df.head()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Drop unnecessary columns: PassengerId, Name, Ticket, and Cabin (too many missing values)
df_cleaned = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Fill missing Age values with median
imputer = SimpleImputer(strategy="median")
df_cleaned["Age"] = imputer.fit_transform(df_cleaned[["Age"]])

# Fill missing Embarked values with the most frequent category
df_cleaned["Embarked"].fillna(df_cleaned["Embarked"].mode()[0], inplace=True)

# Convert categorical features to numerical
label_encoders = {}
for col in ["Sex", "Embarked"]:
    le = LabelEncoder()
    df_cleaned[col] = le.fit_transform(df_cleaned[col])
    label_encoders[col] = le

# Separate features and target variable
X = df_cleaned.drop(columns=["Survived"])
y = df_cleaned["Survived"]

# Split data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train.shape, X_test.shape


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned["Embarked"].fillna(df_cleaned["Embarked"].mode()[0], inplace=True)


((712, 7), (179, 7))

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train Logistic Regression model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

accuracy, report

(0.8044692737430168,
 '              precision    recall  f1-score   support\n\n           0       0.82      0.86      0.84       105\n           1       0.78      0.73      0.76        74\n\n    accuracy                           0.80       179\n   macro avg       0.80      0.79      0.80       179\nweighted avg       0.80      0.80      0.80       179\n')