In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Read the Titanic passenger records from the CSV file into a DataFrame
url = '/content/Titanic-Dataset.csv'
titanic_data = pd.read_csv(url)

# Report the per-column count of missing entries before any cleaning is done
print("Missing values before imputation:", titanic_data.isna().sum(), sep="\n")

# Fill in the missing ages with the median age.
# NOTE: both imputers in this section are fitted on the full dataset, i.e.
# before the train/test split performed later in this script, so rows that end
# up in the test set also contribute to the fill values.
imputer_age = SimpleImputer(strategy='median')
# The imputer works on 2-D input and returns an (n_samples, 1) array;
# flatten it so a plain 1-D column is assigned back.
titanic_data['Age'] = imputer_age.fit_transform(titanic_data[['Age']]).ravel()

# Fill in missing 'Embarked' values with the most common one
imputer_embarked = SimpleImputer(strategy='most_frequent')
# Selecting with double brackets already yields a 2-D single-column frame,
# so no extra reshape is required before handing it to the imputer.
titanic_data['Embarked'] = imputer_embarked.fit_transform(titanic_data[['Embarked']]).ravel()

# Check again: 'Age' and 'Embarked' should now be complete. 'Cabin' was not
# imputed, so it still reports missing values at this point.
print("Missing values after imputation:")
print(titanic_data.isnull().sum())

# Remove 'Cabin', 'Name' and 'Ticket': they either have too many missing values
# or are not useful as inputs to the model
titanic_data = titanic_data.drop(['Cabin', 'Name', 'Ticket'], axis=1)

# Replace the text categories in 'Sex' and 'Embarked' with integer codes.
# The same encoder object is re-fitted for each column in turn, so after the
# loop it holds the classes of the last column ('Embarked') only.
label_encoder = LabelEncoder()
for categorical_column in ('Sex', 'Embarked'):
    titanic_data[categorical_column] = label_encoder.fit_transform(titanic_data[categorical_column])

# 'Survived' is the target (y); the features (X) are every other column
# except 'PassengerId'
y = titanic_data['Survived']
X = titanic_data.drop(columns=['Survived', 'PassengerId'])

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a logistic regression classifier (allowing up to 200 solver iterations)
# and fit it on the training split; fit() returns the fitted estimator itself
model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Predict survival for the held-out test rows
y_pred = model.predict(X_test)

# Compare the predictions with the true test labels using three summaries:
# overall accuracy, the confusion matrix, and a per-class text report
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Show the evaluation results, each heading followed by its value on a new line
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")

Missing values before imputation:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Missing values after imputation:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64
Accuracy: 81.01%
Confusion Matrix:
[[90 15]
 [19 55]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

