# Task 1 (Data Science Internship)
# Titanic Survival Project
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the Titanic dataset bundled with seaborn (fetched/cached by seaborn).
titanic = sns.load_dataset('titanic')

# Drop columns that must not reach the model:
#   - 'alive' is a yes/no restatement of the 'survived' target; keeping it
#     leaks the label into the features and makes the accuracy meaningless.
#   - 'class' and 'embark_town' duplicate 'pclass' and 'embarked', so they
#     only add perfectly collinear dummy columns.
titanic = titanic.drop(columns=['alive', 'class', 'embark_town'])

# Remove rows with missing values in the 'age' and 'embarked' columns
titanic = titanic.dropna(subset=['age', 'embarked'])

# One-hot encode the remaining categorical variables into numeric columns;
# drop_first=True omits one level per variable to avoid redundant dummies.
titanic = pd.get_dummies(titanic, drop_first=True)

# Separate the data into features X and target variable Y
X = titanic.drop(columns=['survived'])
Y = titanic['survived']

# Hold out 20% of the rows for testing; the rest is used to train the model.
# random_state is fixed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# The features are on different scales (e.g. age vs. fare vs. 0/1 dummies),
# so standardize them. The scaler is fitted on the training data only and
# then applied to the test data, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build and train the logistic regression model
modelTitanic = LogisticRegression()
modelTitanic.fit(X_train, Y_train)

# Predict survival for the held-out passengers
Y_pred = modelTitanic.predict(X_test)

# Calculate the accuracy of the predictions on the test set
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Bar chart of the class balance among the rows kept after cleaning
# (this shows the actual labels, not the model's predictions).
# NOTE(review): newer seaborn releases (0.13+) are reported to warn when
# `palette` is passed without `hue` -- confirm against the installed version.
plt.figure(figsize=(6, 4))
sns.countplot(x='survived', data=titanic, palette='Set2')
plt.xticks([0, 1], ['Not Survived', 'Survived'])
plt.title('Titanic Passenger Survival')
plt.xlabel('Survival Status')
plt.ylabel('Count')
plt.show()