In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the Titanic CSV into a DataFrame and display its shape as the cell output
DATA_PATH = "/kaggle/input/test-file/tested.csv"
titanic_data = pd.read_csv(DATA_PATH)
titanic_data.shape

In [None]:
# Preview the first ten rows of the loaded frame
titanic_data.head(n=10)

In [None]:
# List the distinct values (NaN included, if any) found in one column
column_name = 'Embarked'
unique_values = titanic_data[column_name].unique()
print(f"Unique values in the '{column_name}' column:", unique_values, sep="\n")

In [None]:
# Count missing values per column. `isnull()` yields a boolean frame, and
# `count()` on that frame only reports the number of rows for every column
# (booleans are never NA), so add up the True flags with `sum()` instead.
titanic_data.isnull().sum()

In [None]:
# Columns whose gaps get filled, grouped by imputation strategy:
# mean for the numeric ones, most frequent value for the categorical ones
numeric_features = ["Age", "Fare"]
categorical_features = ["Embarked", "Sex", "Pclass"]

In [None]:
from sklearn.impute import SimpleImputer

# Fill gaps in the numeric columns with each column's mean.
# NOTE(review): the imputer is fit on the whole frame, before the train/test
# split further down, so test rows contribute to the fill value.
numeric_imputer = SimpleImputer(strategy='mean')
imputed_numeric = numeric_imputer.fit_transform(titanic_data[numeric_features])
titanic_data[numeric_features] = imputed_numeric

In [None]:
# Impute missing values for categorical features with the most frequent value.
# When the selected columns mix dtypes, SimpleImputer returns a single
# object-dtype array; assigning it straight back would leave every column in
# `categorical_features` as object dtype (e.g. Pclass, presumably an integer
# column -- confirm against the CSV). Remember the dtypes the columns had
# before imputation and restore them once the gaps are filled.
categorical_dtypes = titanic_data[categorical_features].dtypes.to_dict()
categorical_imputer = SimpleImputer(strategy='most_frequent')
titanic_data[categorical_features] = categorical_imputer.fit_transform(titanic_data[categorical_features])
titanic_data = titanic_data.astype(categorical_dtypes)

In [None]:
# Convert categorical variables to numerical using Label Encoding.
# Each column gets its own encoder: refitting one shared LabelEncoder would
# overwrite the mapping learned for the previous column, so its `classes_`
# and `inverse_transform` could no longer decode that column.
# `label_encoders` keeps the fitted encoder per column; `label_encoder` ends up
# bound to the encoder of the last column ('Embarked').
label_encoders = {}
for encoded_column in ('Sex', 'Embarked'):
    label_encoder = LabelEncoder()
    titanic_data[encoded_column] = label_encoder.fit_transform(titanic_data[encoded_column])
    label_encoders[encoded_column] = label_encoder

In [None]:
# Remove the columns that are not used as model inputs
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
titanic_data = titanic_data.drop(columns_to_drop, axis=1)

In [None]:
# Separate the inputs (every column except 'Survived') from the label column
X = titanic_data.drop(columns='Survived')
y = titanic_data.Survived

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build a Gaussian Naive Bayes classifier and fit it on the training split;
# the trailing expression shows the fitted estimator as the cell output
naive_bayes_model = GaussianNB().fit(X_train, y_train)
naive_bayes_model

In [None]:
# Predict a label for every row of the held-out test split
y_pred = naive_bayes_model.predict(X_test)

In [None]:
# Score the predictions against the true test labels, reporting each metric
# right after it is computed
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")

classification_rep = classification_report(y_test, y_pred)
print(f"Classification Report:\n{classification_rep}")

In [None]:
# Plotting Pairplot for preprocessed data, coloured by the 'Survived' label
sns.pairplot(titanic_data, hue='Survived', diag_kind='kde')
# Place the figure title above the grid (the default is y=0.98) so it does not
# overlap the top row of panels.
plt.suptitle('Pairplot of Titanic Data', y=1.02)
plt.show()

In [None]:
# Draw the pairwise column correlations of the preprocessed frame as an annotated heatmap
correlation_matrix = titanic_data.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap for Titanic Data')
plt.show()

In [None]:
# Bar chart of how many rows fall in each 'Survived' class
# (the frame has been preprocessed by this point, but 'Survived' itself is never modified)
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Survived', data=titanic_data, ax=ax)
ax.set_title('Count of Survived and Not Survived in Titanic Data')
plt.show()