In [26]:
import pandas as pd
import matplotlib.pyplot as plt

In [27]:
# Load the passenger table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='Titanic.csv')

In [28]:
# Preview the top of the dataset. The label and the number of rows shown are
# derived from one value so they cannot drift apart (the label previously
# said 5 while head() displayed 10 rows).
n_preview_rows = 10
print(f"First {n_preview_rows} rows of the dataset:")
df.head(n_preview_rows)

First 5 rows of the dataset:


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived
0,1,3,Allison Hill,male,17,4,2,43d75413-a939-4bd1-a516-b0d47d3572cc,144.08,Q,1
1,2,1,Noah Rhodes,male,60,2,2,6334fa2a-8b4b-47e7-a451-5ae01754bf08,249.04,S,0
2,3,3,Angie Henderson,male,64,0,0,61a66444-e2af-4629-9efb-336e2f546033,50.31,Q,1
3,4,3,Daniel Wagner,male,35,4,0,0b6c03c8-721e-4419-afc3-e6495e911b91,235.2,C,1
4,5,1,Cristian Santos,female,70,0,3,436e3c49-770e-49db-b092-d40143675d58,160.17,C,1
5,6,1,Connie Lawrence,male,30,2,3,907276ea-44c0-4a9a-964a-cb1f39eb038a,412.97,Q,1
6,7,3,Abigail Shaffer,female,71,4,0,b24beb96-c092-4b74-a6b0-7ff60fc79913,37.92,C,1
7,8,2,Gina Moore,female,74,2,0,ab0b4cc5-15f0-4a19-98c1-80d2aadf3692,215.12,Q,0
8,9,3,Gabrielle Davis,male,63,4,1,69fb0763-8a9c-46f7-8ee4-e5125802b640,234.63,C,1
9,10,3,Ryan Munoz,female,28,2,1,3a3e46b0-96f8-4ff7-81a7-2883f8802878,365.25,S,1


<!--  PassengerId: Unique ID for each passenger
 Pclass: Passenger class (1 = First class, 2 = Second class, 3 = Third class)
 Name: Name of the passenger
 Sex: Gender of the passenger (male or female)
 Age: Age of the passenger
 SibSp: Number of siblings/spouses aboard the Titanic
 Parch: Number of parents/children aboard the Titanic
 Ticket: Ticket number of the passenger
 Fare: Fare paid by the passenger
 Embarked: Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
 Survived: Survival status (1 = Survived, 0 = Did not survive) -->

In [29]:
# Show the distinct port-of-embarkation codes present in the data.
print("\nUnique values in 'Embarked':")
pd.unique(df['Embarked'])


Unique values in 'Embarked':


array(['Q', 'S', 'C'], dtype=object)

In [30]:
# Count null entries per column (isna is the pandas alias of isnull).
print("\nMissing values in each column:")
df.isna().sum()


Missing values in each column:


PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Survived       0
dtype: int64

In [31]:
# Print column names, non-null counts and dtypes.
print("\nDataset info:")
# buf=None is the default: the summary is written to standard output.
df.info(buf=None)


Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1000 non-null   int64  
 1   Pclass       1000 non-null   int64  
 2   Name         1000 non-null   object 
 3   Sex          1000 non-null   object 
 4   Age          1000 non-null   int64  
 5   SibSp        1000 non-null   int64  
 6   Parch        1000 non-null   int64  
 7   Ticket       1000 non-null   object 
 8   Fare         1000 non-null   float64
 9   Embarked     1000 non-null   object 
 10  Survived     1000 non-null   int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 86.1+ KB


In [32]:
# Impute missing values column by column: Age with its median, Embarked with
# its most frequent value. Both fill values are computed before the fill.
df = df.fillna(
    value={
        'Age': df['Age'].median(),
        'Embarked': df['Embarked'].mode()[0],
    }
)

In [33]:
# Replace the string categories with integer codes.
# NOTE(review): Series.map yields NaN for any value missing from the dict, so
# re-running this cell on already-encoded columns turns every value into NaN.
df = df.assign(
    Sex=df['Sex'].map({'female': 0, 'male': 1}),
    Embarked=df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2}),
)

In [34]:
from sklearn.preprocessing import StandardScaler
# Standardize the two continuous features. The fitted scaler is kept in
# `scaler` so the same transformation can be applied to other inputs.
# NOTE(review): the scaler is fitted on the whole frame, before the
# train/test split performed in a later cell, so test rows influence the
# scaling statistics.
scaler = StandardScaler().fit(df[['Age', 'Fare']])
df[['Age', 'Fare']] = scaler.transform(df[['Age', 'Fare']])

In [35]:
# Remove the identifier / free-text columns that are not used as features.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [36]:
# Target vector: the survival flag.
y = df['Survived']
# Feature matrix: every remaining column, with any categorical columns
# one-hot encoded by get_dummies.
X = pd.get_dummies(df.drop(columns='Survived'))

In [37]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [38]:
from sklearn.tree import DecisionTreeClassifier
# Train a decision tree on the training split (fit returns the estimator
# itself, so construction and fitting can be chained), then predict the
# held-out rows.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [39]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Print each evaluation metric for the held-out predictions, in order:
# overall accuracy, per-class precision/recall/F1, and the confusion matrix.
for report_label, metric_fn in (
    ("Accuracy on test data:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(report_label, metric_fn(y_test, y_pred))

Accuracy on test data: 0.53

Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.54      0.55       160
           1       0.50      0.52      0.51       140

    accuracy                           0.53       300
   macro avg       0.53      0.53      0.53       300
weighted avg       0.53      0.53      0.53       300


Confusion Matrix:
 [[86 74]
 [67 73]]


In [None]:
from sklearn.tree import plot_tree
# Draw the fitted decision tree.
# - dpi=1000 on a 12x8 inch figure is a 12000x8000 pixel canvas, which is
#   extremely slow and memory-hungry to render; 150 dpi stays legible while
#   remaining practical to draw.
# - feature_names is passed as a plain list because some scikit-learn
#   releases reject a pandas Index for this parameter.
fig, ax = plt.subplots(figsize=(12, 8), dpi=150)
plot_tree(
    model,
    feature_names=list(X.columns),
    class_names=["Did not Survive", "Survived"],
    filled=True,
    ax=ax,
)
plt.show()

In [1]:
# Predict survival for one hypothetical passenger.
# Values are given on the raw scale, using the same integer encodings as the
# training data (Sex: female=0, male=1; Embarked: C=0, Q=1, S=2).
new_passenger = {
    'Pclass': 3,
    'Sex': 1,
    'Age': 21.0,
    'SibSp': 0,
    'Parch': 1,
    'Fare': 200.0,
    'Embarked': 2,
}
# columns=X.columns puts the features in the exact order the model was
# trained on.
new_data = pd.DataFrame([new_passenger], columns=X.columns)
# The model was trained on standardized Age and Fare, so the raw values must
# go through the same fitted scaler; otherwise the tree compares raw numbers
# against thresholds learned on z-scores.
new_data[['Age', 'Fare']] = scaler.transform(new_data[['Age', 'Fare']])
prediction = model.predict(new_data)
print("Prediction:", prediction)


NameError: name 'pd' is not defined