In [1]:
!pip install pandas scikit-learn matplotlib



In [2]:
import pandas as pd  # for working with data tables
from sklearn.model_selection import train_test_split  # Splits data into training/testing sets to evaluate performance
from sklearn.tree import DecisionTreeClassifier, plot_tree  # for building decision trees
from sklearn.metrics import accuracy_score, classification_report  # Metrics to measure how good our model is
import matplotlib.pyplot as plt  # Used for creating charts and visualizations

print("✓ Libraries imported successfully!")

✓ Libraries imported successfully!


In [3]:
# Load the Titanic passenger table from a CSV in the current working folder.
# The file name must match exactly; other copies of this dataset are
# sometimes named 'train.csv' or 'Titanic-Dataset.csv'.
df = pd.read_csv("Titanic Dataset.csv")

# df.shape is (rows, columns), so both counts come from one attribute.
print("✓ Dataset loaded successfully!")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

✓ Dataset loaded successfully!
Number of rows: 1309
Number of columns: 14


In [4]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [5]:
# Preview the last rows as a check on the end of the table.
df.tail()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1304,3,0,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0,0,0,2670,7.225,,C,,,
1308,3,0,"Zimmerman, Mr. Leo",male,29.0,0,0,315082,7.875,,S,,,


In [6]:
# Print each column's dtype and non-null count to see which columns have
# missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [7]:
# Each row is treated as one passenger, so the row count is the passenger count.
total_passengers = df.shape[0]
print(f"Total passengers in dataset: {total_passengers}")

Total passengers in dataset: 1309


In [8]:
# Show the column labels under a heading (heading and labels on separate lines).
print("Column names:", df.columns, sep="\n")

Column names:
Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns.
df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881138,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.413493,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.17,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [10]:
# Row count of the DataFrame (the same quantity stored in total_passengers).
len(df)

1309

In [12]:
# Print the column labels (repeats the earlier "Column names:" listing).
print(df.columns)

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')


In [13]:
# Mean of the 'age' column. pandas skips missing values by default
# (skipna=True), so this is the mean over passengers with a recorded age.
df['age'].mean()

np.float64(29.881137667304014)

In [15]:
# Cheapest and most expensive ticket; min()/max() ignore the missing fare.
fare_col = df['fare']
print(f"Min Fare: {fare_col.min()}")
print(f"Max Fare: {fare_col.max()}")

Min Fare: 0.0
Max Fare: 512.3292


In [17]:
# 'survived' is a 0/1 indicator column (the name is lowercase in this
# dataset, as shown by df.columns), so its sum is the number of survivors.
survivors = df['survived'].sum() 
print(f"Number of survivors: {survivors}")

Number of survivors: 500


In [18]:
# Share of passengers who survived, as a percentage. Relies on `survivors`
# and `total_passengers` computed in earlier cells, so run those first.
survival_rate = (survivors / total_passengers) * 100
print(f"Survival rate: {survival_rate:.2f}%")

Survival rate: 38.20%


In [19]:
# Count the missing (NaN) entries in every column.
print(df.isna().sum())

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64


In [21]:
# Group by 'survived' (0 or 1) and compute the mean 'age' within each group.
# Rows with a missing age are left out of the mean (pandas skips NaN by default).
df.groupby('survived')['age'].mean()

survived
0    30.545363
1    28.918244
Name: age, dtype: float64

In [22]:
# Count passengers by the value of the 'sex' column.
df['sex'].value_counts()

sex
male      843
female    466
Name: count, dtype: int64

In [23]:
# Group by 'pclass' and take the mean of 'survived'. Because 'survived' is
# 0/1, the mean is the fraction of each class that survived (multiply by 100
# to get a percentage).
df.groupby('pclass')['survived'].mean()

pclass
1    0.619195
2    0.429603
3    0.255289
Name: survived, dtype: float64

In [None]:
# 1. To know what you are working with: you need to find errors, outliers, missing values, etc., and the relationships between columns.
# 2. The data wasn't recorded or has been lost.
# 3. It's easy to learn and a good starting point, because it is a yes-or-no question and the dataset is small.
# 4. Probably sex and class: it was women and children first, and then, to be more specific, higher-class women and children would have been prioritized over lower-class ones.