# Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display 
from sklearn.metrics import accuracy_score
%matplotlib inline

# Loading and Preparing the Dataset

In [2]:
full_data = pd.read_csv(r"D:\AMIT Course\Data Set\Titanic\titanic_data.csv")
display(full_data.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Feature Selection and Preprocessing

In [3]:
y = full_data['Survived']
x= full_data.drop(['Survived','PassengerId','Name','Ticket','Cabin'], axis = 1)
display(x.head())

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [4]:
x = pd.get_dummies(x)

In [5]:
x = x.fillna(0.0)
display(x.head())

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,False,True,False,False,True
1,1,38.0,1,0,71.2833,True,False,True,False,False
2,3,26.0,0,0,7.925,True,False,False,False,True
3,1,35.0,1,0,53.1,True,False,False,False,True
4,3,35.0,0,0,8.05,False,True,False,False,True


# Checking Unique Values in Certain Features

In [6]:
print(x['SibSp'].unique())

[1 0 3 4 2 5 8]


In [7]:
print(x['Parch'].unique())

[0 1 2 5 3 4 6]


In [8]:
print(x['Age'].unique())

[22.   38.   26.   35.    0.   54.    2.   27.   14.    4.   58.   20.
 39.   55.   31.   34.   15.   28.    8.   19.   40.   66.   42.   21.
 18.    3.    7.   49.   29.   65.   28.5   5.   11.   45.   17.   32.
 16.   25.    0.83 30.   33.   23.   24.   46.   59.   71.   37.   47.
 14.5  70.5  32.5  12.    9.   36.5  51.   55.5  40.5  44.    1.   61.
 56.   50.   36.   45.5  20.5  62.   41.   52.   63.   23.5   0.92 43.
 60.   10.   64.   13.   48.    0.75 53.   57.   80.   70.   24.5   6.
  0.67 30.5   0.42 34.5  74.  ]


In [9]:
print(x['Pclass'].unique())

[3 1 2]


# Visualizing Survival Counts by Gender

In [10]:
female_data=full_data[full_data["Sex"] == "female"]
sns.countplot(data=female_data,x='Survived')
plt.xlabel("number of passengers who survived and not survived with respect they are all females")

Text(0.5, 0, 'number of passengers who survived and not survived with respect they are all females')

In [11]:
male_data=full_data[full_data["Sex"] == "male"]
sns.countplot(data=male_data,x='Survived')
plt.xlabel("number of passengers who survived and not survived with respect they are all males")

Text(0.5, 23.52222222222222, 'number of passengers who survived and not survived with respect they are all males')

# Side-by-Side Gender Survival Visualization

In [12]:
plt.figure(figsize = [10, 5])
plt.subplot(1, 2, 1) 
sns.countplot(data=male_data,x='Survived')
plt.xlabel("all males")
plt.subplot(1, 2, 2)
sns.countplot(data=female_data,x='Survived')
ylabel=np.arange(0,600,100)
plt.yticks(ylabel)
plt.xlabel("all females")

Text(0.5, 0, 'all females')

# Exploring Column Names

In [13]:
print(full_data.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


# First Prediction Rule – Gender Based

In [14]:
def predictions_1(data):
    predictions = []
    for i, passenger in data.iterrows():
        if passenger["Sex"] == "female":
            predictions.append(1)
        else:
            predictions.append(0)
    return pd.Series(predictions)

x = full_data  
predictions = predictions_1(x)


In [15]:
print(accuracy_score(y, predictions))

0.7867564534231201


# Visualizing Male Age vs Survival

In [16]:
male_data_nsurv=full_data[(full_data["Sex"] == "male") &(y == 0)] 
male_data_surv=full_data[(full_data["Sex"] == "male") & (y == 1)]  
bins=np.arange(0,full_data['Age'].max()+10,10)
plt.hist(data=male_data_nsurv,x='Age',bins=bins,color="red")
plt.hist(data=male_data_surv,x='Age',bins=bins,color="blue")
plt.xlabel("males survival rate with respect to age ")

Text(0.5, 25.722222222222214, 'males survival rate with respect to age ')

# Second Prediction Rule – Add Age Condition for Males

In [17]:
def predictions_2(data):
    predictions = []
    for i,passenger in data.iterrows():
        if (passenger['Sex']=='female') or (passenger['Sex']=='male'and passenger['Age']<10):
            
            predictions.append(1)
        else:
            predictions.append(0)
    
    return pd.Series(predictions)
predictions = predictions_2(x)

In [18]:
print(accuracy_score(y, predictions))

0.7934904601571269


# Visualizing Survival of Males < 18

In [19]:
male_data_nsurv=full_data[(full_data["Sex"] == "male") & (full_data["Survived"] == 0) & (full_data["Age"] < 18)] # red
male_data_surv=full_data[(full_data["Sex"] == "male") & (full_data["Survived"] == 1) & (full_data["Age"] < 18)]  # blue
bins=np.arange(0,25,10)
plt.hist(data=male_data_nsurv,x='Age',bins=bins,color="red")
plt.hist(data=male_data_surv,x='Age',bins=bins,color="blue")
plt.xlabel("males survival rate with respect to age < 18 ")

Text(0.5, 25.722222222222214, 'males survival rate with respect to age < 18 ')

# Female Survival vs. Parch Feature

In [20]:
female_data_parch=full_data[(full_data["Sex"] == "female") & (full_data["Parch"] == 1 ) ]
sns.countplot(data=female_data_parch,x='Survived')
plt.xlabel("number of females passengers who survived and not survived with respect to the number of parch = 1")

Text(0.5, 25.722222222222214, 'number of females passengers who survived and not survived with respect to the number of parch = 1')

In [21]:
female_data_parch=full_data[(full_data["Sex"] == "female") & (full_data["Parch"] == 2 ) ]
sns.countplot(data=female_data_parch,x='Survived')
plt.xlabel("number of females passengers who survived and not survived with respect to the number of parch =2")

Text(0.5, 25.722222222222214, 'number of females passengers who survived and not survived with respect to the number of parch =2')

In [22]:
female_data_parch=full_data[(full_data["Sex"] == "female") & (full_data["Parch"] == 3 ) ]
sns.countplot(data=female_data_parch,x='Survived')
plt.xlabel("number of females who survived and not with respect to the number of parch =3")

Text(0.5, 25.722222222222214, 'number of females who survived and not with respect to the number of parch =3')

In [23]:
female_data_parch=full_data[(full_data["Sex"] == "female") & (full_data["Parch"] == 4 ) ]
sns.countplot(data=female_data_parch,x='Survived')
plt.xlabel("number of females passengers who survived and not survived with respect to the number of parch =4")

Text(0.5, 25.722222222222214, 'number of females passengers who survived and not survived with respect to the number of parch =4')

In [24]:
female_data_parch=full_data[(full_data["Sex"] == "female") & (full_data["Parch"] >= 4) ]
sns.countplot(data=female_data_parch,x='Survived')
plt.xlabel("number of females passengers who survived and not survived with respect to the number of parch >= 4")

Text(0.5, 25.722222222222214, 'number of females passengers who survived and not survived with respect to the number of parch >= 4')

In [25]:
female_data_parch=full_data[(full_data["Sex"] == "female") & (full_data["Parch"] < 4 ) ]
sns.countplot(data=female_data_parch,x='Survived')
plt.xlabel("number of females passengers who survived and not survived with respect to the number of parch <4")

Text(0.5, 25.722222222222214, 'number of females passengers who survived and not survived with respect to the number of parch <4')

# Male Survival vs. Passenger Class

In [26]:
male_data_class=full_data[(full_data["Sex"] == "male") & (full_data["Pclass"] == 1)]
sns.countplot(data=male_data_class,x='Survived')
plt.xlabel("number of males passengers who survived and not survived with respect they are all in Pclass (1)")

Text(0.5, 25.722222222222214, 'number of males passengers who survived and not survived with respect they are all in Pclass (1)')

In [27]:
male_data_class=full_data[(full_data["Sex"] == "male") & (full_data["Pclass"] == 2)]
sns.countplot(data=male_data_class,x='Survived')
plt.xlabel("number of males passengers who survived and not survived with respect they are all in Pclass (2)")

Text(0.5, 25.722222222222214, 'number of males passengers who survived and not survived with respect they are all in Pclass (2)')

In [28]:
male_data_class=full_data[(full_data["Sex"] == "male") & (full_data["Pclass"] == 3)]
sns.countplot(data=male_data_class,x='Survived')
plt.xlabel("number of males passengers who survived and not survived with respect they are all in Pclass (3)")

Text(0.5, 25.722222222222214, 'number of males passengers who survived and not survived with respect they are all in Pclass (3)')

# Male Survival vs. Embarked Port (Incorrectly Used Data)

In [29]:
embarked_class=full_data[(full_data["Sex"] == "male") & (full_data["Embarked"] == 'Q')]
sns.countplot(data=male_data_class,x='Survived')
plt.xlabel("number of males passengers who survived and not survived with respect they are all in Pclass (3)")

Text(0.5, 25.722222222222214, 'number of males passengers who survived and not survived with respect they are all in Pclass (3)')

# Male Survival with SibSp ≤ 3

In [30]:
embarked_class=full_data[(full_data["Sex"] == "male") & (full_data["Embarked"] == 'S')]
sns.countplot(data=male_data_class,x='Survived')
plt.xlabel("number of males passengers who survived and not survived with respect they are all in Pclass (3)")

Text(0.5, 25.722222222222214, 'number of males passengers who survived and not survived with respect they are all in Pclass (3)')

In [31]:
embarked_class=full_data[(full_data["Sex"] == "male") & (full_data["Embarked"] == 'C')]
sns.countplot(data=male_data_class,x='Survived')
plt.xlabel("number of males passengers who survived and not survived with respect they are all in Pclass (3)")

Text(0.5, 25.722222222222214, 'number of males passengers who survived and not survived with respect they are all in Pclass (3)')

In [32]:
embarked_class=full_data[(full_data["Sex"] == "male") & (full_data["Embarked"] == '0')]
sns.countplot(data=male_data_class,x='Survived')
plt.xlabel("number of males passengers who survived and not survived with respect they are all in Pclass (3)")

Text(0.5, 25.722222222222214, 'number of males passengers who survived and not survived with respect they are all in Pclass (3)')

In [33]:
embarked_class=full_data[(full_data["Sex"] == "male") & (full_data["SibSp"] <= 3)]
sns.countplot(data=male_data_class,x='Survived')
plt.xlabel("number of males passengers who survived and not survived with respect they are all in Pclass (3)")

Text(0.5, 25.722222222222214, 'number of males passengers who survived and not survived with respect they are all in Pclass (3)')

# Visualizing Age Distribution for Males in Pclass = 1

In [34]:
male_data_class_n=full_data[(full_data["Sex"] == "male") & (full_data["Pclass"] == 1) & (full_data["Survived"] == 0)] #red
male_data_class_s=full_data[(full_data["Sex"] == "male") & (full_data["Pclass"] == 1) & (full_data["Survived"] == 1)] #blue
bins=np.arange(0,full_data['Age'].max()+10,10)
plt.hist(data=male_data_class_n,x='Age',bins=bins,color="red")
plt.hist(data=male_data_class_s,x='Age',bins=bins,color="blue")
plt.xlabel("males survival rate with respect to age and class(1)")

Text(0.5, 25.722222222222214, 'males survival rate with respect to age and class(1)')

# Third Prediction Rule – Advanced Conditions

In [35]:
def predictions_3(data):
    predictions = []
    for i, passenger in data.iterrows():
        if (passenger['Sex']=='female' and passenger['Parch']<4) or(passenger['Sex']=='male' and passenger['Age']<10) or(passenger['Sex']=='male'and (30<passenger['Age']<40)and passenger['Pclass']==1):
            predictions.append(1)
        else:
            predictions.append(0)
    
    return pd.Series(predictions)
predictions = predictions_3(x)

In [36]:
print(accuracy_score(y, predictions))

0.8013468013468014


# description 

# Titanic Survival Prediction: Exploratory Data Analysis & Rule-Based Modeling
#### This project analyzes the Titanic passenger dataset to identify key survival patterns and implement rule-based prediction models. It is an exploratory and educational project that applies data preprocessing, visualization, and basic algorithmic logic to understand how various demographic and travel-related factors influenced passenger survival during the Titanic disaster.

## Project Objectives
#### Perform exploratory data analysis (EDA) using visualizations to explore relationships between features like sex, age, class, and survival.

#### Clean and transform the data using pandas and one-hot encoding.

#### Develop and evaluate simple rule-based prediction models based on observed patterns in the dataset.

#### Measure model performance using accuracy score from sklearn.metrics.

## Key Features
### Data Preprocessing:

#### Removed non-informative features such as Name, Ticket, Cabin, and PassengerId.

#### One-hot encoded categorical variables.

#### Filled missing values with appropriate default values (e.g., 0.0).

### Visualization:

#### Plotted survival distributions across sex, age groups, number of parents/children aboard (Parch), class (Pclass), and embarkation point (Embarked).

#### Used matplotlib and seaborn to create insightful visual comparisons.

### Modeling:

#### Implemented 3 simple rule-based models to predict survival:

#### Model 1: All females survived; all males did not.

#### Model 2: All females and males under 10 years old survived.

#### Model 3: Includes females with fewer than 4 parents/children, males under 10, and males aged 30–40 in 1st class.

### Evaluation:

#### Achieved accuracy improvements with each model iteration:

#### Model 1: ~78.7%

#### Model 2: ~79.3%

#### Model 3: ~80.1%

## Dataset
#### Source: titanic_data.csv

#### Columns used: Survived, Sex, Age, Pclass, SibSp, Parch, Embarked

## Technologies Used
#### Python 3.x

#### Pandas, NumPy for data manipulation

#### Seaborn, Matplotlib for visualization

#### Scikit-learn for evaluation metrics

## How to Run
### Install required libraries:

#### bash
#### Copy
#### Edit
#### pip install pandas numpy matplotlib seaborn scikit-learn
#### Ensure the dataset path is correct:

#### python
#### Copy
#### Edit
#### full_data = pd.read_csv("path/to/titanic_data.csv")
#### Run the script in a Jupyter Notebook environment for full visualization support.

## Insights
#### This project demonstrates that even simple rule-based logic can yield surprisingly competitive results on binary classification problems, especially when guided by exploratory visual analysis. It also lays the groundwork for more complex machine learning models.