## Task 1: Titanic Classification
#### Making a system which tells whether the person will be save from sinking, and what factors were most likely lead to success-socio-economic status, age, gender and more.


In [1]:
import pandas as pd

# Loadign the dataset
data = pd.read_csv("titanic.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Listing all columns
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [3]:
missing_values_count = data.isnull().sum()
missing_values_count

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Drop columns that won't be used
data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

from sklearn.impute import SimpleImputer

# Filling missing values
imputer = SimpleImputer(strategy='mean')
data['Age'] = imputer.fit_transform(data[['Age']])
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

missing_values = data.isnull().any()
missing_values

PassengerId    False
Survived       False
Pclass         False
Sex            False
Age            False
SibSp          False
Parch          False
Fare           False
Embarked       False
dtype: bool

In [5]:
# Checking if columns are numerical or categorical
dtypes = data.dtypes
dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
dtype: object

### Encoding categorical features

In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])
data['Embarked'] = label_encoder.fit_transform(data['Embarked'])

dtypes = data.dtypes
dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex              int32
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked         int32
dtype: object

### Creating a model

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Defining features and target variable
X = data.drop('Survived', axis=1)
y = data['Survived']

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizign features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Training the model

In [8]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# making prediction on test data
y_pred = model.predict(X_test)

### Evaluating the model

In [9]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

print('Confusion Matrix:')
print(conf_matrix)

Accuracy: 0.8100558659217877
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

Confusion Matrix:
[[90 15]
 [19 55]]


### Features affecting the chances of survival


In [10]:
importance = model.coef_[0]
feature_names = X.columns

# Create a DataFrame to view feature importance
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
}).sort_values(by='Importance', key=abs, ascending=False)

feature_importance_df

Unnamed: 0,Feature,Importance
2,Sex,-1.287758
1,Pclass,-0.778807
3,Age,-0.404354
4,SibSp,-0.341565
7,Embarked,-0.174754
6,Fare,0.126322
5,Parch,-0.109986
0,PassengerId,0.093784


### Converting the coefficiants to percentage

In [11]:
import numpy as np
feature_importance_df['Odds Ratio'] = np.exp(feature_importance_df['Importance'])

# Converting to percentage change in odds
feature_importance_df['Chances of survival in percentage'] = (feature_importance_df['Odds Ratio'] - 1) * 100
feature_importance_df[['Feature', 'Chances of survival in percentage']]

Unnamed: 0,Feature,Chances of survival in percentage
2,Sex,-72.411124
1,Pclass,-54.104688
3,Age,-33.259187
4,SibSp,-28.934263
7,Embarked,-16.03367
6,Fare,13.464696
5,Parch,-10.415306
0,PassengerId,9.832244


### So according to our model, the feature that is affecting chances of survival the most is "Sex"