## Preethi's Analysis

In my portion of the analysis, I will be doing discriminant analysis to try and predict survival from the titanic dataset

In [7]:
import pandas as pd

titanic = pd.read_csv("titanic_augmented.csv")
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,is_alone,ticket_group_size,fare_per_person,age_fare_ratio,cabin_deck,cabin_room_number,booking_reference,service_id,cabin_score,name_word_count
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,0,1,7.25,3.034483,Unknown,,92490,221958,6.134152,4
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,1,71.283,0.533084,C,85.0,15655423,771155,4.18243,7
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,1,1,7.925,3.280757,Unknown,,90218500,231932,9.327285,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,0,2,26.55,0.659134,C,123.0,2493079,465838,8.660639,7
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,1,1,8.05,4.347826,Unknown,,59517148,359178,0.452187,4


## Data Wrangling for LDA and QDA

In [8]:
print("Dataset shape:", titanic.shape)
print("\nColumn data types:")
print(titanic.dtypes)
print("\nMissing values:")
print(titanic.isnull().sum())
print("\nBasic statistics:")
titanic.describe()

Dataset shape: (891, 26)

Column data types:
PassengerId            int64
Survived               int64
Pclass                 int64
Name                  object
Sex                   object
Age                  float64
SibSp                  int64
Parch                  int64
Ticket                object
Fare                 float64
Cabin                 object
Embarked              object
name_length            int64
title                 object
title_group           object
family_size            int64
is_alone               int64
ticket_group_size      int64
fare_per_person      float64
age_fare_ratio       float64
cabin_deck            object
cabin_room_number    float64
booking_reference      int64
service_id             int64
cabin_score          float64
name_word_count        int64
dtype: object

Missing values:
PassengerId            0
Survived               0
Pclass                 0
Name                   0
Sex                    0
Age                  177
SibSp               

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,name_length,family_size,is_alone,ticket_group_size,fare_per_person,age_fare_ratio,cabin_room_number,booking_reference,service_id,cabin_score,name_word_count
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,200.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,26.965208,1.904602,0.602694,1.787879,17.789001,1.572536,50.49,51081180.0,536369.988777,4.956762,4.06734
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,9.281607,1.613459,0.489615,1.361142,21.218127,1.661773,35.39497,28381740.0,261551.630299,2.915177,1.168866
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,12.0,1.0,0.0,1.0,0.0,0.0,2.0,92490.0,102869.0,0.04632,3.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104,20.0,1.0,0.0,1.0,7.7625,0.116026,22.0,28319620.0,299638.0,2.325861,3.0
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,25.0,1.0,1.0,1.0,8.85,1.175795,43.0,51288530.0,535564.0,4.954913,4.0
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0,30.0,2.0,1.0,2.0,24.288,2.543045,77.25,74931310.0,757663.0,7.479345,4.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,82.0,11.0,1.0,7.0,221.779,9.779559,148.0,99975880.0,999684.0,9.997177,14.0


In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Create a copy for wrangling
df = titanic.copy()

In [10]:
df['Age'].fillna(df['Age'].median(), inplace=True)

# Fill Embarked with mode
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Remove Cabin columns as there are too many unknowns and it may not add significant value
df.drop(columns=['Cabin', 'cabin_room_number', 'title'], inplace=True)

print("Missing values after imputation:")
print(df.isnull().sum())

Missing values after imputation:
PassengerId          0
Survived             0
Pclass               0
Name                 0
Sex                  0
Age                  0
SibSp                0
Parch                0
Ticket               0
Fare                 0
Embarked             0
name_length          0
title_group          0
family_size          0
is_alone             0
ticket_group_size    0
fare_per_person      0
age_fare_ratio       0
cabin_deck           0
booking_reference    0
service_id           0
cabin_score          0
name_word_count      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [11]:
# Encode categorical variables
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})

# Drop PassengerId, Name, and Ticket (not useful for prediction)
df.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

# One-hot encode Embarked
df_encoded = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

# One-hot encode Pclass 
df_encoded = pd.get_dummies(df_encoded, columns=['Pclass'], prefix='class', drop_first=True)

# One-hot encode title_group if you want to use it
df_encoded = pd.get_dummies(df_encoded, columns=['title_group'], prefix='title', drop_first=True)

df_encoded = pd.get_dummies(df_encoded, columns=['cabin_deck'], prefix='cabin', drop_first=True)


print("Data shape after encoding:", df.shape)
print("\nColumn data types after encoding:")
print(df.dtypes)

Data shape after encoding: (891, 20)

Column data types after encoding:
Survived               int64
Pclass                 int64
Sex                    int64
Age                  float64
SibSp                  int64
Parch                  int64
Fare                 float64
Embarked              object
name_length            int64
title_group           object
family_size            int64
is_alone               int64
ticket_group_size      int64
fare_per_person      float64
age_fare_ratio       float64
cabin_deck            object
booking_reference      int64
service_id             int64
cabin_score          float64
name_word_count        int64
dtype: object


In [15]:
# Separate features and target
X = df_encoded.drop(columns=['Survived'])
y = df_encoded['Survived']

print("Features shape:", X.shape)
print("Target shape:", y.shape)
print("\nClass distribution:")
print(y.value_counts())

Features shape: (891, 31)
Target shape: (891,)

Class distribution:
Survived
0    549
1    342
Name: count, dtype: int64


In [16]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training set shape:", X_train_scaled.shape)
print("Test set shape:", X_test_scaled.shape)
print("\nFeature names:")
print(X.columns.tolist())

Training set shape: (668, 31)
Test set shape: (223, 31)

Feature names:
['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'name_length', 'family_size', 'is_alone', 'ticket_group_size', 'fare_per_person', 'age_fare_ratio', 'booking_reference', 'service_id', 'cabin_score', 'name_word_count', 'Embarked_Q', 'Embarked_S', 'class_2', 'class_3', 'title_Miss', 'title_Mr', 'title_Mrs', 'title_Other', 'cabin_B', 'cabin_C', 'cabin_D', 'cabin_E', 'cabin_F', 'cabin_G', 'cabin_T', 'cabin_Unknown']


# LDA and QDA Analysis