In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px 
from sklearn.impute import SimpleImputer
import os

# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Load the training split (titanic_df) and the test split (test_df).
titanic_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [3]:
# Binary-encode Sex on both splits: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
for frame in (titanic_df, test_df):
    frame['Sex_bin'] = frame['Sex'].map(sex_codes)

In [23]:
# Fill missing embarkation ports with the most frequent port of the TRAINING
# split, and apply that same fill value to the test split.
#
# The port labels are deliberately kept as the original 'S' / 'C' / 'Q' strings:
# the one-hot encoding cell later in this notebook is fitted on this column and
# the feature lists reference 'Embarked_C', 'Embarked_Q' and 'Embarked_S'.
# Mapping the labels to integers here would make the encoder emit
# Embarked_1/2/3 instead, and would leave the (unmapped) test split with
# categories the encoder has never seen.
embarked_mode = titanic_df['Embarked'].mode()[0]
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(embarked_mode)
test_df['Embarked'] = test_df['Embarked'].fillna(embarked_mode)

0

In [26]:
# name_bin = 1 when the passenger's Name contains an opening parenthesis,
# 0 otherwise (missing names count as 0 via na=False).
# The pattern is a raw string so that the regex escape `\(` reaches the regex
# engine unchanged; in a normal string literal '\(' is an invalid escape
# sequence and triggers a warning on recent Python versions.
paren_pattern = r'\('
titanic_df['name_bin'] = titanic_df['Name'].str.contains(paren_pattern, na=False).astype(int)
test_df['name_bin'] = test_df['Name'].str.contains(paren_pattern, na=False).astype(int)

# Impute missing Age

In [27]:
# Median-impute Age: the median is learned from the training split only and
# then reused to fill the test split.
imputer = SimpleImputer(strategy='median')
imputer.fit(titanic_df[['Age']])
titanic_df['Age'] = imputer.transform(titanic_df[['Age']])
test_df['Age'] = imputer.transform(test_df[['Age']])

In [28]:
# cabin_bin flags a MISSING cabin: 1 when Cabin is NaN, 0 when a cabin is recorded.
for frame in (titanic_df, test_df):
    cabin_missing = frame['Cabin'].isna()
    frame['cabin_bin'] = cabin_missing.astype(int)

In [30]:
# Deck is the first character of the Cabin string; passengers without a
# recorded cabin get the placeholder 'Unknown'.
for frame in (titanic_df, test_df):
    deck_letter = frame['Cabin'].str[0]
    frame['Deck'] = deck_letter.fillna('Unknown')

In [31]:
# Name of the label column and the candidate feature columns at this stage.
target_col = 'Survived'
input_cols = [
    'PassengerId',
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Ticket',
    'Fare',
    'Embarked',
    'Sex_bin',
    'name_bin',
    'cabin_bin',
    'Deck',
]

In [32]:
# Split the candidate features by dtype: int64/float64 -> numeric, object -> categorical.
candidate_features = titanic_df[input_cols]
numeric_cols = candidate_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = candidate_features.select_dtypes('object').columns.tolist()

# Dealing with the Fare column

In [34]:
# Median-impute Fare: learn the median on the training split and apply it to
# BOTH splits. Previously the fit_transform result for the training split was
# discarded, so only the test split was actually imputed.
imputer = SimpleImputer(strategy='median')
titanic_df['Fare'] = imputer.fit_transform(titanic_df[['Fare']])
test_df['Fare'] = imputer.transform(test_df[['Fare']])

In [39]:
# Replace Fare with log(1 + Fare) on both splits.
# Note: this overwrites the column, so re-running the cell applies the log again.
for frame in (titanic_df, test_df):
    frame['Fare'] = np.log1p(frame['Fare'])

# SibSp & Parch

In [45]:
# is_alone = 1 when the passenger has neither siblings/spouses (SibSp == 0)
# nor parents/children (Parch == 0) aboard, else 0.
for frame in (titanic_df, test_df):
    no_relatives = (frame['SibSp'] == 0) & (frame['Parch'] == 0)
    frame['is_alone'] = no_relatives.astype(int)

In [46]:
# SibSp and Parch are summarised by is_alone, so remove them from both splits.
family_cols = ['SibSp', 'Parch']
titanic_df = titanic_df.drop(columns=family_cols)
test_df = test_df.drop(columns=family_cols)

In [47]:
# Feature list after SibSp/Parch were replaced by is_alone.
input_cols = ['PassengerId', 'Pclass', 'Age', 'Ticket', 'Fare',
              'Embarked', 'Sex_bin', 'name_bin', 'cabin_bin', 'Deck', 'is_alone']
# Label column. Named `target_col` (singular), matching every other cell that
# reads it; this cell previously assigned a differently named `target_cols`.
target_col = 'Survived'

In [48]:
# Recompute the numeric / categorical split for the updated feature list.
candidate_features = titanic_df[input_cols]
numeric_cols = candidate_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = candidate_features.select_dtypes('object').columns.tolist()

In [50]:
# Display the feature names that were classified as numeric (int64/float64).
numeric_cols

['PassengerId',
 'Pclass',
 'Age',
 'Fare',
 'Sex_bin',
 'name_bin',
 'cabin_bin',
 'is_alone']

# Categorical Columns

In [234]:
# Mean of Survived (i.e. survival rate) for each embarkation port.
(
    titanic_df
    .groupby('Embarked')['Survived']
    .mean()
)

Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64

In [239]:
# Mean of Survived (i.e. survival rate) for each deck letter, including 'Unknown'.
(
    titanic_df
    .groupby('Deck')['Survived']
    .mean()
)

Deck
A          0.466667
B          0.733333
C          0.593220
D          0.757576
E          0.750000
F          0.615385
G          0.500000
T          0.000000
Unknown    0.299854
Name: Survived, dtype: float64

In [51]:
from sklearn.preprocessing import OneHotEncoder

In [52]:
# Fit a dense one-hot encoder on the training Embarked column; categories not
# seen during fit are encoded as all zeros (handle_unknown='ignore').
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
embarked_train = titanic_df[['Embarked']]
encoder.fit(embarked_train)
# Names of the generated indicator columns, shown as the cell output.
encoded_cols = encoder.get_feature_names_out(['Embarked'])
encoded_cols

array(['Embarked_C', 'Embarked_Q', 'Embarked_S'], dtype=object)

In [53]:
# Add the Embarked indicator columns to both splits using the train-fitted encoder.
for frame in (titanic_df, test_df):
    frame[encoded_cols] = encoder.transform(frame[['Embarked']])

In [54]:
# The raw Embarked column is now redundant with its indicator columns.
raw_port_col = ['Embarked']
titanic_df = titanic_df.drop(columns=raw_port_col)
test_df = test_df.drop(columns=raw_port_col)

In [56]:
# Share of training passengers on each deck (proportions, not raw counts).
deck_series = titanic_df['Deck']
deck_series.value_counts(normalize=True)

Deck
Unknown    0.771044
C          0.066218
B          0.052750
D          0.037037
E          0.035915
A          0.016835
F          0.014590
G          0.004489
T          0.001122
Name: proportion, dtype: float64

In [248]:
# Survival rate per deck letter (same summary as computed earlier in the notebook).
(
    titanic_df
    .groupby('Deck')['Survived']
    .mean()
)

Deck
A          0.466667
B          0.733333
C          0.593220
D          0.757576
E          0.750000
F          0.615385
G          0.500000
T          0.000000
Unknown    0.299854
Name: Survived, dtype: float64

In [57]:
# Fit a fresh dense one-hot encoder on the training Deck column; unseen deck
# letters are encoded as all zeros (handle_unknown='ignore').
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
deck_train = titanic_df[['Deck']]
encoder.fit(deck_train)
# Names of the generated indicator columns, shown as the cell output.
encoded_cols = encoder.get_feature_names_out(['Deck'])
encoded_cols

array(['Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F',
       'Deck_G', 'Deck_T', 'Deck_Unknown'], dtype=object)

In [251]:
# Add the Deck indicator columns to both splits using the train-fitted encoder.
for frame in (titanic_df, test_df):
    frame[encoded_cols] = encoder.transform(frame[['Deck']])

In [252]:
# The raw Deck column is now redundant with its indicator columns.
raw_deck_col = ['Deck']
titanic_df = titanic_df.drop(columns=raw_deck_col)
test_df = test_df.drop(columns=raw_deck_col)

In [253]:
# Preview the first five rows of the test split after encoding.
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Age,Ticket,Fare,Sex_bin,name_bin,cabin_bin,is_alone,Embarked_C,...,Embarked_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown
0,892,3,34.5,330911,2.178064,1,0,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,893,3,47.0,363272,2.079442,0,1,1,0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,894,2,62.0,240276,2.369075,1,0,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,895,3,27.0,315154,2.268252,1,0,1,1,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,896,3,22.0,3101298,2.586824,0,1,1,0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [59]:
# Feature list using the Embarked indicator columns (Pclass still as a raw integer).
target_col = 'Survived'
input_cols = [
    'Pclass',
    'Age',
    'Fare',
    'Sex_bin',
    'name_bin',
    'cabin_bin',
    'is_alone',
    'Embarked_C',
    'Embarked_Q',
    'Embarked_S',
]

In [58]:
# Preview the first five rows of the training split.
titanic_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Sex_bin,name_bin,cabin_bin,Deck,is_alone,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,2.110213,,1,0,1,Unknown,0,0.0,0.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,4.280593,C85,0,1,0,C,0,1.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,2.188856,,0,0,1,Unknown,1,0.0,0.0,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,3.990834,C123,0,1,0,C,0,0.0,0.0,1.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,373450,2.202765,,1,0,1,Unknown,1,0.0,0.0,1.0


In [60]:
# Fit a fresh dense one-hot encoder on the training Pclass column.
# The new encoder must be bound to `encoder` (the name the following cells
# use); binding it to a differently spelled name left the new object unused and
# silently re-fitted whatever encoder an earlier cell had created, which fails
# outright on a fresh kernel if that earlier cell has not been run.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(titanic_df[['Pclass']])
# Names of the generated indicator columns, shown as the cell output.
encoded_cols = encoder.get_feature_names_out(['Pclass'])
encoded_cols

array(['Pclass_1', 'Pclass_2', 'Pclass_3'], dtype=object)

In [61]:
# Add the Pclass indicator columns to the training split.
pclass_train_onehot = encoder.transform(titanic_df[['Pclass']])
titanic_df[encoded_cols] = pclass_train_onehot

In [62]:
# Add the Pclass indicator columns to the test split with the same fitted encoder.
pclass_test_onehot = encoder.transform(test_df[['Pclass']])
test_df[encoded_cols] = pclass_test_onehot

In [63]:
# Preview the first five rows of the test split with the Pclass indicators added.
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Sex_bin,name_bin,cabin_bin,Deck,is_alone,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,892,3,"Kelly, Mr. James",male,34.5,330911,2.178064,,1,0,1,Unknown,1,0.0,1.0,0.0,0.0,0.0,1.0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,363272,2.079442,,0,1,1,Unknown,0,0.0,0.0,1.0,0.0,0.0,1.0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,240276,2.369075,,1,0,1,Unknown,1,0.0,1.0,0.0,0.0,1.0,0.0
3,895,3,"Wirz, Mr. Albert",male,27.0,315154,2.268252,,1,0,1,Unknown,1,0.0,0.0,1.0,0.0,0.0,1.0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,3101298,2.586824,,0,1,1,Unknown,0,0.0,0.0,1.0,0.0,0.0,1.0


In [65]:
# List the training split's current columns to pick the final feature set from.
titanic_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Ticket',
       'Fare', 'Cabin', 'Sex_bin', 'name_bin', 'cabin_bin', 'Deck', 'is_alone',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Pclass_1', 'Pclass_2',
       'Pclass_3'],
      dtype='object')

In [66]:
# Final feature list fed to the model: scaled numerics, binary flags and the
# Embarked / Pclass indicator columns.
target_col = 'Survived'
input_cols = [
    'Age',
    'Fare',
    'Sex_bin',
    'name_bin',
    'cabin_bin',
    'is_alone',
    'Embarked_C',
    'Embarked_Q',
    'Embarked_S',
    'Pclass_1',
    'Pclass_2',
    'Pclass_3',
]

In [68]:
# Pairwise correlation between the core features and the Survived label.
corr_cols = ['Age', 'Fare', 'Sex_bin', 'name_bin', 'cabin_bin', 'is_alone', 'Survived']
titanic_df[corr_cols].corr()

Unnamed: 0,Age,Fare,Sex_bin,name_bin,cabin_bin,is_alone,Survived
Age,1.0,0.110964,0.081163,0.164317,-0.240314,0.171647,-0.06491
Fare,0.110964,1.0,-0.263276,0.20632,-0.557192,-0.47841,0.329862
Sex_bin,0.081163,-0.263276,1.0,-0.503102,0.140391,0.303646,-0.543351
name_bin,0.164317,0.20632,-0.503102,1.0,-0.140154,-0.332323,0.346496
cabin_bin,-0.240314,-0.557192,0.140391,-0.140154,1.0,0.158029,-0.316912
is_alone,0.171647,-0.47841,0.303646,-0.332323,0.158029,1.0,-0.203367
Survived,-0.06491,0.329862,-0.543351,0.346496,-0.316912,-0.203367,1.0


In [69]:
# Skewness of the core features and of the Survived label.
skew_cols = ['Age', 'Fare', 'Sex_bin', 'name_bin', 'cabin_bin', 'is_alone', 'Survived']
titanic_df[skew_cols].skew()

Age          0.510245
Fare         0.394928
Sex_bin     -0.618921
name_bin     1.852971
cabin_bin   -1.292367
is_alone    -0.420431
Survived     0.478523
dtype: float64

In [71]:
# Histogram of cabin_bin (1 = cabin missing), coloured by survival outcome.
px.histogram(
    data_frame=titanic_df,
    x='cabin_bin',
    color='Survived',
)

In [77]:
from sklearn.preprocessing import StandardScaler

In [78]:
# Learn the standardisation parameters for Age and Fare from the training split only.
scaler = StandardScaler()
train_numeric = titanic_df[['Age', 'Fare']]
scaler.fit(train_numeric)

In [86]:
# Correlation of Age and Fare with the Survived label, checked before scaling is applied.
check_cols = ['Age', 'Fare', 'Survived']
titanic_df[check_cols].corr()

Unnamed: 0,Age,Fare,Survived
Age,1.0,0.110964,-0.06491
Fare,0.110964,1.0,0.329862
Survived,-0.06491,0.329862,1.0


In [87]:
# Standardise Age and Fare on both splits with the train-fitted scaler.
# Note: the columns are overwritten, so re-running this cell scales them again.
scaled_cols = ['Age', 'Fare']
for frame in (titanic_df, test_df):
    frame[scaled_cols] = scaler.transform(frame[scaled_cols])

In [91]:
# Skewness of the scaled Age and Fare columns in the test split.
scaled_test = test_df[['Age', 'Fare']]
scaled_test.skew()

Age     0.618776
Fare    0.864955
dtype: float64

# Train Model

In [92]:
# Keep the test PassengerId values for the submission file and display them.
test_id = test_df.loc[:, 'PassengerId']
test_id

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [93]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report

In [94]:
# Fit a logistic regression on the full training split.
train_features = titanic_df[input_cols]
train_labels = titanic_df[target_col]
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(train_features, train_labels)

In [None]:
# Evaluate the fitted model on the data it was trained on.
# These are in-sample metrics and therefore optimistic; the cross-validation
# score computed in the next cell is the better estimate of generalisation.
# Both metric functions take (y_true, y_pred) -- true labels first, then the
# model's predictions -- so predictions are computed explicitly here.
train_true = titanic_df[target_col]
train_pred = model.predict(titanic_df[input_cols])
print("Accuracy:", accuracy_score(train_true, train_pred))
print("\nClassification Report:\n", classification_report(train_true, train_pred))

In [95]:
# 5-fold cross-validation on the training split; report the mean fold score.
fold_scores = cross_val_score(model, titanic_df[input_cols], titanic_df[target_col], cv=5)
cv_score = fold_scores.mean()
print("Cross-validation accuracy:", cv_score)

Cross-validation accuracy: 0.8080848659845584


In [97]:
# Predict survival for every passenger in the test split and display the labels.
test_features = test_df[input_cols]
test_predict = model.predict(test_features)
test_predict

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [98]:
# Assemble the two-column submission (PassengerId, Survived) and write it
# without the index column.
submission_columns = {'PassengerId': test_id, 'Survived': test_predict}
submission = pd.DataFrame(submission_columns)
submission.to_csv('/kaggle/working/submission3.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
