In [3]:
import pandas as pd

In [4]:
df_train=pd.read_csv("Titanic_train.csv")
df_test=pd.read_csv("Titanic_test.csv")
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
##for the ease of EDA and preprocessing we are comining the test data and train data
##here the test data do not have survival columns
# so we combine the model without that target column (survival column)

# Separate the target variable from the train set
train_target = df_train['Survived']  ##the col survived is saved here
df_train= df_train.drop(columns=['Survived']) ##dropped that col from train data

# Concatenate train and test datasets 
combined_data = pd.concat([df_train, df_test], axis=0, sort=False)  #combined
print("Combined Data Shape:", combined_data.shape)

Combined Data Shape: (1309, 11)


In [6]:
import matplotlib.pyplot as plt 
import seaborn as sns

num_col= [i for i in combined_data.columns if combined_data[i].dtype!='O']  #categorical cols
cat_col=[i for i in combined_data.columns if combined_data[i].dtype=='O'] #numerical cols

num_col,cat_col

(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
 ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'])

## 2. Data Preprocessing:

In [8]:
##filling missing values

combined_data['Age'].fillna(combined_data['Age'].median(),inplace=True) ##filling missing age values with median
combined_data['Fare'].fillna(combined_data['Fare'].median(),inplace=True)##filling tha missed fare value with median


#for cabin col we drop that col because more than 1000 rows have missing values so it cant have good predictive power 
combined_data.drop(columns=['Cabin'], inplace=True)
combined_data.drop(columns=['Embarked'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['Age'].fillna(combined_data['Age'].median(),inplace=True) ##filling missing age values with median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['Fare'].fillna(combined_data['Fare'].median(),inplace=True)##filling tha missed fare value with median


In [9]:
##encoding categorical variables
cat_col=[i for i in combined_data.columns if combined_data[i].dtype=='O']
cat_col

['Name', 'Sex', 'Ticket']

In [10]:
##here name,ticket can be also dropped bcz we are not planning to extract anyfeature from them
combined_data.drop(columns=['Name', 'Ticket','PassengerId'], inplace=True)


In [11]:
##encoding
##Use pd.get_dummies() to encode Sex and Embarked since they have a limited number of unique categories.
combined_data = pd.get_dummies(combined_data, columns=['Sex'], drop_first=True)

##converting the boolean values to int (0/1)
combined_data['Sex_male']=combined_data['Sex_male'].astype(int)
combined_data

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male
0,3,22.0,1,0,7.2500,1
1,1,38.0,1,0,71.2833,0
2,3,26.0,0,0,7.9250,0
3,1,35.0,1,0,53.1000,0
4,3,35.0,0,0,8.0500,1
...,...,...,...,...,...,...
413,3,28.0,0,0,8.0500,1
414,1,39.0,0,0,108.9000,0
415,3,38.5,0,0,7.2500,1
416,3,28.0,0,0,8.0500,1


## 3. Model Building:

In [13]:
##splitting the combined data back to test data and train data
x_train = combined_data[:len(df_train)] ##this helps to split the data by using their length
x_test = combined_data[len(df_train):]

x_train['Survived']=train_target## Adding survived back to training data for model taining

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train['Survived']=train_target## Adding survived back to training data for model taining


In [14]:
##now we use the train data to make model 
##so split the train data to test data and train data then we use that model on the actual test data to get the values

from sklearn.model_selection import train_test_split

y = x_train['Survived'] ##target
x = x_train.drop(columns=['Survived']) ##features
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x, y, test_size=0.1, random_state=42)

##now the x_train is splitted to  (x_train_split, x_test_split, y_train_split, y_test_split)

In [15]:
##creating a model 

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(x_train_split, y_train_split) ##model trained 


## 4. Model Evaluation:

In [17]:
##validation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score,roc_curve
y_pred_split=model.predict(x_test_split) ##predicting the y_test_split using the x_test_split


##comparing the y_pred with y_test_split
print("Accuracy:", accuracy_score(y_test_split, y_pred_split)) 
print("Precision:", precision_score(y_test_split, y_pred_split))
print("recall:", recall_score(y_test_split, y_pred_split))
print("f1:", f1_score(y_test_split, y_pred_split))
print("roc_auc:", roc_auc_score(y_test_split, y_pred_split))
model.feature_names_in_



Accuracy: 0.8444444444444444
Precision: 0.775
recall: 0.8611111111111112
f1: 0.8157894736842105
roc_auc: 0.8472222222222223


array(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male'],
      dtype=object)

In [18]:


y_pred=model.predict(x_test) ##here we are predicting the y_test (i.e survival col) using the x_test using our mode

In [19]:
new_test_data=df_test.copy() 

In [20]:

##printing the test data with our predicted survived column
new_test_data['Predicted survived']=y_pred

In [21]:
import joblib
joblib.dump(model, 'titanic_model.pkl')
print("Model saved as titanic_model.pkl")


Model saved as titanic_model.pkl
