In [86]:
import pandas as pd


train_data = pd.read_csv(r"D:\Sharif University of Tech\Data\Kaggle\Data set\Titanic ML from disaster\train.csv")
test_data = pd.read_csv(r"D:\Sharif University of Tech\Data\Kaggle\Data set\Titanic ML from disaster\test.csv")

In [87]:
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Let's handel the null data :

In [88]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

For age it seems logical to use the median age :

In [89]:
from sklearn.impute import SimpleImputer

imputer_age = SimpleImputer(strategy='median')
train_data['Age'] = imputer_age.fit_transform(train_data[['Age']])
test_data['Age'] = imputer_age.fit_transform(test_data[['Age']])
test_data['Fare'] = imputer_age.fit_transform(test_data[['Fare']])

In [90]:
imputer_embarked = SimpleImputer(strategy='most_frequent')
train_data['Embarked'] = imputer_embarked.fit_transform(train_data[['Embarked']]).ravel()
test_data['Embarked'] = imputer_embarked.fit_transform(test_data[['Embarked']]).ravel()

In [91]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

seeing as how we don't have most of the cabin data (usefull as it could be), we simply cannot use it.

In [92]:
train_data.drop('Cabin', axis=1, inplace=True)
test_data.drop('Cabin', axis=1, inplace=True)

Now to encode the non numerical data :

In [93]:
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})

In [94]:
embarked_dummies = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
train_data = pd.concat([train_data, embarked_dummies], axis=1)
train_data.drop('Embarked', axis=1, inplace=True)


embarked_dummies = pd.get_dummies(test_data['Embarked'], prefix='Embarked')
test_data = pd.concat([test_data, embarked_dummies], axis=1)
test_data.drop('Embarked', axis=1, inplace=True)

In [95]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_data['Fare'] = scaler.fit_transform(train_data['Fare'].values.reshape(-1, 1))
test_data['Fare'] = scaler.fit_transform(test_data['Fare'].values.reshape(-1, 1))

In [96]:
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked_C,Embarked_Q,Embarked_S
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,-0.497413,False,True,False
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,-0.512278,False,False,True
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,-0.464100,False,True,False
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,-0.482475,False,False,True
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,-0.417492,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",0,27.0,0,0,A.5. 3236,-0.493455,False,False,True
414,1306,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,1.314435,True,False,False
415,1307,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,SOTON/O.Q. 3101262,-0.507796,False,False,True
416,1308,3,"Ware, Mr. Frederick",0,27.0,0,0,359309,-0.493455,False,False,True


In [97]:
train_data = train_data.drop(columns=["Name", "Ticket"])
test_data = test_data.drop(columns=["Name", "Ticket"])

Let's start implementing a model :

In [98]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



X = train_data.drop('Survived', axis=1)
y = train_data['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [99]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [100]:
predictions = model.predict(X_val)

In [101]:
accuracy = accuracy_score(y_val, predictions)

In [102]:
accuracy

0.8212290502793296

In [103]:
test_data.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64

In [104]:
test_predictions = model.predict(test_data)

submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': test_predictions
})

submission.to_csv(r'D:\Sharif University of Tech\Data\Kaggle\Data set\Titanic ML from disaster\titanic_submission.csv', index=False)