# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
#%matplotlib.inline

# Loading the dataset

In this section, we will load the Titanic dataset into the notebook. The dataset is stored in two CSV files, one for the training data and one for the test data. We will use the pandas library to load the CSV files into dataframes that we can manipulate and explore. Once loaded, we can begin to explore the data and prepare it for machine learning modeling

In [27]:
train_data=pd.read_csv('/kaggle/input/titanic/train.csv')
test_data=pd.read_csv("/kaggle/input/titanic/test.csv")

# Data Exploration and Preparation

In [28]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [29]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [30]:
test_data.tail()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
417,1309,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [31]:
x=train_data.iloc[:,2:-1]
y=train_data.iloc[:,1].values

In [7]:
x=x.drop(["Name","Ticket","Fare","Cabin"],axis=1)
x.values

array([[3, 'male', 22.0, 1, 0],
       [1, 'female', 38.0, 1, 0],
       [3, 'female', 26.0, 0, 0],
       ...,
       [3, 'female', nan, 1, 2],
       [1, 'male', 26.0, 0, 0],
       [3, 'male', 32.0, 0, 0]], dtype=object)

In [8]:
x.isnull().sum()

Pclass      0
Sex         0
Age       177
SibSp       0
Parch       0
dtype: int64

In [9]:
x["Age"].fillna(x["Age"].mean(),inplace=True)
x.isnull().sum()

Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
dtype: int64

In [11]:
from sklearn.preprocessing import OrdinalEncoder
encoder=OrdinalEncoder()
x=encoder.fit_transform(x)

# split train data

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

In [13]:
x_train

array([[ 2.,  1., 37.,  0.,  0.],
       [ 1.,  0., 35.,  0.,  0.],
       [ 2.,  0., 39.,  1.,  0.],
       ...,
       [ 0.,  1., 39.,  0.,  0.],
       [ 2.,  1., 16.,  1.,  0.],
       [ 1.,  1., 49.,  0.,  0.]])

# Modeling

In [14]:
from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier()

In [15]:
model.fit(x_train,y_train)
y_pred=model.predict(x_test)

In [16]:
from sklearn.metrics import accuracy_score 
accuracy=accuracy_score(y_pred,y_test)
accuracy

0.8324022346368715

In [17]:
model.fit(x_test,y_test)


DecisionTreeClassifier()

# Test data preparation

In [32]:
test=test_data.iloc[:,1:-1]


In [33]:
test=test.drop(["Name","Ticket","Fare","Cabin"],axis=1)
test.values

array([[3, 'male', 34.5, 0, 0],
       [3, 'female', 47.0, 1, 0],
       [2, 'male', 62.0, 0, 0],
       ...,
       [3, 'male', 38.5, 0, 0],
       [3, 'male', nan, 0, 0],
       [3, 'male', nan, 1, 1]], dtype=object)

In [34]:
test["Age"].fillna(test["Age"].mean(),inplace=True)
test.isnull().sum()

Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
dtype: int64

In [35]:
test.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch
count,418.0,418.0,418.0,418.0
mean,2.26555,30.27259,0.447368,0.392344
std,0.841838,12.634534,0.89676,0.981429
min,1.0,0.17,0.0,0.0
25%,1.0,23.0,0.0,0.0
50%,3.0,30.27259,0.0,0.0
75%,3.0,35.75,1.0,0.0
max,3.0,76.0,8.0,9.0


In [36]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  418 non-null    int64  
 1   Sex     418 non-null    object 
 2   Age     418 non-null    float64
 3   SibSp   418 non-null    int64  
 4   Parch   418 non-null    int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 16.5+ KB


In [37]:
test = encoder.fit_transform(test)

# Predictions and Submission

In [38]:
result=model.predict(test)
result

array([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [39]:
result = pd.DataFrame(result, columns=['Survived'])

In [40]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': result.Survived})
output.to_csv('submission.csv', index=False)

In [41]:
a=pd.read_csv('/kaggle/working/submission.csv')
a

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
