# Decision Tree Classifier

In [1]:
import pandas as pd

In [4]:
df = pd.read_csv("C:/Users/Dell/OneDrive/Documents/titanic.csv")

In [5]:
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Removing Unwanted Field

In [8]:
inputs = df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis="columns")

In [9]:
inputs

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.2500
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.9250
3,1,1,female,35.0,53.1000
4,0,3,male,35.0,8.0500
...,...,...,...,...,...
886,0,2,male,27.0,13.0000
887,1,1,female,19.0,30.0000
888,0,3,female,,23.4500
889,1,1,male,26.0,30.0000


# Check for missing Data

In [10]:
inputs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   Fare      891 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 34.9+ KB


In [11]:
inputs.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
dtype: int64

# Data Cleaning 

In [14]:
numeric_columns = inputs.select_dtypes(include=['number']).columns
non_numeric_columns = inputs.select_dtypes(exclude=['number']).columns



In [15]:
from sklearn.impute import SimpleImputer

numeric_imputer = SimpleImputer(strategy='mean')
df_numeric = pd.DataFrame(numeric_imputer.fit_transform(inputs[numeric_columns]), columns=numeric_columns)

non_numeric_imputer = SimpleImputer(strategy='most_frequent')
df_non_numeric = pd.DataFrame(non_numeric_imputer.fit_transform(inputs[non_numeric_columns]), columns=non_numeric_columns)


In [16]:
df_cleaned = pd.concat([df_numeric, df_non_numeric], axis=1)

In [17]:
df_cleaned

Unnamed: 0,Survived,Pclass,Age,Fare,Sex
0,0.0,3.0,22.000000,7.2500,male
1,1.0,1.0,38.000000,71.2833,female
2,1.0,3.0,26.000000,7.9250,female
3,1.0,1.0,35.000000,53.1000,female
4,0.0,3.0,35.000000,8.0500,male
...,...,...,...,...,...
886,0.0,2.0,27.000000,13.0000,male
887,1.0,1.0,19.000000,30.0000,female
888,0.0,3.0,29.699118,23.4500,female
889,1.0,1.0,26.000000,30.0000,male


# Use Label Encoder for encoding non-numeric field

In [19]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

inputs['Sex'] = label_encoder.fit_transform(inputs['Sex'])

# Create target variable for training

In [20]:
X = inputs.drop('Survived', axis=1)
y = inputs['Survived']


In [21]:
X

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,1,22.0,7.2500
1,1,0,38.0,71.2833
2,3,0,26.0,7.9250
3,1,0,35.0,53.1000
4,3,1,35.0,8.0500
...,...,...,...,...
886,2,1,27.0,13.0000
887,1,0,19.0,30.0000
888,3,0,,23.4500
889,1,1,26.0,30.0000


In [22]:
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

# Training the model usind Decision tree classifier

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)


# Predict accuracy of model using test datset

In [24]:
y_pred = clf.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

Accuracy: 0.7723880597014925


In [25]:
clf.score(X,y)

0.8597081930415263

# Predict accuracy using trained dataset

In [26]:
clf.score(X_train, y_train)

0.8972712680577849

In [29]:
# Example input features: Pclass=3, Age=22, Fare=7.25, Sex=1 (male)
clf.predict([[3, 22, 7.25, 1]])



array([1], dtype=int64)

In [30]:
clf.predict([[1,38,71.28,0]])



array([0], dtype=int64)

# checking cross validation for any means of error while trained

In [31]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
scores = cross_val_score(clf, X, y, cv=5)
print(f'Cross-validation scores: {scores}')
print(f'Average cross-validation score: {scores.mean()}')

Cross-validation scores: [0.81005587 0.78089888 0.80898876 0.82022472 0.79775281]
Average cross-validation score: 0.8035842068922229


In [32]:
# Predict on the training data
y_train_pred = clf.predict(X_train)

# Compare predictions with actual values
print(pd.DataFrame({'Actual': y_train, 'Predicted': y_train_pred}))

     Actual  Predicted
445       1          1
650       0          0
172       1          1
450       0          0
314       0          0
..      ...        ...
106       1          0
270       0          0
860       0          0
435       1          1
102       0          0

[623 rows x 2 columns]


# Use confusion metric for finding where model gets confused while predicting

In [None]:
# Confusion matrix
cm = confusion_matrix(y_train, y_train_pred)
print('Confusion Matrix:\n', cm)

# Accuracy
accuracy = accuracy_score(y_train, y_train_pred)
print(f'Accuracy on training data: {accuracy}')

# Visualizing confusion metric

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Assume y_train and y_train_pred are your true labels and predictions, respectively
cm = confusion_matrix(y_train, y_train_pred)

# Create a heatmap to visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()