Titanic Survival Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the Titanic CSV and keep only the columns used for modelling
titanic_csv = "/content/drive/MyDrive/Week3/Decision_tree/Titanic-Dataset.csv"
selected_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare']
df = pd.read_csv(titanic_csv)
df = df[selected_columns]

# Missing ages are replaced by the mean of the observed ages
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)

# Encode sex numerically: male -> 0, female -> 1
sex_codes = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(sex_codes)

In [3]:
# Preview the first five rows of the prepared frame
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,0,22.0,1,7.25
1,1,1,1,38.0,1,71.2833
2,1,3,1,26.0,0,7.925
3,1,1,1,35.0,1,53.1
4,0,3,0,35.0,0,8.05


In [4]:
# The survival flag is the target; every remaining column is a feature
y = df['Survived']
X = df.drop(columns=['Survived'])

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Fit a decision tree on the training split and predict the held-out rows.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at every split; without a seed the fitted tree (and the reported
# accuracy) can change from one run to the next.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [7]:
# Share of held-out rows whose predicted label matches the true label
titanic_accuracy = accuracy_score(y_test, y_pred)
print("Titanic Accuracy:", titanic_accuracy)

Titanic Accuracy: 0.7597765363128491


In [8]:
# User Input Prediction Example
# Values are given in training-column order: Pclass, Sex, Age, SibSp, Fare.
# The sample is wrapped in a DataFrame that reuses the training column names:
# the model was fitted on a DataFrame, and recent scikit-learn versions warn
# ("X does not have valid feature names") when predict() gets a bare list.
sample_input = pd.DataFrame([[3, 0, 25, 1, 12.0]], columns=X.columns)
print("Sample Prediction (1=Survived, 0=Not Survived):", model.predict(sample_input))

Sample Prediction (1=Survived, 0=Not Survived): [1]




Iris Flower Dataset

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [10]:
# Read the Iris CSV and print its first five rows
iris_csv = "/content/drive/MyDrive/Week3/Decision_tree/IRIS.csv"
df = pd.read_csv(iris_csv)
print(df.head(n=5))

   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [11]:
# Split the frame into the four measurement columns (features) and the
# species label (target)
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
y = df['species']
X = df[feature_columns]

In [12]:
# Replace each species name with its integer class id
species_to_id = {"Iris-setosa": 0, "Iris-versicolor": 1, "Iris-virginica": 2}
y = y.map(species_to_id)

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Fit a decision tree on the training split.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at every split; without a seed the fitted tree can differ between
# runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [15]:
# Predict the held-out rows and report the share classified correctly
y_pred = model.predict(X_test)
iris_accuracy = accuracy_score(y_test, y_pred)
print("Iris Accuracy:", iris_accuracy)

Iris Accuracy: 1.0


In [16]:
# User Input Prediction Example
# Format: [sepal_length, sepal_width, petal_length, petal_width]
# The sample is wrapped in a DataFrame that reuses the training column names:
# the model was fitted on a DataFrame, and recent scikit-learn versions warn
# ("X does not have valid feature names") when predict() gets a bare list.
sample_input = pd.DataFrame([[5.2, 3.4, 1.5, 0.2]], columns=X.columns)
pred = model.predict(sample_input)



In [17]:
# Class ids index into this list in the same order used to encode the target
target_names = [
    "Iris-setosa",
    "Iris-versicolor",
    "Iris-virginica",
]
predicted_species = target_names[pred[0]]
print("Prediction for sample:", predicted_species)

Prediction for sample: Iris-setosa


Adult Census Income Dataset

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [19]:
# Read the Adult census CSV into a DataFrame
adult_csv = "/content/drive/MyDrive/Week3/Decision_tree/adult.csv"
df = pd.read_csv(adult_csv)

In [20]:
# Encode categorical columns
# Every object-dtype column is replaced by integer codes. Each column gets its
# own fitted LabelEncoder, kept in `encoders` (column name -> encoder), so raw
# user input can later be encoded with the same mapping and integer
# predictions can be turned back into labels via inverse_transform().
cat_cols = df.select_dtypes(include=['object']).columns
encoders = {col: LabelEncoder().fit(df[col]) for col in cat_cols}
for col, encoder in encoders.items():
    df[col] = encoder.transform(df[col])

In [21]:
# The income column is the target; every remaining column is a feature
y = df['income']
X = df.drop(columns=['income'])

In [22]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Model
# Depth is capped at 6 levels. random_state is fixed because scikit-learn
# randomly permutes the candidate features at every split; without a seed the
# fitted tree (and the reported accuracy) can change between runs.
model = DecisionTreeClassifier(max_depth=6, random_state=42)
model.fit(X_train, y_train)

In [24]:
# Predict the held-out rows and report the share classified correctly
y_pred = model.predict(X_test)
adult_accuracy = accuracy_score(y_test, y_pred)
print("Adult Income Accuracy:", adult_accuracy)

Adult Income Accuracy: 0.8539843390142792


In [25]:
# User Input Example – Age, Workclass, Education, Hours/week, etc.
# The first (already encoded) row of X stands in for user input. Selecting it
# with a list index (iloc[[0]]) keeps it as a one-row DataFrame with the
# training column names; converting it to a plain list would drop them and
# make recent scikit-learn versions warn about missing feature names.
sample = X.iloc[[0]]
print("Sample Prediction (1:Earns >50K, 0:<=50K):", model.predict(sample))


Sample Prediction (1:Earns >50K, 0:<=50K): [1]




Breast Cancer Wisconsin Dataset

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [27]:
# 1) Read the breast cancer CSV and print its first five rows
breast_cancer_csv = "/content/drive/MyDrive/Week3/Decision_tree/data.csv"
df = pd.read_csv(breast_cancer_csv)
print(df.head(n=5))

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness

In [28]:
# 2) Drop the identifier column (when present) and any column whose name
#    starts with "Unnamed"
df = df.drop(columns=['id'], errors='ignore')
unnamed_mask = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~unnamed_mask]

In [29]:
# Encode the diagnosis label numerically: M -> 1, B -> 0
diagnosis_codes = {'M': 1, 'B': 0}
df['diagnosis'] = df['diagnosis'].map(diagnosis_codes)

In [30]:
# 3) Replace any missing value with the mean of its column
column_means = df.mean()
df = df.fillna(column_means)

In [31]:
# Preview the first five rows of the cleaned frame
df.head(n=5)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [32]:
# The encoded diagnosis is the target; every remaining column is a feature
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])

In [33]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Fit a decision tree on the training split.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at every split; without a seed the fitted tree (and the reported
# accuracy) can change between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [35]:
# Predict the held-out rows and report the share classified correctly
y_pred = model.predict(X_test)
breast_cancer_accuracy = accuracy_score(y_test, y_pred)
print("Breast Cancer Decision Tree Accuracy:", breast_cancer_accuracy)

Breast Cancer Decision Tree Accuracy: 0.9385964912280702


In [36]:
# Classify the first held-out row as an example.
# iloc[[0]] (list index) keeps the row as a one-row DataFrame with the
# training column names; wrapping a Series in a list would drop them and make
# recent scikit-learn versions warn about missing feature names.
sample = X_test.iloc[[0]]
pred = model.predict(sample)
print("Sample Prediction:", "Malignant" if pred[0]==1 else "Benign")

Sample Prediction: Benign




Heart Disease Dataset

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [38]:
# Read the heart disease CSV into a DataFrame
heart_csv = "/content/drive/MyDrive/Week3/Decision_tree/heart.csv"
df = pd.read_csv(heart_csv)

In [39]:
# The 'target' column is the label; every remaining column is a feature
y = df['target']
X = df.drop(columns=['target'])

In [40]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [41]:
# Train model
# Depth is capped at 5 levels. random_state is fixed because scikit-learn
# randomly permutes the candidate features at every split; without a seed the
# fitted tree (and the reported accuracy) can change between runs.
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(X_train, y_train)

In [42]:
# Predict the held-out rows and report the share classified correctly
y_pred = model.predict(X_test)
heart_accuracy = accuracy_score(y_test, y_pred)
print("Heart Disease Accuracy:", heart_accuracy)

Heart Disease Accuracy: 0.8439024390243902


In [43]:
# Predict user input example
# The values must follow the column order of X (the training features).
# The sample is wrapped in a DataFrame that reuses those column names: the
# model was fitted on a DataFrame, and recent scikit-learn versions warn
# ("X does not have valid feature names") when predict() gets a bare list.
sample = pd.DataFrame(
    [[63, 1, 3, 145, 233, 1, 0, 150, 0, 2.3, 0, 0, 1]],   # typical row format
    columns=X.columns,
)
print("Prediction (1=Heart Disease Present, 0=No Disease):", model.predict(sample))

Prediction (1=Heart Disease Present, 0=No Disease): [1]


