# Machine Learning

## Decision Tree as Regressor

In [23]:
# Importing Libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [13]:
# Load the "tips" example dataset through seaborn and preview its first five rows.
df = sns.load_dataset("tips")
print(df.head(n=5))

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [14]:
# Summarise the frame: number of rows, column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [15]:
# Check for missing values: count the nulls in each column and list the
# columns from the most to the fewest missing entries.
missing_counts = df.isna().sum()
print(missing_counts.sort_values(ascending=False))


total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64


In [16]:
# Encode every categorical/text column as integer codes.
#
# A fresh LabelEncoder is fitted for each column and stored in `label_encoders`,
# so the code -> label mapping of every column remains available afterwards,
# e.g. label_encoders["day"].inverse_transform([0, 1]).
# Re-fitting one shared encoder would keep only the mapping of the last column.
#
# Notes:
# - `df` is overwritten in place, so the original labels are no longer in `df`.
# - Re-running this cell on an already-encoded `df` finds no categorical columns
#   and resets `label_encoders` to an empty dict; reload the data first.
label_encoders = {}
for col in df.select_dtypes(include=['category', 'object']).columns:
    Le = LabelEncoder()
    df[col] = Le.fit_transform(df[col])
    label_encoders[col] = Le


In [17]:
# Display the column labels of the frame (encoding changes values, not names).
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [18]:
# Remove outliers with the 1.5 * IQR rule.
#
# The rule is applied only to the genuinely numeric columns. The label-encoded
# categorical columns hold arbitrary integer codes, so their quantiles carry no
# meaning; e.g. a binary column whose minority class is under about 25% of the
# rows gets IQR == 0 and would lose that entire class as "outliers".
numeric_cols = ["total_bill", "tip", "size"]
numeric = df[numeric_cols]

# Calculate Q1 and Q3 (per numeric column)
Q1 = numeric.quantile(0.25)
Q3 = numeric.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Despite its name, `outlier_condition` is True for the rows to KEEP: rows in
# which no numeric value falls outside [lower_bound, upper_bound].
outlier_condition = ~((numeric < lower_bound) | (numeric > upper_bound)).any(axis=1)

# Remove outliers
df_no_outliers = df[outlier_condition]

print("Original shape:", df.shape)
print("Shape after removing outliers:", df_no_outliers.shape)

Original shape: (244, 7)
Shape after removing outliers: (223, 7)


In [19]:
# Separate the features from the regression target (`tip`).
X = df_no_outliers.drop("tip", axis=1)
y = df_no_outliers["tip"]

# Split the data into train and test BEFORE scaling, so the held-out rows
# cannot influence the fitted scaler (avoids test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling the data
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows. The original row index is preserved so that the feature frames
# stay label-aligned with y_train / y_test. Scaled test values may fall slightly
# outside [0, 1] when a test row exceeds the range seen during training.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

In [20]:
# Create the regression tree and train it on the training split in one step
# (`fit` returns the fitted estimator itself).
model = DecisionTreeRegressor(
    max_depth=5,           # limit the depth of the tree
    min_samples_split=4,   # minimum samples required to split an internal node
    min_samples_leaf=2,    # minimum samples required at a leaf node
    random_state=42,       # fixed seed so repeated runs build the same tree
).fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

In [21]:
# Evaluate the regressor on the held-out rows.
mse = mean_squared_error(y_test, y_pred)    # average squared error
mae = mean_absolute_error(y_test, y_pred)   # average absolute error
r2 = r2_score(y_test, y_pred)               # coefficient of determination

# Report each metric on its own line.
for label, value in (
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
    ("R2 Score:", r2),
):
    print(label, value)

Mean Squared Error: 0.6279965192964221
Mean Absolute Error: 0.6236671625947489
R2 Score: 0.3362721767975423


In [22]:
# Put actual and predicted values side by side for a quick visual comparison.
# The actual values are taken as a plain array, so the frame gets a fresh
# 0..n-1 index instead of the original row labels.
actual_values = y_test.to_numpy()
results = pd.DataFrame({'Actual': actual_values, 'Predicted': y_pred})

print(results.head(n=10))

   Actual  Predicted
0    3.23   2.493333
1    4.00   2.746552
2    1.68   2.493333
3    2.74   2.746552
4    2.00   3.464375
5    1.50   2.063636
6    3.00   2.746552
7    3.76   2.746552
8    2.02   1.676667
9    2.00   2.005000


## Decision Tree as Classifier

In [102]:
# Load a fresh copy of the "tips" dataset: the regression section above
# label-encoded `df` in place, so this section starts again from the raw data.
df = sns.load_dataset("tips")
print(df.head(n=5))

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [103]:
# Summarise the reloaded frame: number of rows, column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [104]:
# Encode every categorical/text column as integer codes.
#
# A fresh LabelEncoder is fitted for each column and stored in `label_encoders`,
# so the code -> label mapping of every column remains available afterwards,
# e.g. label_encoders["smoker"].inverse_transform([0, 1]) recovers the labels
# behind the classifier's 0/1 predictions.
# Re-fitting one shared encoder would keep only the mapping of the last column.
#
# Notes:
# - `df` is overwritten in place, so the original labels are no longer in `df`.
# - Re-running this cell on an already-encoded `df` finds no categorical columns
#   and resets `label_encoders` to an empty dict; reload the data first.
label_encoders = {}
for col in df.select_dtypes(include=['category', 'object']).columns:
    Le = LabelEncoder()
    df[col] = Le.fit_transform(df[col])
    label_encoders[col] = Le


In [105]:
# Remove outliers with the 1.5 * IQR rule.
#
# The rule is applied only to the genuinely numeric columns. The label-encoded
# categorical columns (including the `smoker` target) hold arbitrary integer
# codes, so their quantiles carry no meaning; e.g. a binary column whose
# minority class is under about 25% of the rows gets IQR == 0 and would lose
# that entire class as "outliers".
numeric_cols = ["total_bill", "tip", "size"]
numeric = df[numeric_cols]

# Calculate Q1 and Q3 (per numeric column)
Q1 = numeric.quantile(0.25)
Q3 = numeric.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Despite its name, `outlier_condition` is True for the rows to KEEP: rows in
# which no numeric value falls outside [lower_bound, upper_bound].
outlier_condition = ~((numeric < lower_bound) | (numeric > upper_bound)).any(axis=1)

# Remove outliers
df_no_outliers = df[outlier_condition]

print("Original shape:", df.shape)
print("Shape after removing outliers:", df_no_outliers.shape)

Original shape: (244, 7)
Shape after removing outliers: (223, 7)


In [106]:
# Separate the features from the label: the encoded `smoker` column is the
# target and every other column is a feature.
y = df_no_outliers["smoker"]
X = df_no_outliers.drop(columns="smoker")

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Create the entropy-based classification tree and train it on the training
# split in one step (`fit` returns the fitted estimator itself).
model = DecisionTreeClassifier(random_state=42, criterion="entropy").fit(X_train, y_train)

# Predict the label for the held-out rows
y_pred = model.predict(X_test)


In [107]:
# Evaluate the classifier on the held-out rows.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

print("Accuracy:", accuracy_score(y_test, y_pred))

# Precision, recall and F1 are averaged over the classes, weighted by class support.
# Support-weighted recall is mathematically equal to accuracy, so those two
# numbers always match.
for label, scorer in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, scorer(y_test, y_pred, average='weighted'))

# Rows are the actual classes, columns the predicted classes.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.6666666666666666
Precision: 0.6638655462184874
Recall: 0.6666666666666666
F1 Score: 0.6649350649350649
Confusion Matrix:
 [[20  7]
 [ 8 10]]


In [108]:
# Put actual and predicted labels side by side for a quick visual comparison.
# Starting from the `y_test` Series keeps the original row labels as the index;
# the predictions are attached positionally as a second column.
results = y_test.to_frame(name='Actual').assign(Predicted=y_pred)

print(results.head(n=10))

     Actual  Predicted
9         0          0
87        0          1
121       0          0
152       0          0
241       1          0
117       0          0
71        0          0
108       0          0
193       1          0
202       1          1
