In [1]:
# Step 1: Installing & Importing Libraries
import sklearn
import pandas as pd

In [2]:
# Step 2: Importing The Data
data = pd.read_csv("D:/Datasets/Machine Learning/tip.csv")

In [3]:
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Step 3: Data Cleaning/ Data Pre-Processing
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [5]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data['Gender_encoded'] = encoder.fit_transform(data['sex'])
data['Smoker_encoded'] = encoder.fit_transform(data['smoker'])
data['Day_encoded'] = encoder.fit_transform(data['day'])
data['Time_encoded'] = encoder.fit_transform(data['time'])

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   total_bill      244 non-null    float64
 1   tip             244 non-null    float64
 2   sex             244 non-null    object 
 3   smoker          244 non-null    object 
 4   day             244 non-null    object 
 5   time            244 non-null    object 
 6   size            244 non-null    int64  
 7   Gender_encoded  244 non-null    int32  
 8   Smoker_encoded  244 non-null    int32  
 9   Day_encoded     244 non-null    int32  
 10  Time_encoded    244 non-null    int32  
dtypes: float64(2), int32(4), int64(1), object(4)
memory usage: 17.3+ KB


In [7]:
data.drop(columns=['sex','smoker','day','time'],inplace=True)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   total_bill      244 non-null    float64
 1   tip             244 non-null    float64
 2   size            244 non-null    int64  
 3   Gender_encoded  244 non-null    int32  
 4   Smoker_encoded  244 non-null    int32  
 5   Day_encoded     244 non-null    int32  
 6   Time_encoded    244 non-null    int32  
dtypes: float64(2), int32(4), int64(1)
memory usage: 9.7 KB


In [9]:
# Step 4: Splitting The Data
X = data[['total_bill','tip','size','Gender_encoded','Day_encoded','Time_encoded']]
y = data['Smoker_encoded']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.18,random_state=963)

In [10]:
# Step 5: Building The Model
# 5.1: Initializing The Model
from sklearn.tree import DecisionTreeClassifier
DtClassifier = DecisionTreeClassifier()

In [11]:
# 5.2: Training The Model
DtClassifier.fit(X_train,y_train)

In [12]:
# 5.3: Making The Predictions From The Model
y_pred = DtClassifier.predict(X_test)

In [13]:
# 5.4: Testing The Model
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.71      0.61      0.65        28
           1       0.45      0.56      0.50        16

    accuracy                           0.59        44
   macro avg       0.58      0.58      0.58        44
weighted avg       0.61      0.59      0.60        44



In [14]:
print(confusion_matrix(y_pred,y_test))

[[17 11]
 [ 7  9]]


# Comparision Between Logistic And Decision Tree

In [16]:
A = data.drop(columns='Gender_encoded')
b = data['Gender_encoded']
A_train, A_test, b_train, b_test = train_test_split(A,b,test_size=0.15,random_state=852)

In [17]:
from sklearn.linear_model import LogisticRegression
classifier1 = LogisticRegression()
classifier2 = DecisionTreeClassifier()

In [18]:
classifier1.fit(A_train,b_train)

In [19]:
classifier2.fit(A_train,b_train)

In [20]:
b_pred1 = classifier1.predict(A_test)
b_pred2 = classifier2.predict(A_test)

In [21]:
print(classification_report(b_pred1,b_test))

              precision    recall  f1-score   support

           0       0.20      0.40      0.27         5
           1       0.89      0.75      0.81        32

    accuracy                           0.70        37
   macro avg       0.54      0.57      0.54        37
weighted avg       0.80      0.70      0.74        37



In [22]:
print(classification_report(b_pred2,b_test))

              precision    recall  f1-score   support

           0       0.50      0.42      0.45        12
           1       0.74      0.80      0.77        25

    accuracy                           0.68        37
   macro avg       0.62      0.61      0.61        37
weighted avg       0.66      0.68      0.67        37



In [23]:
print(confusion_matrix(b_pred1,b_test))

[[ 2  3]
 [ 8 24]]


In [24]:
print(confusion_matrix(b_pred2,b_test))

[[ 5  7]
 [ 5 20]]
