**Dataset:**
https://github.com/mwaskom/seaborn-data/blob/master/taxis.csv

**Problem Statement:** Using the taxis dataset, develop a logistic regression model to predict whether a taxi trip included a tip - i.e., whether tip > 0.



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df_taxi = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/taxis.csv")

In [None]:
df_taxi.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,Manhattan
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,Manhattan
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Alphabet City,West Village,Manhattan,Manhattan
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,Manhattan
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Midtown East,Yorkville West,Manhattan,Manhattan


In [None]:
df_taxi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6433 entries, 0 to 6432
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   pickup           6433 non-null   object 
 1   dropoff          6433 non-null   object 
 2   passengers       6433 non-null   int64  
 3   distance         6433 non-null   float64
 4   fare             6433 non-null   float64
 5   tip              6433 non-null   float64
 6   tolls            6433 non-null   float64
 7   total            6433 non-null   float64
 8   color            6433 non-null   object 
 9   payment          6389 non-null   object 
 10  pickup_zone      6407 non-null   object 
 11  dropoff_zone     6388 non-null   object 
 12  pickup_borough   6407 non-null   object 
 13  dropoff_borough  6388 non-null   object 
dtypes: float64(5), int64(1), object(8)
memory usage: 703.7+ KB


Steps that I consider for Data-Cleaning :
1) Drop irrelevent columns. ['tolls' column]
2) Insert relevent columns. ['is tipped' column]
3) Manage Null records. ['mode' values]
4) Non-numeric Data types to Numeric. [datatype- object to boolean]

In [None]:
## The ultimate target(Title) of the Analysis.
df_taxi['is_tipped'] = df_taxi['tip'] > 0

In [None]:
df_taxi.drop('tolls', axis=1, inplace=True)

In [None]:
df_taxi.drop(['dropoff_zone', 'pickup_borough'],axis=1,inplace=True)

In [None]:
df_taxi.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,total,color,payment,pickup_zone,dropoff_borough,is_tipped
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,12.95,yellow,credit card,Lenox Hill West,Manhattan,True
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,9.3,yellow,cash,Upper West Side South,Manhattan,False
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,14.16,yellow,credit card,Alphabet City,Manhattan,True
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,36.95,yellow,credit card,Hudson Sq,Manhattan,True
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,13.4,yellow,credit card,Midtown East,Manhattan,True


In [None]:
#df_taxi['yellow'].value_counts()

In [None]:
yellow = pd.get_dummies(df_taxi['color'], drop_first=True)

df_taxi.drop('color', axis=1, inplace=True)

df_taxi = pd.concat([df_taxi, yellow], axis=1)

display(df_taxi.head())

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,total,payment,pickup_zone,dropoff_borough,is_tipped,yellow
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,12.95,credit card,Lenox Hill West,Manhattan,True,True
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,9.3,cash,Upper West Side South,Manhattan,False,True
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,14.16,credit card,Alphabet City,Manhattan,True,True
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,36.95,credit card,Hudson Sq,Manhattan,True,True
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,13.4,credit card,Midtown East,Manhattan,True,True


In [None]:
#df_taxi['payment'].value_counts()

In [None]:
#df_taxi['payment'].isnull().sum()

In [None]:
cash = pd.get_dummies(df_taxi['payment'], drop_first=True)

df_taxi.drop('payment', axis=1, inplace=True)

df_taxi = pd.concat([df_taxi, cash], axis=1)

display(df_taxi.head())

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,total,pickup_zone,dropoff_borough,is_tipped,yellow,credit card
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,12.95,Lenox Hill West,Manhattan,True,True,True
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,9.3,Upper West Side South,Manhattan,False,True,False
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,14.16,Alphabet City,Manhattan,True,True,True
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,36.95,Hudson Sq,Manhattan,True,True,True
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,13.4,Midtown East,Manhattan,True,True,True


In [None]:
#df_taxi['pickup_zone'].value_counts()

In [None]:
#df_taxi['dropoff_borough'].value_counts()

In [None]:
# Check for null values in each column
null_counts = df_taxi.isnull().sum()
print("Null values in each column:")
display(null_counts[null_counts > 0])

Null values in each column:


Unnamed: 0,0
pickup_zone,26
dropoff_borough,45


In [None]:
# Impute missing values in specified columns with the mode
cols_with_nulls = ['pickup_zone','dropoff_borough']

for col in cols_with_nulls:
    mode_val = df_taxi[col].mode()[0]
    df_taxi[col].fillna(mode_val, inplace=True)

# Verify that there are no more null values in these columns
print("Null values after imputation:")
display(df_taxi[cols_with_nulls].isnull().count())

Null values after imputation:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_taxi[col].fillna(mode_val, inplace=True)


Unnamed: 0,0
pickup_zone,6433
dropoff_borough,6433


In [None]:
df_taxi.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,total,pickup_zone,dropoff_borough,is_tipped,yellow,credit card
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,12.95,Lenox Hill West,Manhattan,True,True,True
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,9.3,Upper West Side South,Manhattan,False,True,False
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,14.16,Alphabet City,Manhattan,True,True,True
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,36.95,Hudson Sq,Manhattan,True,True,True
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,13.4,Midtown East,Manhattan,True,True,True


In [None]:
## Convert 'pickup' and 'dropoff' to datetime objects
df_taxi['pickup'] = pd.to_datetime(df_taxi['pickup'])
df_taxi['dropoff'] = pd.to_datetime(df_taxi['dropoff'])

## Extract features from datetime columns

df_taxi['pickup_weeks'] = df_taxi['pickup'].dt.dayofweek
df_taxi['pickup_hours'] = df_taxi['pickup'].dt.hour

df_taxi['dropoff_weeks'] = df_taxi['dropoff'].dt.dayofweek
df_taxi['dropoff_hours'] = df_taxi['dropoff'].dt.hour

## Drop original datetime columns
df_taxi = df_taxi.drop(['pickup','dropoff'], axis=1)

In [None]:
df_taxi.head()

Unnamed: 0,passengers,distance,fare,tip,total,pickup_zone,dropoff_borough,is_tipped,yellow,credit card,pickup_weeks,pickup_hours,dropoff_weeks,dropoff_hours
0,1,1.6,7.0,2.15,12.95,Lenox Hill West,Manhattan,True,True,True,5,20,5,20
1,1,0.79,5.0,0.0,9.3,Upper West Side South,Manhattan,False,True,False,0,16,0,16
2,1,1.37,7.5,2.36,14.16,Alphabet City,Manhattan,True,True,True,2,17,2,18
3,1,7.7,27.0,6.15,36.95,Hudson Sq,Manhattan,True,True,True,6,1,6,1
4,3,2.16,9.0,1.1,13.4,Midtown East,Manhattan,True,True,True,5,13,5,13


In [None]:
df_taxi.drop(['pickup_zone', 'dropoff_borough'],axis=1,inplace=True)
df_taxi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6433 entries, 0 to 6432
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   passengers     6433 non-null   int64  
 1   distance       6433 non-null   float64
 2   fare           6433 non-null   float64
 3   tip            6433 non-null   float64
 4   total          6433 non-null   float64
 5   is_tipped      6433 non-null   bool   
 6   yellow         6433 non-null   bool   
 7   credit card    6433 non-null   bool   
 8   pickup_weeks   6433 non-null   int32  
 9   pickup_hours   6433 non-null   int32  
 10  dropoff_weeks  6433 non-null   int32  
 11  dropoff_hours  6433 non-null   int32  
dtypes: bool(3), float64(4), int32(4), int64(1)
memory usage: 370.8 KB


In [None]:
# Filter the correlation matrix to show correlations greater than 0.8 and display only True values
display(df_taxi.corr()[df_taxi.corr() > 0.8])

Unnamed: 0,passengers,distance,fare,tip,total,is_tipped,yellow,credit card,pickup_weeks,pickup_hours,dropoff_weeks,dropoff_hours
passengers,1.0,,,,,,,,,,,
distance,,1.0,0.920108,,0.904676,,,,,,,
fare,,0.920108,1.0,,0.974358,,,,,,,
tip,,,,1.0,,,,,,,,
total,,0.904676,0.974358,,1.0,,,,,,,
is_tipped,,,,,,1.0,,0.850457,,,,
yellow,,,,,,,1.0,,,,,
credit card,,,,,,0.850457,,1.0,,,,
pickup_weeks,,,,,,,,,1.0,,0.995235,
pickup_hours,,,,,,,,,,1.0,,0.941932


Analysis steps

In [None]:
from sklearn.model_selection import train_test_split # split data
from sklearn.linear_model import LogisticRegression  # Algorithm class
from sklearn.metrics import accuracy_score,confusion_matrix # Evaluation methods

In [None]:
x = df_taxi.drop('is_tipped',axis=1)
y = df_taxi['is_tipped']

In [None]:
 x.shape  # Table-Dimensions  >>  #(Rows,Columns) >> (output)

(6433, 11)

In [None]:
y.shape

(6433,)

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size =0.25)

In [None]:
# create object
logr = LogisticRegression()

# train the model
logr.fit(x_train,y_train)

# predict values
y_pred = logr.predict(x_test)

In [None]:
accuracy_score(y_pred,y_test)*100  #(in percent)

99.68924798011187

In [None]:
confusion_matrix(y_pred,y_test)

array([[ 566,    5],
       [   0, 1038]])