In [4]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting and
# ML libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from 'tips.csv'; the path is relative to the notebook's
# working directory, so the file must sit next to the notebook (or the kernel's cwd).
df = pd.read_csv('tips.csv')
# df.head()

In [None]:
# df.shape

In [None]:
# df.info()

In [None]:
# df.describe().T

In [None]:
# Count the missing values in each column of the dataset.
df.isna().sum()

In [None]:
# Distribution of the two continuous columns, one panel per column.
# plt.figure() is used (rather than plt.subplots()) so that no extra
# full-figure Axes is created behind the 2x3 grid.
fig = plt.figure(figsize=(15,8))

for i, col in enumerate(['total_bill', 'tip']):
  ax = fig.add_subplot(2, 3, i + 1)
  # histplot(kde=True, stat='density') replaces the deprecated distplot:
  # a density-scaled histogram with a KDE curve drawn on top.
  sb.histplot(df[col], kde=True, stat='density', ax=ax)
plt.tight_layout()
plt.show()
# Shows positive (right) skew in the data. This means the mean is above the median
# (i.e. some large bills/tips are pulling the average higher).

In [None]:
# Box plot of each continuous column to make outliers visible.
# plt.figure() is used (rather than plt.subplots()) so that no extra
# full-figure Axes is created behind the 2x3 grid.
fig = plt.figure(figsize=(15,8))

for i, col in enumerate(['total_bill', 'tip']):
  ax = fig.add_subplot(2, 3, i + 1)
  # Pass the column by keyword: a bare positional Series is interpreted as
  # `data` by newer seaborn releases, which changes the plot orientation.
  sb.boxplot(x=df[col], ax=ax)
plt.tight_layout()
plt.show()
# This boxplot shows outliers in the data.

In [None]:
# Compare the frame's shape before and after applying the outlier thresholds
# (total_bill < 45 and tip < 7).
# Removing only 7 outliers from the dataset makes it noticeably less right-skewed.
df.shape, df.query('total_bill < 45 and tip < 7').shape

In [13]:
# Remove outliers: keep only rows with total_bill < 45 and tip < 7.
# .copy() makes the result an independent frame, so the column assignments made
# later in the notebook do not write into a slice of the original frame.
df = df[(df['total_bill']<45) & (df['tip']<7)].copy()

In [None]:
# Columns from 'sex' through 'size' (label-based slice, so this relies on the
# column order of the loaded file). The 2x3 grid below has room for up to six.
feat = df.loc[:,'sex':'size'].columns

# plt.figure() is used (rather than plt.subplots()) so that no extra
# full-figure Axes is created behind the grid.
fig = plt.figure(figsize=(15,8))
for i, col in enumerate(feat):
  ax = fig.add_subplot(2, 3, i + 1)
  # Pass the column as `x=`: the first positional parameter of countplot is
  # `data` in newer seaborn releases, so a bare Series is not counted per category.
  sb.countplot(x=df[col], ax=ax)
plt.tight_layout()
plt.show()
# Conclusions from this data:
# - Footfall on weekends is more than that on weekdays
# - People usually prefer dinner outside as compared to lunch.
# - People going alone to restaurants is as rare as people going with a family of 5 or 6 persons.

In [None]:
# Scatter plot of the tip against the total bill.
fig, ax = plt.subplots()
ax.scatter(df['total_bill'], df['tip'])
ax.set_title('Total Bill v/s Total Tip')
ax.set_xlabel('Total Bill')
ax.set_ylabel('Total Tip')
plt.show()
# Next: relation between family size and tip given.

In [None]:
# Mean of the numeric columns for each value of `size`; numeric_only=True keeps
# the non-numeric columns out of the aggregation.
df.groupby('size').mean(numeric_only=True)

In [None]:
# Mean of the numeric columns for each value of `time`.
df.groupby('time').mean(numeric_only=True)

In [None]:
# Mean of the numeric columns for each value of `day`.
df.groupby('day').mean(numeric_only=True)

In [None]:
# Replace every non-numeric column with integer codes so the regressors can use it.
# The check is "not numeric" rather than "dtype == object" so that category and
# pandas string dtypes are encoded too.
le = LabelEncoder()
encoders = {}  # column name -> fitted encoder, so codes can be mapped back to labels

for col in df.columns:
  if not pd.api.types.is_numeric_dtype(df[col]):
    # A fresh encoder per column; `le` ends up bound to the last one fitted.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

df.head()

In [None]:
# Heatmap of strongly correlated variable pairs: a cell is True when the
# pairwise correlation is greater than 0.7 (negative correlations are not flagged).
strong_corr = df.corr().gt(0.7)
plt.figure(figsize=(7,7))
sb.heatmap(strong_corr, annot=True, cbar=False)
plt.show()

In [None]:
# Build the modelling inputs: `tip` is the value to predict, every other
# column is a predictor.
target = df['tip']
features = df.drop(columns='tip')

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split repeatable.
X_train, X_val, Y_train, Y_val = train_test_split(
  features,
  target,
  test_size=0.2,
  random_state=22,
)
X_train.shape, X_val.shape

In [22]:
# Standardise the features for stable and fast training. The scaler is fitted on
# the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val = scaler.transform(X_train), scaler.transform(X_val)

In [None]:
# Training & Testing
# Fit each candidate regressor and report its mean absolute error (MAE) on the
# training and validation splits. MAE is an error measure: lower is better.
# The stochastic models are seeded (same seed as the train/validation split)
# so that repeated runs print the same numbers.
models = [
  LinearRegression(),
  XGBRegressor(random_state=22),
  RandomForestRegressor(random_state=22),
  AdaBoostRegressor(random_state=22),
]

for model in models:
  model.fit(X_train, Y_train)

  print(f'{model} : ')
  pred_train = model.predict(X_train)
  print('Training MAE : ', mae(Y_train, pred_train))

  pred_val = model.predict(X_val)
  print('Validation MAE : ', mae(Y_val, pred_val))
  print()

# NOTE(review): the original author observed Random Forest Regression giving the
# lowest mean error here — re-check against the seeded output above.