In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import sklearn
from sklearn.preprocessing import StandardScaler,LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import r2_score

In [None]:
# Mount Google Drive at /content/drive so the dataset path used below resolves.
# NOTE(review): this requires the google.colab package — presumably only present in a Colab runtime; confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# Read the tips dataset from the mounted Drive folder into `df`.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/Datasets/tips.csv')
# Preview the first five rows.
df.head(n=5)

In [None]:
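# Alternative loader: uncomment to read tips.csv from the current working directory instead of Google Drive.
#df = pd.read_csv('tips.csv')
#df.head()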

In [None]:
# Partition the column names by dtype: object-typed columns are reported as
# categorical, every other column as numerical (column order is preserved).
cat = [name for name, dtype in df.dtypes.items() if dtype == 'object']
num = [name for name, dtype in df.dtypes.items() if dtype != 'object']
print('The categorical variables are:\n', cat, '\n')
print('The numerical variables are:\n', num)
The categorical variables are:
 ['sex', 'smoker', 'day', 'time'] 

The numerical variables are:
 ['total_bill', 'tip', 'size']

In [None]:
# Average total_bill for each day.
df.groupby(by='day')['total_bill'].agg('mean')


In [None]:
# Average tip for each value of the sex column.
df['tip'].groupby(df['sex']).mean()


In [None]:
# Number of non-null `size` entries for each value of the time column.
df.groupby(by='time')['size'].agg('count')


In [None]:
# Summary statistics of the tip column.
df.loc[:, 'tip'].describe()


In [None]:
# Most frequent value(s) of the day column.
df['day'].mode()


In [None]:
# Distribution of total_bill: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' is the documented replacement for this kind of plot.
sns.histplot(data=df, x='total_bill', kde=True, stat='density')
plt.show()
# Skewness of total_bill (last expression, so the notebook displays it).
df['total_bill'].skew()


In [None]:
# Pearson correlation matrix between tip and total_bill.
df.loc[:, ['tip', 'total_bill']].corr(method='pearson')


In [None]:
# Scatter plot of tip (y-axis) against total_bill (x-axis).
sns.scatterplot(data=df, x='total_bill', y='tip')
plt.show()

In [None]:
# Pie chart of the relative frequency of each sex value, labelled as percentages.
df['sex'].value_counts(normalize=True).plot.pie(autopct='%.2f%%')
plt.show()
# Smoker / non-smoker counts within each sex group.
df.groupby(by='sex')['smoker'].value_counts()


In [None]:
# Bar chart of the smoker x sex contingency table (one bar group per smoker value).
pd.crosstab(index=df['smoker'], columns=df['sex']).plot.bar()
plt.show()

In [None]:
# Average tip for each day.
df.groupby(by='day')['tip'].agg('mean')


In [None]:
# Bar plot of tip per day (seaborn aggregates the tip values for each day).
sns.barplot(data=df, x='day', y='tip')
plt.show()

In [None]:
# Bar chart of the average total_bill for each value of the size column.
df.groupby(by='size')['total_bill'].mean().plot.bar()
plt.show()

In [None]:
# Horizontal box plots of total_bill, one per sex value.
sns.boxplot(data=df, x='total_bill', y='sex')
plt.show()

In [None]:
# Largest total_bill for every (time, day) combination.
df.groupby(by=['time', 'day'])['total_bill'].agg('max')


In [None]:
# Percentage of missing values in each column.
df.isna().sum().div(len(df)).mul(100)


In [None]:
# Number of rows that duplicate an earlier row.
int(df.duplicated().sum())


In [None]:
# Remove duplicated rows from df in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Re-count duplicated rows to confirm the removal above took effect.
int(df.duplicated().sum())


In [None]:
## boxplot before treatment
# Pass the series as `x=` explicitly: a bare positional series is read as `x`
# by older seaborn releases and as `data` by newer ones, so the orientation
# of the plot would otherwise depend on the installed version.
sns.boxplot(x=df['total_bill'])
plt.show()

In [None]:
## treating outliers using log transformation
# Natural log of total_bill, stored as a new column on df.
df['total_bill_trans'] = np.log(df['total_bill'])

## boxplot after transformation
# Explicit `x=` keeps the plot orientation independent of the seaborn version
# (a bare positional series is read as `x` by older releases and `data` by newer ones).
sns.boxplot(x=df['total_bill_trans'])
plt.show()

In [None]:
# Box plot of tip. Explicit `x=` keeps the orientation independent of the
# seaborn version (a bare positional series is read as `x` by older releases
# and as `data` by newer ones).
sns.boxplot(x=df['tip'])
plt.show()


In [None]:
# Using IQR method
# First and third quartiles of tip, and the inter-quartile range between them.
Q1, Q3 = (df['tip'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences placed 1.5 * IQR below Q1 and above Q3.
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR


In [None]:
# Keep the rows whose tip lies within the IQR fences, bounds included: a value
# sitting exactly on a fence is not an outlier under the 1.5 * IQR rule, so the
# comparison must not be strict.
df_out = df.loc[df['tip'].between(lower_whisker, upper_whisker)] # rows without outliers


In [None]:
# Box plot of tip after the outlier rows were filtered out. Explicit `x=` keeps
# the orientation independent of the seaborn version (a bare positional series
# is read as `x` by older releases and as `data` by newer ones).
sns.boxplot(x=df_out['tip'])
plt.show()


In [None]:
# One-hot encode df, dropping the first level of each encoded column;
# the result replaces df.
df = pd.get_dummies(data=df, drop_first=True)
# Show five randomly chosen rows of the encoded frame.
df.sample(n=5)


In [None]:
# Range (max - min) of total_bill before scaling.
tb_min, tb_max = df['total_bill'].min(), df['total_bill'].max()
range_ = tb_max - tb_min
print(range_)


In [None]:
## initialize the min-max scaler
# feature_range=(0, 1) is the scaler's default, spelled out here for readers.
mm = MinMaxScaler(feature_range=(0, 1))


In [None]:
## Normalizing the values of the total_bill, so that the range will be 1.
# fit_transform returns one column per input feature; flatten the single
# column before storing it as total_bill_mm.
df['total_bill_mm'] = mm.fit_transform(df.loc[:, ['total_bill']]).ravel()


In [None]:
## checking the range after normalization
# Range (max - min) of the scaled column.
tb_mm_min, tb_mm_max = df['total_bill_mm'].min(), df['total_bill_mm'].max()
range_ = tb_mm_max - tb_mm_min
print(range_)


In [None]:
## Loading the dataset again as 'tips_df'
# This is a fresh read of the CSV, so none of the changes made to `df` above
# (duplicate removal, added columns, encoding) carry over to tips_df.
tips_df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/Datasets/tips.csv')
tips_df.head(n=2)

In [None]:
## Encoding categorical variables
# One-hot encode tips_df, dropping the first level of each encoded column.
tips_df = pd.get_dummies(data=tips_df, drop_first=True)
tips_df.head(n=2)


In [None]:
## Store the target column ('tip') in y and all remaining columns in X.
y = tips_df['tip']
X = tips_df.drop(columns='tip')


In [None]:
## Split the data
# A fixed random_state makes the 70/30 split, and therefore the score reported
# further down, reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

## Scaling the data using min max scaling
mm = MinMaxScaler()

# Scale the first two feature columns, selected by label rather than position.
# Casting them to float first (astype returns a new frame) avoids writing
# scaled floats into an integer column and avoids assigning into a frame that
# is a slice of X.
scale_cols = list(X_train.columns[:2])
X_train = X_train.astype({col: 'float64' for col in scale_cols})
X_test = X_test.astype({col: 'float64' for col in scale_cols})

# Fit the scaler on the training rows only, then apply the same scaling to the test rows.
X_train[scale_cols] = mm.fit_transform(X_train[scale_cols])
X_test[scale_cols] = mm.transform(X_test[scale_cols])


In [None]:
## Fitting a linear regression model on the train data
lr = LinearRegression()
# fit is the last expression so the notebook displays the fitted estimator.
lr.fit(X=X_train, y=y_train)


In [None]:
## Making predictions on the test data
# Predicted tip for every row of X_test.
pred = lr.predict(X=X_test)


In [None]:
## Computing r2_score
# Compare the held-out true tips with the model's predictions.
print('r2-score test:', r2_score(y_true=y_test, y_pred=pred))
