In [2]:
import pandas as pd
import seaborn as sns

In [3]:
# Load the 'tips' dataset through seaborn's dataset loader.
df = sns.load_dataset('tips')
# Preview the first rows to check the columns and value formats.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df.duplicated().sum()

1

In [5]:
# Remove exact duplicate rows by rebinding rather than mutating in place, so
# the cell is safe to re-run; the surviving rows keep their original index labels.
df = df.drop_duplicates()

In [6]:
df.duplicated().sum()

0

In [7]:
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [8]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 243 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  243 non-null    float64 
 1   tip         243 non-null    float64 
 2   sex         243 non-null    category
 3   smoker      243 non-null    category
 4   day         243 non-null    category
 5   time        243 non-null    category
 6   size        243 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 9.1 KB


In [11]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [12]:
# The bill amount is the regression target; every other column is a predictor.
y = df['total_bill']
x = df.drop(columns=['total_bill'])

In [13]:
x

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,Female,No,Sun,Dinner,2
1,1.66,Male,No,Sun,Dinner,3
2,3.50,Male,No,Sun,Dinner,3
3,3.31,Male,No,Sun,Dinner,2
4,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,5.92,Male,No,Sat,Dinner,3
240,2.00,Female,Yes,Sat,Dinner,2
241,2.00,Male,Yes,Sat,Dinner,2
242,1.75,Male,No,Sat,Dinner,2


In [14]:
y

0      16.99
1      10.34
2      21.01
3      23.68
4      24.59
       ...  
239    29.03
240    27.18
241    22.67
242    17.82
243    18.78
Name: total_bill, Length: 243, dtype: float64

In [20]:
# Partition the predictor columns by dtype: pandas 'category' columns are
# treated as categorical, everything else as numerical.
categorical_feature = x.select_dtypes(include='category').columns.tolist()
numerical_feature = x.select_dtypes(exclude='category').columns.tolist()

In [22]:
numerical_feature

['tip', 'size']

In [23]:
categorical_feature

['sex', 'smoker', 'day', 'time']

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


In [32]:
# Numerical columns: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode. handle_unknown='ignore' makes transform() encode a level that
# was absent at fit time as all zeros instead of raising, so a level that only
# occurs in the test split cannot break the transform step.
categorical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('oneHotEncoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its own pipeline and concatenate the results.
preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, numerical_feature),
        ('categorical_pipeline', categorical_pipeline, categorical_feature),
    ]
)

In [33]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)


In [34]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transformation to the test split.
# NOTE(review): both names are rebound to the transformed output, so re-running
# this cell without first re-running the split cell feeds already-transformed
# data back into `preprocessor` — confirm whether distinct names are wanted.
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

In [35]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge,Lasso,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# Candidate regressors, keyed by the name used in the evaluation report.
# All use library defaults except the decision tree, whose random_state is
# pinned so repeated runs produce the same fitted tree and the same score.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'SVR': SVR(),
}

In [36]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
import numpy as np

def model_evaluate(x_train,x_test,y_train,y_test,models):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    x_train, y_train
        Training features and targets, passed to each model's ``fit``.
    x_test, y_test
        Features passed to each model's ``predict`` and the targets those
        predictions are scored against.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.

    Returns
    -------
    dict
        Display name -> ``r2_score(y_test, predictions)``, in the iteration
        order of ``models``.
    """
    report = {}

    # Iterate name/estimator pairs directly instead of rebuilding the key and
    # value lists on every pass.
    for name, model in models.items():
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        report[name] = r2_score(y_test, y_pred)

    return report





In [39]:
# Fit and score every candidate; `score` maps each model name to the value
# returned for it by model_evaluate.
score = model_evaluate(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
    models=models,
)

In [42]:
# Highest score across all evaluated models; max() scans the values directly,
# so sorting them first is unnecessary.
maxi_score = max(score.values())

In [43]:
maxi_score

0.6189621168150647