In [24]:
import seaborn as sns 

In [25]:
# Load the "tips" example dataset through seaborn and preview its first rows.
df=sns.load_dataset("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [26]:
# Count the missing values in each column.
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [27]:
# Distinct values of the `day` column.
df["day"].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [28]:
# Distinct values of the `time` column.
df["time"].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [29]:
# Distinct values of `time`, using bracket indexing rather than attribute
# access (attribute access is unreliable for columns such as `size` whose
# name collides with a DataFrame attribute).
# NOTE(review): this cell repeats the earlier `df["time"].unique()` cell;
# consider deleting it.
df["time"].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [30]:
# Distinct values of `time`, using bracket indexing rather than attribute
# access (attribute access is unreliable for columns such as `size` whose
# name collides with a DataFrame attribute).
# NOTE(review): this cell repeats the earlier `df["time"].unique()` cell;
# consider deleting it.
df["time"].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [31]:
# Separate the regression target (total_bill) from the predictor columns.
y = df["total_bill"]
X = df.drop(columns=["total_bill"])

In [32]:
# Display the feature frame and the target series together as a tuple.
X,y

(      tip     sex smoker   day    time  size
 0    1.01  Female     No   Sun  Dinner     2
 1    1.66    Male     No   Sun  Dinner     3
 2    3.50    Male     No   Sun  Dinner     3
 3    3.31    Male     No   Sun  Dinner     2
 4    3.61  Female     No   Sun  Dinner     4
 ..    ...     ...    ...   ...     ...   ...
 239  5.92    Male     No   Sat  Dinner     3
 240  2.00  Female    Yes   Sat  Dinner     2
 241  2.00    Male    Yes   Sat  Dinner     2
 242  1.75    Male     No   Sat  Dinner     2
 243  3.00  Female     No  Thur  Dinner     2
 
 [244 rows x 6 columns],
 0      16.99
 1      10.34
 2      21.01
 3      23.68
 4      24.59
        ...  
 239    29.03
 240    27.18
 241    22.67
 242    17.82
 243    18.78
 Name: total_bill, Length: 244, dtype: float64)

In [33]:
## Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [35]:
# Feature groups: numeric columns and categorical columns are listed
# separately so each group can be preprocessed differently.
numerical_cols = ["tip", "size"]
categorical_cols = ["sex", "smoker", "day", "time"]
print(categorical_cols)

['sex', 'smoker', 'day', 'time']


In [36]:
# Numeric features: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill gaps with the most frequent level, then one-hot
# encode. handle_unknown="ignore" makes transform() encode a category level
# that was absent at fit time as all zeros instead of raising, so a split (or
# new data) containing an unseen level does not crash the pipeline.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehotencoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [37]:
# Route the categorical columns through cat_pipeline and the numeric columns
# through num_pipeline; the transformed blocks are stacked side by side in
# the order listed here.
preprocessing = ColumnTransformer(
    transformers=[
        ("cat_pipeline", cat_pipeline, categorical_cols),
        ("num_pipeline", num_pipeline, numerical_cols),
    ]
)
                                

In [38]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the test split.
# NOTE(review): both names are rebound to the transformed output, so running
# this cell a second time passes that output back into the transformer instead
# of the original split -- re-run the train/test split cell first.
X_train=preprocessing.fit_transform(X_train)
X_test=preprocessing.transform(X_test)

In [39]:
# Inspect the transformed training matrix.
X_train

array([[ 0.        ,  1.        ,  1.        , ...,  0.        ,
        -0.2580329 , -0.61214068],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
        -0.74211442, -0.61214068],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.6399734 , -0.61214068],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -1.46472887, -0.61214068],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.32426806, -0.61214068],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.41237773,  0.45363997]], shape=(195, 12))

In [40]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [47]:
# Candidate regressors, keyed by display name. The tree-based estimators are
# randomised, so they get a fixed seed (the same value used for the
# train/test split) to make their scores repeatable across runs.
models = {
    "Random Forest": RandomForestRegressor(random_state=42),
    "Linear Regression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
}

In [48]:
from sklearn.metrics import r2_score

In [49]:
def evaluate_model(X_train, y_train, X_test, y_test, models):
    """Fit every model on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed straight to ``model.fit``.
    X_test, y_test
        Held-out features (passed to ``model.predict``) and the true targets
        the predictions are compared against.
    models : dict
        Maps a display name to an estimator object exposing ``fit`` and
        ``predict``. Each estimator is fitted in place.

    Returns
    -------
    dict
        Maps each model name to the R^2 score (``r2_score``) of its test-set
        predictions, in the same order as ``models``. Empty if ``models`` is
        empty.
    """
    report = {}
    # Walk the name/estimator pairs directly instead of rebuilding
    # list(models.keys()) and list(models.values()) on every iteration.
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        # R^2 (coefficient of determination), not a classification accuracy.
        report[name] = r2_score(y_test, y_test_pred)
    return report

In [50]:
# Fit and score every candidate model; the result maps model name -> test R^2.
evaluate_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    models=models,
)

{'Random Forest': 0.5019912956851675,
 'Linear Regression': 0.6240808714290967,
 'DecisionTree': 0.4831024538276847}