In [3]:
import numpy as np 
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# List the names of the example datasets that seaborn can load.
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'taxis',
 'tips',
 'titanic']

In [5]:
# Fetch seaborn's "tips" example dataset and preview its first five rows.
tips_df = sns.load_dataset(name="tips")
tips_df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
# Separate the regression target (the 'tip' column) from the predictor columns.
y = tips_df['tip']
X = tips_df.drop(columns=['tip'])

In [7]:
# Preview the predictor table to confirm the 'tip' column was removed.
X.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [8]:
# Preview the target series (the tip amounts).
y.head()

0    1.01
1    1.66
2    3.50
3    3.31
4    3.61
Name: tip, dtype: float64

In [9]:
# Keep only the numeric predictors by dropping the four categorical columns.
neumerical = X.drop(columns=['sex','smoker','day','time'])

In [10]:
# Preview the numeric predictors that remain after dropping the categorical columns.
neumerical.head()

Unnamed: 0,total_bill,size
0,16.99,2
1,10.34,3
2,21.01,3
3,23.68,2
4,24.59,4


In [11]:
# Select the four categorical predictors that will be one-hot encoded.
categorical = X.filter(items=['sex','smoker','day','time'], axis='columns')

In [12]:
# Preview the categorical predictors selected for one-hot encoding.
categorical.head()

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner


In [13]:
# One-hot encode the categorical predictors. `drop_first=True` omits the first
# level of each column, so k categories become k-1 indicator columns.
cat_neumerical = pd.get_dummies(data=categorical, drop_first=True)
cat_neumerical

Unnamed: 0,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,1,1,0,0,1,1
1,0,1,0,0,1,1
2,0,1,0,0,1,1
3,0,1,0,0,1,1
4,1,1,0,0,1,1
...,...,...,...,...,...,...
239,0,1,0,1,0,1
240,1,0,0,1,0,1
241,0,0,0,1,0,1
242,0,1,0,1,0,1


In [14]:
# Rebuild the feature table: numeric predictors side by side with the
# one-hot encoded categorical ones. Note that this rebinds `X`.
X = pd.concat(objs=[neumerical, cat_neumerical], axis="columns")
X

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,1,1,0,0,1,1
1,10.34,3,0,1,0,0,1,1
2,21.01,3,0,1,0,0,1,1
3,23.68,2,0,1,0,0,1,1
4,24.59,4,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...
239,29.03,3,0,1,0,1,0,1
240,27.18,2,1,0,0,1,0,1
241,22.67,2,0,0,0,1,0,1
242,17.82,2,0,1,0,1,0,1


In [16]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [17]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both splits so no test-set information reaches the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Linear Regression

In [18]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the scaled training data.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# `regressor` refers to the same fitted estimator object as `lin_reg`.
regressor = lin_reg


In [19]:
# Predict tips for the held-out rows with whichever model `regressor` currently
# refers to (the linear model when the cells are run in order).
y_pred = regressor.predict(X_test)

In [22]:
from sklearn import metrics

# Report test-set error as (MAE, RMSE).
lin_mae = metrics.mean_absolute_error(y_test, y_pred)
lin_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
(lin_mae, lin_rmse)

(0.7418702008220062, 0.9636287548943023)

## KNN Regression
KNN stands for K-nearest neighbors. KNN is a lazy learning algorithm that makes predictions based on the Euclidean distance between data points.
### Why Use KNN Algorithm? 
KNN algorithm is particularly useful because: 
1. KNN Algorithm doesn’t assume any relationship between the features. 
2. Useful for a dataset where data localization is important. 
3. You only have to tune the parameter K, which is the number of nearest neighbors. 
4. No training is needed, as it is a lazy learning algorithm. 
5. Recommender systems and finding semantic similarity between the documents are major applications of the KNN algorithm. 
### Disadvantages of KNN Algorithm
Following are the disadvantages of the KNN algorithm: 
1. You have to find the optimal value for K, which is not easy. 
2. Not suitable for very high-dimensional data.

In [31]:
# Check the number of dimensions of each split (feature arrays vs. target arrays).
X_train.ndim,y_train.ndim,X_test.ndim,y_test.ndim

(2, 1, 2, 1)

In [35]:
# Inspect the scaled training features: container type, shape, rank, and values.
type(X_train),X_train.shape,X_train.ndim,X_train

(numpy.ndarray,
 (183, 8),
 2,
 array([[-0.46952477, -0.62393978, -0.70710678, ...,  1.31580518,
         -0.65550553,  0.63003696],
        [ 0.12508676, -0.62393978, -0.70710678, ..., -0.75999093,
         -0.65550553, -1.58720848],
        [ 0.55187335,  1.55093602,  1.41421356, ..., -0.75999093,
          1.52554014,  0.63003696],
        ...,
        [-1.04301925, -0.62393978,  1.41421356, ..., -0.75999093,
         -0.65550553, -1.58720848],
        [ 1.37432668,  1.55093602, -0.70710678, ..., -0.75999093,
          1.52554014,  0.63003696],
        [-1.42090322, -0.62393978, -0.70710678, ..., -0.75999093,
          1.52554014,  0.63003696]]))

In [36]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor

# Fit a 5-nearest-neighbours regressor on the scaled training data and
# predict the held-out rows. Note that this rebinds `regressor` and `y_pred`.
knn_reg = KNeighborsRegressor(n_neighbors=5)
regressor = knn_reg.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Report test-set error as (MAE, MSE, RMSE); RMSE is the square root of the MSE.
knn_mae = metrics.mean_absolute_error(y_test, y_pred)
knn_mse = metrics.mean_squared_error(y_test, y_pred)
(knn_mae, knn_mse, np.sqrt(knn_mse))

(0.7945901639344262, 0.9892809836065573, 0.9946260521455073)

# Random Forest Regressor
## Why Use Random Forest Algorithm? 
Random forest algorithm is particularly useful when: 
1. You have lots of missing data or an imbalanced dataset. 
2. With a large number of trees, you can avoid overfitting while training. Overfitting occurs when machine learning models perform better on the training set but worse on the test set. 
3. Random forest algorithms can be used when you have very high-dimensional data. 
4. Through cross-validation, the random forest can return higher accuracy. 
5. The random forest algorithm can solve both classification and regression tasks and finds its application in a variety of tasks ranging from credit card fraud detection, stock market prediction, and finding fraudulent online transactions. 
## Disadvantages of Random Forest Algorithm 
There are two major disadvantages of Random forest algorithms: 
1. Using a large number of trees can slow down the algorithm. 
2. Random forest is a predictive rather than a descriptive algorithm: it can predict future outcomes, but it cannot explain what happened in the past using the dataset.

In [41]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 500-tree random forest on the scaled training data; the fixed seed
# keeps the fitted forest repeatable between runs. Note that this rebinds
# `regressor`.
rfr = RandomForestRegressor(random_state=42,n_estimators=500)

regressor = rfr.fit(X_train,y_train)

# Predict the tip for a single record: the row at position 100 of the full
# (unscaled) feature table. Indexing with a list (`iloc[[100]]`) returns a
# one-row DataFrame, so the row keeps its column names and per-column dtypes
# when it is passed to the scaler that was fitted on a DataFrame. Going through
# `X.values[100].reshape(1,-1)` would hand the scaler a bare NumPy row instead.
# NOTE: the row is taken from the full table, so it may have been part of the
# training split; this is a usage demo, not an evaluation.
single_reccord = scaler.transform(X.iloc[[100]])

predicted_tip = regressor.predict(single_reccord)





In [42]:
# Show the tip predicted for the single record.
predicted_tip

array([2.252])

In [43]:
# Multioutput Regression

from sklearn.datasets import make_regression

# Generate a synthetic regression problem with 8 features (4 of them
# informative) and 3 targets per sample.
# NOTE: this rebinds `y`, which previously held the tips target.
x, y = make_regression(
    n_samples=2000,
    n_features=8,
    n_informative=4,
    n_targets=3,
    noise=0.3,
    random_state=42,
)
(x.shape, y.shape)

((2000, 8), (2000, 3))

In [44]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the synthetic samples for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

# NOTE: this re-fits the existing `scaler` object, so from here on its
# statistics describe the synthetic training data, not the tips data.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [46]:
# Confirm the shapes of the train/test splits of the synthetic data.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((1500, 8), (500, 8), (1500, 3), (500, 3))

In [47]:
from sklearn.linear_model import LinearRegression

# Fit one linear regression on the multi-column target and predict the
# held-out samples. Note that this rebinds `y_pred`.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [49]:
from sklearn import metrics

# Mean absolute error on the held-out samples; with the default settings the
# result is a single number even though there are three target columns.
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

0.24780504290964134