In [250]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot
import seaborn as sns
from sklearn.model_selection import (train_test_split as tts,
                                     RandomizedSearchCV as rscv,
                                     cross_validate as cv)

from  sklearn.preprocessing import (MinMaxScaler, 
                                    StandardScaler)

from sklearn.linear_model import (LinearRegression,
                                  Ridge)
from sklearn.ensemble import (RandomForestRegressor,
                              GradientBoostingRegressor,
                              AdaBoostRegressor)

from sklearn.impute import SimpleImputer

from  sklearn.pipeline import  (Pipeline,
                                make_pipeline,
)

from sklearn import metrics

from sklearn.compose import (ColumnTransformer)

import warnings
from sklearn import set_config

In [132]:
warnings.filterwarnings("ignore")
sns.set_style("whitegrid")
set_config(display="diagram")

In [133]:
df=pd.read_csv("/Users/chenyenpin/Documents/test/MKT.csv")
df

Unnamed: 0,youtube,facebook,newspaper,sales
0,84.72,19.20,48.96,12.60
1,351.48,33.96,51.84,25.68
2,135.48,20.88,46.32,14.28
3,116.64,1.80,36.00,11.52
4,318.72,24.00,0.36,20.88
...,...,...,...,...
166,45.84,4.44,16.56,9.12
167,113.04,5.88,9.72,11.64
168,212.40,11.16,7.68,15.36
169,340.32,50.40,79.44,30.60


In [183]:
def check_df(dataset, head = 5):
    '''
    funcao gerada para printar no output o shape do nosso dataset, a info, as 5 primeiras linhas, 
    informacoes sobre valores nulos, a distirbuicao dos nossos dados e os quantiles
    :param dataset : Pandas DataFrame
    :param head :int , default =5. Numero de linhas que vai ser 
    mostrado
    '''
    print('#'*30 + 'Shape of Dataset' + '#'*30, end = '\n')
    print(dataset.shape, end = '\n')
    print('#'*30 + 'General informations about to Dataset' + '#'*30, end = '\n')
    print(dataset.info(), end = '\n')
    print('#'*30 + 'First 5 Lines Of Dataset' + '#'*30, end = '\n')
    print(dataset.head(head), end = '\n'*2)
    print(print('#'*30 + 'Is Nan data' + '#'*30, end = '\n'))
    print(dataset.isna().sum(), end = '\n'*2)
    print('#'*30 + 'Quantiles of Numerical Features' + '#'*30, end ='\n')
    print(dataset.describe([0,0.10, 0.25, 0.50,0.75,0.99]).T, end = '\n')



def calculate_metrics(model,x_test, y_test):
    ''' 
    funcao que vai te retornar as metricas necessarias par aanalisar a qualidade do modelo treinado
    :param model: modelo o seu dataset
    :param x_test: x_test
    :param y_test: y_test
    retorna os valores da sua metricas em forma de um dicionario
    '''
    pred=model.predict(x_test)
    evaluation={
        "R2":metrics.r2_score(y_test, pred),
        "Mae":metrics.mean_absolute_error(y_test, pred),
        "Mse":metrics.mean_squared_error(y_test, pred),
        "RMse":np.sqrt(metrics.mean_squared_error(y_test,pred))
    }
    return evaluation

In [135]:
check_df(df)

##############################Shape of Dataset##############################
(171, 4)
##############################General informations about to Dataset##############################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171 entries, 0 to 170
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   youtube    171 non-null    float64
 1   facebook   171 non-null    float64
 2   newspaper  171 non-null    float64
 3   sales      171 non-null    float64
dtypes: float64(4)
memory usage: 5.5 KB
None
##############################First 5 Lines Of Dataset##############################
   youtube  facebook  newspaper  sales
0    84.72     19.20      48.96  12.60
1   351.48     33.96      51.84  25.68
2   135.48     20.88      46.32  14.28
3   116.64      1.80      36.00  11.52
4   318.72     24.00       0.36  20.88

##############################Is Nan data##############################
None
youtube      0
facebook     0


In [144]:
x_col=["youtube","facebook","newspaper"]
y_col=["sales"]

In [136]:
FEATURES=df.drop(columns="sales")
TARGET=df["sales"]


In [137]:
TARGET

Unnamed: 0,sales
0,12.60
1,25.68
2,14.28
3,11.52
4,20.88
...,...
166,9.12
167,11.64
168,15.36
169,30.60


In [165]:
num_col=df.select_dtypes("number").columns.tolist()
num_col

['youtube', 'facebook', 'newspaper', 'sales']

In [146]:
x_train, x_test, y_train, y_test=tts (df[x_col], df[y_col], random_state=101,test_size=0.25 )

In [161]:
y_train

Unnamed: 0,sales
91,18.60
114,12.48
50,13.20
64,26.64
82,26.16
...,...
63,23.28
70,20.64
81,16.08
11,19.92


In [140]:
df

Unnamed: 0,youtube,facebook,newspaper,sales
0,84.72,19.20,48.96,12.60
1,351.48,33.96,51.84,25.68
2,135.48,20.88,46.32,14.28
3,116.64,1.80,36.00,11.52
4,318.72,24.00,0.36,20.88
...,...,...,...,...
166,45.84,4.44,16.56,9.12
167,113.04,5.88,9.72,11.64
168,212.40,11.16,7.68,15.36
169,340.32,50.40,79.44,30.60


In [218]:
x_trans = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocess = ColumnTransformer([
    ("preprocess", x_trans, x_col)
])

pipe = Pipeline([
    ("preprocess", preprocess),
    ("model", LinearRegression())
])

pipe

In [219]:
pipe.fit(x_train, y_train)


In [221]:
metric=(calculate_metrics(pipe, x_test, y_test))

In [222]:
metric

{'R2': 0.8628173523736165,
 'Mae': 1.399219538927246,
 'Mse': 5.055037016634923,
 'RMse': 2.248340947595565}

In [237]:
models={
    "lr": LinearRegression(),
    "ridge": Ridge(),
    "rf": RandomForestRegressor(),
    "ada": AdaBoostRegressor()
}

results=[]

for model_name , modelo in models.items():
    print(modelo)
    pipe.set_params(model=modelo).fit(x_train, y_train)
    # Predicting the Test set results
    result=calculate_metrics(pipe, x_test, y_test)
    # Appending the result of each model to a list containing all the results
    results.append((result))
df_results=pd.DataFrame(results).T
df_results.columns=["lr","ridge","rf", "ada"]
df_results


LinearRegression()
Ridge()


RandomForestRegressor()
AdaBoostRegressor()


Unnamed: 0,lr,ridge,rf,ada
R2,0.862817,0.862903,0.961305,0.930289
Mae,1.39922,1.394158,0.812121,1.234831
Mse,5.055037,5.051872,1.42587,2.568787
RMse,2.248341,2.247637,1.194098,1.602743


In [256]:
models={
    "lr": LinearRegression(),
    "ridge": Ridge(),
    "rf": RandomForestRegressor(),
    "ada": AdaBoostRegressor()}


cross_results=[]

for model_name, modelo in models.items():
    pipe.set_params(model=modelo).fit(x_train, y_train)
    cross=cv(pipe, x_train, y_train, n_jobs=-1, cv=5, verbose=1,scoring=(['r2']))
    cross_result=pd.DataFrame(cross).mean().to_frame(model_name).T
    cross_results.append(cross_result)

cross_results

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


[    fit_time  score_time   test_r2
 lr  0.027786    0.013491  0.902881,
        fit_time  score_time   test_r2
 ridge  0.019219    0.009525  0.902798,
     fit_time  score_time   test_r2
 rf  0.699787    0.025687  0.977581,
      fit_time  score_time   test_r2
 ada  0.237702    0.045296  0.956902]

In [258]:
df_cross=pd.concat(cross_results)
df_cross.sort_values("test_r2", ascending=False)

Unnamed: 0,fit_time,score_time,test_r2
rf,0.699787,0.025687,0.977581
ada,0.237702,0.045296,0.956902
lr,0.027786,0.013491,0.902881
ridge,0.019219,0.009525,0.902798


In [265]:
params={
"model":[
    LinearRegression(),
    Ridge(),
    RandomForestRegressor(),
    AdaBoostRegressor]}


random=rscv(pipe, params, cv=5, scoring="r2", verbose=1)
random.fit(x_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [266]:
random.best_estimator_

In [269]:
df_random=pd.DataFrame(random.cv_results_).sort_values("rank_test_score")

In [270]:

df_random.loc[:,~df_random.columns.str.contains("split|time")]
     

Unnamed: 0,param_model,params,mean_test_score,std_test_score,rank_test_score
2,RandomForestRegressor(),{'model': RandomForestRegressor()},0.978002,0.010556,1
0,LinearRegression(),{'model': LinearRegression()},0.902881,0.033575,2
1,Ridge(),{'model': Ridge()},0.902798,0.033538,3
3,<class 'sklearn.ensemble._weight_boosting.AdaB...,{'model': <class 'sklearn.ensemble._weight_boo...,,,4


In [298]:
pipe_rf = Pipeline([
    ("preprocess", preprocess),
    ("model", RandomForestRegressor())
])

In [302]:
params={
    "model__max_features": [1, 2, 3, 5, None],
    "model__max_leaf_nodes": [10, 100, 1000, None],
    "model__min_samples_leaf": [1, 2, 5, 10, 20, 50, 100]}

In [303]:
pipe_rf = Pipeline([
    ("preprocess", preprocess),
    ("model", RandomForestRegressor())
])

In [305]:
rf_tuned=rscv(pipe_rf, param_distributions=params, cv=3, scoring="r2", n_jobs=-1, verbose=1)
rf_tuned.fit(x_train, y_train)



Fitting 3 folds for each of 10 candidates, totalling 30 fits


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [306]:

df_random_final=pd.DataFrame(rf_tuned.cv_results_).set_index("rank_test_score").sort_index()
df_random_final.loc[:,~df_random_final.columns.str.contains("split|time")].head(1)
     

Unnamed: 0_level_0,param_model__min_samples_leaf,param_model__max_leaf_nodes,param_model__max_features,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2,1000,,"{'model__min_samples_leaf': 2, 'model__max_lea...",0.968782,0.011175


In [307]:
pipe_rf = Pipeline([
    ("preprocess", preprocess),
    ("model", RandomForestRegressor(min_samples_leaf=2, max_leaf_nodes=1000, max_features=None))
])

pipe_rf.fit(x_train, y_train)

calculate_metrics(pipe_rf,x_test, y_test)

{'R2': 0.9536896864461751,
 'Mae': 0.8703843199436203,
 'Mse': 1.7064865951860464,
 'RMse': 1.3063256084093453}