In [231]:
import numpy as np
import sys
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam
from sklearn import datasets
from sklearn.ensemble import GradientBoostingClassifier
from keras.models import Sequential  
from keras.layers import Dense, Dropout  
from types import new_class
from keras.layers import Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.wrappers.scikit_learn import KerasRegressor


In [232]:
# importamos a drive a google colab
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [233]:
# cargamos nuestro dataset
data_patch = '/content/drive/MyDrive/Colab Notebooks/supermarket_sales.csv'
dataset = pd.read_csv(data_patch)
dataset.head()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,1/27/2019,20:33,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3


In [234]:
# vemos que no tenemos datos nulos
dataset.isnull().sum()

Invoice ID                 0
Branch                     0
City                       0
Customer type              0
Gender                     0
Product line               0
Unit price                 0
Quantity                   0
Tax 5%                     0
Total                      0
Date                       0
Time                       0
Payment                    0
cogs                       0
gross margin percentage    0
gross income               0
Rating                     0
dtype: int64

In [235]:
dataset.shape

(1000, 17)

In [236]:
# sustituimos todos los espacio en blanco de las entradas por _ (guion bajo)
dataset.columns = dataset.columns.str.lower().str.replace(" ","_")


In [237]:
strings = list(dataset.dtypes[dataset.dtypes == 'object'].index)
for col in strings:
  dataset[col] = dataset[col].str.lower().str.replace(" ", "_")

In [238]:
dataset

Unnamed: 0,invoice_id,branch,city,customer_type,gender,product_line,unit_price,quantity,tax_5%,total,date,time,payment,cogs,gross_margin_percentage,gross_income,rating
0,750-67-8428,a,yangon,member,female,health_and_beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,c,naypyitaw,normal,female,electronic_accessories,15.28,5,3.8200,80.2200,3/8/2019,10:29,cash,76.40,4.761905,3.8200,9.6
2,631-41-3108,a,yangon,normal,male,home_and_lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,credit_card,324.31,4.761905,16.2155,7.4
3,123-19-1176,a,yangon,member,male,health_and_beauty,58.22,8,23.2880,489.0480,1/27/2019,20:33,ewallet,465.76,4.761905,23.2880,8.4
4,373-73-7910,a,yangon,normal,male,sports_and_travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,ewallet,604.17,4.761905,30.2085,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,233-67-5758,c,naypyitaw,normal,male,health_and_beauty,40.35,1,2.0175,42.3675,1/29/2019,13:46,ewallet,40.35,4.761905,2.0175,6.2
996,303-96-2227,b,mandalay,normal,female,home_and_lifestyle,97.38,10,48.6900,1022.4900,3/2/2019,17:16,ewallet,973.80,4.761905,48.6900,4.4
997,727-02-1313,a,yangon,member,male,food_and_beverages,31.84,1,1.5920,33.4320,2/9/2019,13:22,cash,31.84,4.761905,1.5920,7.7
998,347-56-2442,a,yangon,normal,male,home_and_lifestyle,65.82,1,3.2910,69.1110,2/22/2019,15:33,cash,65.82,4.761905,3.2910,4.1


In [239]:
# One Hot Encoding de gender
dataset = pd.get_dummies(data=dataset, columns=['gender'])
# One Hot Encoding de product_line
dataset =  pd.get_dummies(data=dataset, columns=['product_line'])
# One Hot Encoding de branch
dataset =  pd.get_dummies(data=dataset, columns=['branch'])
# One Hot Encoding de city
dataset =  pd.get_dummies(data=dataset, columns=['city'])
# One Hot Encoding de customer_type
dataset =  pd.get_dummies(data=dataset, columns=['customer_type'])
# One Hot Encoding de payment
dataset =  pd.get_dummies(data=dataset, columns=['payment'])

In [240]:
dataset

Unnamed: 0,invoice_id,unit_price,quantity,tax_5%,total,date,time,cogs,gross_margin_percentage,gross_income,...,branch_b,branch_c,city_mandalay,city_naypyitaw,city_yangon,customer_type_member,customer_type_normal,payment_cash,payment_credit_card,payment_ewallet
0,750-67-8428,74.69,7,26.1415,548.9715,1/5/2019,13:08,522.83,4.761905,26.1415,...,0,0,0,0,1,1,0,0,0,1
1,226-31-3081,15.28,5,3.8200,80.2200,3/8/2019,10:29,76.40,4.761905,3.8200,...,0,1,0,1,0,0,1,1,0,0
2,631-41-3108,46.33,7,16.2155,340.5255,3/3/2019,13:23,324.31,4.761905,16.2155,...,0,0,0,0,1,0,1,0,1,0
3,123-19-1176,58.22,8,23.2880,489.0480,1/27/2019,20:33,465.76,4.761905,23.2880,...,0,0,0,0,1,1,0,0,0,1
4,373-73-7910,86.31,7,30.2085,634.3785,2/8/2019,10:37,604.17,4.761905,30.2085,...,0,0,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,233-67-5758,40.35,1,2.0175,42.3675,1/29/2019,13:46,40.35,4.761905,2.0175,...,0,1,0,1,0,0,1,0,0,1
996,303-96-2227,97.38,10,48.6900,1022.4900,3/2/2019,17:16,973.80,4.761905,48.6900,...,1,0,1,0,0,0,1,0,0,1
997,727-02-1313,31.84,1,1.5920,33.4320,2/9/2019,13:22,31.84,4.761905,1.5920,...,0,0,0,0,1,1,0,1,0,0
998,347-56-2442,65.82,1,3.2910,69.1110,2/22/2019,15:33,65.82,4.761905,3.2910,...,0,0,0,0,1,0,1,1,0,0


In [241]:
# encontrando las variables categoricas en el dataset
dataset.select_dtypes(include = 'object')

Unnamed: 0,invoice_id,date,time
0,750-67-8428,1/5/2019,13:08
1,226-31-3081,3/8/2019,10:29
2,631-41-3108,3/3/2019,13:23
3,123-19-1176,1/27/2019,20:33
4,373-73-7910,2/8/2019,10:37
...,...,...,...
995,233-67-5758,1/29/2019,13:46
996,303-96-2227,3/2/2019,17:16
997,727-02-1313,2/9/2019,13:22
998,347-56-2442,2/22/2019,15:33


In [242]:
# descartando variables del dataset
dataset.drop(['invoice_id', 'time', 'date'], axis = 1, inplace = True)

In [243]:
# variables numericas
dataset.select_dtypes(include = 'number')

Unnamed: 0,unit_price,quantity,tax_5%,total,cogs,gross_margin_percentage,gross_income,rating,gender_female,gender_male,...,branch_b,branch_c,city_mandalay,city_naypyitaw,city_yangon,customer_type_member,customer_type_normal,payment_cash,payment_credit_card,payment_ewallet
0,74.69,7,26.1415,548.9715,522.83,4.761905,26.1415,9.1,1,0,...,0,0,0,0,1,1,0,0,0,1
1,15.28,5,3.8200,80.2200,76.40,4.761905,3.8200,9.6,1,0,...,0,1,0,1,0,0,1,1,0,0
2,46.33,7,16.2155,340.5255,324.31,4.761905,16.2155,7.4,0,1,...,0,0,0,0,1,0,1,0,1,0
3,58.22,8,23.2880,489.0480,465.76,4.761905,23.2880,8.4,0,1,...,0,0,0,0,1,1,0,0,0,1
4,86.31,7,30.2085,634.3785,604.17,4.761905,30.2085,5.3,0,1,...,0,0,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,40.35,1,2.0175,42.3675,40.35,4.761905,2.0175,6.2,0,1,...,0,1,0,1,0,0,1,0,0,1
996,97.38,10,48.6900,1022.4900,973.80,4.761905,48.6900,4.4,1,0,...,1,0,1,0,0,0,1,0,0,1
997,31.84,1,1.5920,33.4320,31.84,4.761905,1.5920,7.7,0,1,...,0,0,0,0,1,1,0,1,0,0
998,65.82,1,3.2910,69.1110,65.82,4.761905,3.2910,4.1,0,1,...,0,0,0,0,1,0,1,1,0,0


In [244]:
X = dataset.loc[:, dataset.columns != 'quantity']
y = dataset.loc[:, dataset.columns == 'quantity']

In [245]:
# dividiendo la data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, shuffle = True,  random_state = 1)

X_train.shape, X_test.shape

((800, 26), (200, 26))

In [246]:
# diviendo por 2da vez los datos para test y training
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train, y_train, test_size= 0.2, shuffle = True,  random_state = 1)

#normalizando la data
std_scaler = StandardScaler()

# partionamos los datos 80/20 
X_train2 = std_scaler.fit_transform(X_train2)
X_test2 = std_scaler.transform(X_test2)




In [247]:
# creamos nuestro parametros para generar las 4 arquitecturas
learnRate = [0.1,0.2,0.5]
batchSize = [14,10,20,25]
epochs = [30,20]
activation_fn = ['relu']
layers = [(15,10),(15),(30,20)]
n_inputs= [X_train2.shape[1]]


In [248]:
paramgrid = dict(learnRate = learnRate, batch_size = batchSize, epochs = epochs, activation_fn = activation_fn, layers = layers, n_inputs = n_inputs)
paramgrid

{'activation_fn': ['relu'],
 'batch_size': [14, 10],
 'epochs': [30, 20],
 'layers': [14, 15],
 'learnRate': [0.1],
 'n_inputs': [26]}

In [249]:
X_train2.shape

(640, 26)

In [250]:

def create_dynamicNeuralNet (n_inputs, activation_fn, learnRate, layers):

  modelov1 = Sequential()
  modelov1.add(Dense(input_dim = n_inputs, units = 7, kernel_initializer='uniform', activation = activation_fn ))
  modelov1.add(Dense(units = layers, kernel_initializer='uniform', activation = activation_fn )) 
  #modelov1.add(Dense(units = layers, kernel_initializer='uniform', activation = activation_fn ))

  modelov1.add(Dense(units = 1, kernel_initializer='uniform', activation = activation_fn ))


  
  modelov1.compile(optimizer = Adam(learnRate), loss='mean_squared_error', metrics = ['mean_squared_error']) 

  return modelov1  

In [251]:
# generamos nuestro modelo
mymodel = KerasRegressor(build_fn=create_dynamicNeuralNet, verbose = 0)

  """Entry point for launching an IPython kernel.


In [252]:
# generamos
grid_out = GridSearchCV(estimator=mymodel, param_grid=paramgrid, cv=10)

In [253]:
grid_out

GridSearchCV(cv=10,
             estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x7fc1efb742d0>,
             param_grid={'activation_fn': ['relu'], 'batch_size': [14, 10],
                         'epochs': [30, 20], 'layers': [14, 15],
                         'learnRate': [0.1], 'n_inputs': [26]})

In [254]:
grid_resul  = grid_out.fit(X_train, y_train)

In [255]:
grid_out.best_score_

-13.974037909507752

In [256]:
# vemos nuestra mejor combinacion de parametros
grid_out.best_params_

{'activation_fn': 'relu',
 'batch_size': 10,
 'epochs': 20,
 'layers': 15,
 'learnRate': 0.1,
 'n_inputs': 26}

In [267]:
# calculando nuestra prediccion
y_pred = grid_resul.predict(X_test) 

(200, 1)

In [276]:
# calculndo el MSE
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
mse = mean_squared_error(y_test, y_pred)
mse

9.006869423746402

In [298]:
# calculando el RMSE
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
import math
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
rmse

3.0011446855735566

In [278]:
# calculando el r2.
r2 = r2_score(y_test, y_pred)
r2

-0.029356505571017433