# **C23-PR487 ~ LapakIn**

> **Assumption**  
> Successful MSMEs defined by: Rating >= 4 and Review >= 6 on Google Maps 

# Setup

In [182]:
import tensorflow as tf
import numpy as np
import pandas as pd
import joblib
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense

In [183]:
df_raw = pd.read_csv('https://raw.githubusercontent.com/C23PR487/Capstone-Project/main/ML/successful_msme.csv')

# EDA & Preprocessing

In [184]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 635 entries, 0 to 634
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   url                      635 non-null    object 
 1   nama toko                635 non-null    object 
 2   rating                   635 non-null    float64
 3   jumlah review            635 non-null    int64  
 4   kategori                 635 non-null    object 
 5   alamat                   635 non-null    object 
 6   kota                     635 non-null    object 
 7   jumlah mall terdekat     635 non-null    int64  
 8   jumlah kantor terdekat   635 non-null    int64  
 9   jumlah sekolah terdekat  635 non-null    int64  
 10  nama_mall                494 non-null    object 
 11  nama_kantor              454 non-null    object 
 12  nama_sekolah             581 non-null    object 
 13  label                    635 non-null    object 
dtypes: float64(1), int64(4), o

In [185]:
df = df_raw[['jumlah mall terdekat', 'jumlah kantor terdekat', 'jumlah sekolah terdekat', 'label']]

In [186]:
df.head()

Unnamed: 0,jumlah mall terdekat,jumlah kantor terdekat,jumlah sekolah terdekat,label
0,1,4,8,toko_kopi
1,6,7,2,toko_kopi
2,3,1,10,toko_kopi
3,0,4,2,toko_kopi
4,1,4,3,toko_kopi


In [187]:
df.describe()

Unnamed: 0,jumlah mall terdekat,jumlah kantor terdekat,jumlah sekolah terdekat
count,635.0,635.0,635.0
mean,2.508661,2.259843,5.132283
std,2.480429,2.36434,3.459748
min,0.0,0.0,0.0
25%,1.0,0.0,2.0
50%,2.0,2.0,5.0
75%,4.0,4.0,7.0
max,13.0,12.0,16.0


## Check Duplicate

In [188]:
df_raw.duplicated().sum()

0

## Check Null Value

In [189]:
df.isnull().sum()

jumlah mall terdekat       0
jumlah kantor terdekat     0
jumlah sekolah terdekat    0
label                      0
dtype: int64

## Check Outlier

In [190]:
def compute_reasonable_bound(selected_attribute):
    Q1 = selected_attribute.quantile(0.25)
    Q3 = selected_attribute.quantile(0.75)
    IQR = Q3-Q1
    return [Q3 + (1.5*IQR), Q1 - (1.5*IQR)]
    
def compute_outliers(attribute, dataset):
    selected_attribute = dataset.sort_values(by=[attribute])[attribute]
    [RUB, RLB] = compute_reasonable_bound(selected_attribute)
    return len(dataset[(dataset[attribute] > RUB) | (dataset[attribute] < RLB)])

def check_outlier(dataset):
    attributes = []
    outliers = []
    for attribute in df:
        if dataset[attribute].dtype != 'O':
            many_outliers = compute_outliers(attribute, dataset)
            percentage_of_outlier = (many_outliers/len(dataset[attribute])) * 100
            attributes.append(attribute)
            outliers.append(percentage_of_outlier)

    data = {'attribute': attributes, 'percentage of outlier (%)': outliers}
    outlier_df = pd.DataFrame(data)
    return outlier_df

In [191]:
check_outlier(df)

Unnamed: 0,attribute,percentage of outlier (%)
0,jumlah mall terdekat,4.409449
1,jumlah kantor terdekat,0.15748
2,jumlah sekolah terdekat,0.629921


## Encode Label

In [192]:
label_encoder = preprocessing.LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = label_encoder.fit_transform(df['label'])


In [193]:
list(label_encoder.inverse_transform([0, 1, 2, 3, 4]))

['toko_kopi', 'usaha_baju', 'usaha_fotokopi', 'usaha_laundry', 'usaha_makanan']

## Split Data

In [194]:
X = df.drop(['label'],axis=1)
y = df['label']

In [195]:
# Get 80% of the dataset as the training set. Put the remaining 20% as cross validation set
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.20, random_state=1)

## Normalize

In [196]:
X_train.describe()

Unnamed: 0,jumlah mall terdekat,jumlah kantor terdekat,jumlah sekolah terdekat
count,508.0,508.0,508.0
mean,2.356299,2.104331,5.088583
std,2.32494,2.266065,3.419117
min,0.0,0.0,0.0
25%,1.0,0.0,2.0
50%,2.0,1.0,5.0
75%,3.0,3.0,7.0
max,13.0,12.0,16.0


In [197]:
scaler = StandardScaler()
scaler.fit(X_train)

columns = ['jumlah mall terdekat', 'jumlah kantor terdekat', 'jumlah sekolah terdekat']
X_train = pd.DataFrame(scaler.transform(X_train), columns=columns)
X_cv = pd.DataFrame(scaler.transform(X_cv), columns=columns)

In [198]:
# dump(scaler, open('lapakin_scaler.pkl', 'wb'))
joblib.dump(scaler, 'lapakin_scaler.pkl')

['lapakin_scaler.pkl']

# Modeling

In [199]:
model = Sequential(
    [
        Dense(100, activation = 'relu'),
        Dense(50, activation = 'relu'),
        Dense(25, activation = 'relu'),
        Dense(15, activation = 'relu'),
        Dense(5, activation = 'linear')
    ]
)

model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=['accuracy']
)

model.fit(
    X_train,y_train,
    epochs=600
)

Epoch 1/600
Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78

<keras.callbacks.History at 0x7f64c9638b50>

In [200]:
def get_predicted_label(y_result):
  rows = y_result.shape[0]
  y_pred = np.zeros(rows)
  for row in range(rows):
      y_pred[row] = np.argmax(y_result[row])
  return y_pred

In [201]:
y_result = model.predict(X_train)
y_pred = get_predicted_label(y_result)
print(f"training MSE (using sklearn function): {mean_squared_error(y_train, y_pred) / 2}")

training MSE (using sklearn function): 1.0580708661417322


In [202]:
y_result = model.predict(X_cv)
y_pred = get_predicted_label(y_result)
print(f"cv MSE (using sklearn function): {mean_squared_error(y_cv, y_pred) / 2}")

cv MSE (using sklearn function): 2.578740157480315


In [203]:
X_cv_new_sample = pd.DataFrame([[12, 11, 2],[11, 4, 2],[3, 8, 12]], columns = ['jumlah mall terdekat', 'jumlah kantor terdekat', 'jumlah sekolah terdekat'], dtype = float)
X_cv_new_sample = scaler.transform(X_cv_new_sample)
y_result_new_sample = model.predict(X_cv_new_sample)
y_pred_new_sample = get_predicted_label(y_result_new_sample)
y_pred_new_sample



array([0., 3., 2.])

In [204]:
model.save('lapakin_model.h5')

# Prediction

In [205]:
import numpy as np
import pandas as pd
import joblib
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from keras.models import load_model

# label = ['toko_kopi', 'usaha_baju', 'usaha_fotokopi', 'usaha_laundry', 'usaha_makanan']

def get_predicted_label(y_result):
  rows = y_result.shape[0]
  y_pred = np.zeros(rows)
  for row in range(rows):
      y_pred[row] = np.argmax(y_result[row])
  return y_pred

model = load_model('./lapakin_model.h5')
scaler = joblib.load('lapakin_scaler.pkl') 

X_cv_new_sample = pd.DataFrame([[12, 11, 2],[11, 4, 2],[3, 8, 12]], columns = ['jumlah mall terdekat', 'jumlah kantor terdekat', 'jumlah sekolah terdekat'], dtype = float)
X_cv_new_sample = scaler.transform(X_cv_new_sample)

y_result_new_sample = model.predict(X_cv_new_sample)
y_pred_new_sample = get_predicted_label(y_result_new_sample)
print(y_pred_new_sample)

[0. 3. 2.]
