# Config

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer


In [2]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


Using device: cpu


# Preprocess

In [3]:
import os
import pandas as pd

# Check if the file already exists
if not os.path.exists("umkm.csv"):
    !gdown --id 182o4f8pLuWfFC1RqdXuN6f7coiuGPlnv
# Load the data
data = pd.read_csv('umkm.csv', sep=',', encoding='latin1')


## Data Cleaning

In [4]:
# Columns that should hold numeric values. Entries that cannot be parsed
# as numbers are coerced to NaN by `errors='coerce'`.
float_columns = ['TOTAL_ASET', 'PENJUALAN_TAHUN', 'KEBUTUHAN_PEMBIAYAAN']
int_columns = ['KABUPATEN', 'PROVINSI', 'SEKTOR', 'TENAGA_KERJA']

# Map every column to the `downcast` argument it is converted with:
# None (no downcast) for the float columns, 'integer' for the integer ones.
downcast_by_column = {
    **dict.fromkeys(float_columns, None),
    **dict.fromkeys(int_columns, 'integer'),
}
for col, downcast in downcast_by_column.items():
    data[col] = pd.to_numeric(data[col], errors='coerce', downcast=downcast)

# Drop every row that has a missing value in any column.
data.dropna(inplace=True)

## Feature Engineering

In [5]:
# Encode the categorical columns as integer codes, keeping one fitted
# encoder per column in `label_encoders`.
# NOTE(review): every column is cast to str before encoding, so SEKTOR
# (converted to numeric earlier in this notebook) presumably receives codes
# in text order ("10" before "2") rather than numeric order -- confirm that
# this is intended before comparing the codes with raw sector numbers.
categorical_columns = ['NAMA', 'PENGUSAHA', 'USAHA_UTAMA', 'PRODUK_UTAMA', 'SEKTOR', 'KEBUTUHAN_PEMBIAYAAN_CLASS']

label_encoders = {name: LabelEncoder() for name in categorical_columns}
for col, le in label_encoders.items():
    data[col] = le.fit_transform(data[col].astype(str))

In [6]:
def classify_umkm(df):
    """Assign every row a size class between 0 and 4.

    The class blends two ordinal scores, each in the range 0-4:

    * the quintile of ``TOTAL_ASET`` within ``df`` (weight 0.6), and
    * a headcount tier derived from ``TENAGA_KERJA`` (weight 0.4):
      <= 2 -> 0 (Micro), <= 5 -> 1 (Small), <= 10 -> 2 (Medium),
      <= 20 -> 3 (Large), otherwise 4 (Very Large).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``TOTAL_ASET`` and ``TENAGA_KERJA``.

    Returns
    -------
    pandas.Series
        Integer class per row, aligned with ``df.index``.
    """
    # Quintile position (0-4) of each row by total assets.
    asset_quintile = pd.qcut(df['TOTAL_ASET'], q=5, labels=False)

    # Inclusive upper bounds of the first four headcount tiers. With
    # side='left', searchsorted returns how many bounds lie strictly below
    # the value, which is exactly the tier number (4 when above every bound).
    headcount_upper_bounds = [2, 5, 10, 20]
    headcount_tier = pd.Series(
        np.searchsorted(headcount_upper_bounds, df['TENAGA_KERJA'].to_numpy(), side='left'),
        index=df.index,
    )

    # Weighted blend of the two scores, rounded to the nearest whole class.
    weighted_score = asset_quintile * 0.6 + headcount_tier * 0.4
    rounded_class = np.round(weighted_score).astype(int)

    # Keep the result inside the 0-4 range.
    return np.clip(rounded_class, 0, 4)

# Attach the derived size class to every row as the UMKM_CLASS column.
data['UMKM_CLASS'] = classify_umkm(df=data)


In [7]:
# Remove the columns that are not used as features further down.
data.drop(
    columns=[
        'KEBUTUHAN_PEMBIAYAAN',
        'PRODUK_UTAMA',
        'NAMA',
        'PENGUSAHA',
        'USAHA_UTAMA',
        'TAHUN_PENELITIAN',
        'TOTAL_ASET',
        'TENAGA_KERJA',
        'PENJUALAN_TAHUN',
    ],
    inplace=True,
)

In [13]:
# Distinct encoded sector codes, in order of first appearance.
pd.unique(data["SEKTOR"])

array([ 0, 12, 13, 14, 15, 16, 18,  4,  6,  7, 11,  9,  8, 17,  1,  2,  3,
        5, 10])

In [8]:
# Summarise the remaining columns: dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2247 entries, 0 to 2268
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   KABUPATEN                   2247 non-null   int16  
 1   PROVINSI                    2247 non-null   int8   
 2   SEKTOR                      2247 non-null   int64  
 3   KEBUTUHAN_PEMBIAYAAN_CLASS  2247 non-null   int64  
 4   SENTIMENT                   2247 non-null   float64
 5   UMKM_ID                     2247 non-null   int64  
 6   UMKM_CLASS                  2247 non-null   int64  
dtypes: float64(1), int16(1), int64(4), int8(1)
memory usage: 111.9 KB


In [10]:
# Preview the first five rows of the prepared feature table.
data.head(n=5)

Unnamed: 0,KABUPATEN,PROVINSI,SEKTOR,KEBUTUHAN_PEMBIAYAAN_CLASS,SENTIMENT,UMKM_ID,UMKM_CLASS
0,7371,28,0,2,7.32,0,4
1,7210,27,0,2,3.57,1,2
2,6472,22,0,2,1.35,2,2
3,3312,15,0,2,1.73,3,1
4,7310,28,0,2,7.9,4,2


# Recommendation model (weighted cosine similarity)

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Feature table used for similarity scoring: every column except the identifier.
df = data.drop('UMKM_ID', axis=1)

# Feature weights: the factor each column is multiplied by before the
# similarity is computed. Key order defines the feature order.
feature_weights = dict(
    KABUPATEN=0.2,
    PROVINSI=0.2,
    SEKTOR=0.2,
    KEBUTUHAN_PEMBIAYAAN_CLASS=0.15,
    SENTIMENT=0.15,
    UMKM_CLASS=0.1,
)

# Recommendation function
def recommend_umkm(user_input, top_n=5):
    """Return the ``top_n`` rows of ``df`` most similar to a user's preferences.

    The user profile and every row of ``df`` are multiplied element-wise by
    the values of ``feature_weights`` and then compared with cosine
    similarity.

    Parameters
    ----------
    user_input : mapping
        Must provide the keys ``'KABUPATEN'``, ``'PROVINSI'``,
        ``'PREFERRED_SEKTOR'``, ``'MODAL'`` and ``'PREFERRED_UMKM_CLASS'``.
    top_n : int, default 5
        Number of rows to return. Zero or a negative value yields an empty
        frame.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` ordered from most to least similar. The index labels
        of ``df`` are preserved.

    Raises
    ------
    KeyError
        If ``user_input`` lacks one of the required keys.
    """
    # Guard the slice further down: `[-0:]` would select every row and a
    # negative count would select an arbitrary tail of the ranking.
    if top_n <= 0:
        return df.iloc[0:0]

    feature_names = list(feature_weights.keys())
    weights = list(feature_weights.values())

    # User profile, in the same order as the keys of `feature_weights`.
    # NOTE(review): 'MODAL' is compared against the KEBUTUHAN_PEMBIAYAAN_CLASS
    # column and the sentiment slot is a fixed 0.5 -- confirm that both are
    # on the same scale as the corresponding columns of `df`.
    user_profile = [
        user_input['KABUPATEN'],
        user_input['PROVINSI'],
        user_input['PREFERRED_SEKTOR'],
        user_input['MODAL'],
        0.5,  # Default sentiment
        user_input['PREFERRED_UMKM_CLASS'],
    ]

    # Apply the feature weights.
    # NOTE(review): no scaling is applied here, so features with large raw
    # magnitudes dominate the cosine similarity -- confirm whether the
    # features should be normalised first.
    user_vector = np.asarray(user_profile) * weights
    feature_matrix = df[feature_names].values * weights

    # Cosine similarity between the user vector and every row.
    similarities = cosine_similarity(user_vector.reshape(1, -1), feature_matrix)[0]

    # Positions of the top_n highest scores, best match first.
    top_indices = similarities.argsort()[-top_n:][::-1]
    return df.iloc[top_indices]


# Example usage

In [26]:
# Example usage: describe the user's location and preferences.
user_data = dict(
    KABUPATEN=1,
    PROVINSI=2,
    PREFERRED_SEKTOR=3,
    MODAL=500000,
    PREFERRED_UMKM_CLASS=1,
)

# Fetch the ten best matches, then look the same index labels up in a
# freshly loaded copy of the CSV so the original columns can be shown.
rekomendasi = recommend_umkm(user_data, top_n=10)
data_umkm = pd.read_csv('umkm.csv', sep=',', encoding='latin1')
data_umkm.loc[rekomendasi.index, :]

Unnamed: 0,TAHUN_PENELITIAN,NAMA,PENGUSAHA,KABUPATEN,PROVINSI,USAHA_UTAMA,PRODUK_UTAMA,SEKTOR,TENAGA_KERJA,TOTAL_ASET,PENJUALAN_TAHUN,KEBUTUHAN_PEMBIAYAAN,KEBUTUHAN_PEMBIAYAAN_CLASS,SENTIMENT,UMKM_ID
2031,2023,BUNGKES,Peti Vera,1106,1,Bubuk Kopi,olahan kopi,3,4,378.0,80.0,200.0,>100,6.92,2031
2032,2023,Rezki Batako,Rizkan Asyraf,1106,1,Mencetak BATAKO,Pembuatan BATAKO untuk Bangunan,3,5,541.5,672.0,100.0,>100,8.08,2032
1931,2023,Budidaya Udang Vaname,ZULKIFLI,1108,1,Seafood Bahan Mentah,Udang Vaname,1,8,141.2,180.0,160.0,>100,4.21,1931
2144,2023,Kak Na Gorden,ROSNA,1109,1,Toko Gorden,Gorden,7,2,1140.56,1362.0,101.25,>100,5.3,2144
1932,2023,Jual Beli Kambing M Nasir,M Nasir,1110,1,Jual Beli Kambing,Kambing,1,3,1376.0,5760.0,400.0,>100,7.24,1932
2145,2023,Ridha Car Interior,Muhammad Ridha,1110,1,Jok Mobil,Jok mobil,7,3,118.8,120.0,160.0,>100,6.42,2145
2146,2023,Cipta Karya Sembako,FIRDA,1112,1,Toko Kelontong Grosir,Sembako,7,5,354.5,556.92,120.0,>100,5.06,2146
2147,2023,UD Bangun,Latifatun Nizar,1114,1,Pupuk dan Pestisida,Alat Pertanian,7,3,1290.0,1080.0,200.0,>100,8.64,2147
1934,2023,Nano Nano,JUNAIDI,1117,1,Penanaman Cabai,Palawija,1,4,332.43,600.0,100.5,>100,2.52,1934
2260,2023,RHS Toko Kaca dan Alumunium,M ALI UMAR,1171,1,Kaca & Alumunium,Alumunium & Kaca,19,7,468.4,343.5,500.0,>100,2.61,2260
