## SETTING UP DATASET

In [2]:
import pandas as pd
from utils.tfidf import *
import numpy as np

# Load the raw training table and preview its first rows.
df = pd.read_csv("./data/data_train5.csv", sep=",")
df.head(n=5)

Unnamed: 0,no,prodi,minat,bakat,mapel,mtk,biologi,fisika,kimia
0,0,akuntansi,"['minat', 'aktivias', 'hubung', 'tugas', 'ruti...","['bakat', 'aktivias', 'hubung', 'tugas', 'ruti...","['sejarah', 'geografi']",0,0,0,0
1,1,teknik informatika,"['minat', 'aktivitas', 'laku', 'luar', 'ruang'...","['bakat', 'aktivitas', 'kait', 'musik', 'baik'...",['sejarah'],0,0,0,0
2,2,teknik informatika,"['minat', 'aktivias', 'hubung', 'tugas', 'ruti...","['bakat', 'aktivias', 'hubung', 'tugas', 'ruti...","['olah', 'video']",0,0,0,0
3,3,akuntansi,"['minat', 'aktivias', 'hubung', 'tugas', 'ruti...","['bakat', 'aktivias', 'hubung', 'tugas', 'ruti...","['mtk', 'kimia', 'bahasa', 'indonesia']",0,0,0,0
4,4,teknik informatika,"['minat', 'aktivitas', 'hubung', 'mesin', 'ala...","['bakat', 'aktivitas', 'hubung', 'mesin', 'ala...","['biologi', 'bahasa', 'inggris', 'bahasa', 'in...",0,0,0,0


In [3]:
# One TF-IDF table per text column (minat / bakat / mapel); each is later
# passed to tfidf_mean together with the matching column of `df`.
df_minat, df_bakat, df_mapel = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/tfidf_minat.csv',
        './data/tfidf_bakat.csv',
        './data/tfidf_mapel.csv',
    )
)

In [4]:
# Remove leading/trailing whitespace from every `prodi` label.
prodi = list(df['prodi'])
df['prodi'] = [label.strip() for label in prodi]

In [5]:
def tfidf_mean(string, df):
    """Return the mean of the TF-IDF values computed for ``string``.

    Parameters
    ----------
    string
        Passed unchanged to ``evaluate_tfidf`` as the text to score.
    df
        Passed unchanged to ``evaluate_tfidf`` as the TF-IDF table.

    Returns
    -------
    float
        Arithmetic mean of the values of the mapping returned by
        ``evaluate_tfidf``, or ``0.0`` when that mapping is empty.

    Notes
    -----
    NOTE(review): ``evaluate_tfidf`` comes from ``utils.tfidf``; it is assumed
    to return a dict-like object (only ``.values()`` is used here) -- confirm.
    """
    tfidf_value = evaluate_tfidf(string, df)
    weights = list(tfidf_value.values())
    # np.mean([]) emits a RuntimeWarning and returns NaN, which would end up
    # in the score column; treat "nothing was scored" as a zero score instead.
    if not weights:
        return 0.0
    return np.mean(weights)

In [6]:
# For each text column, store the mean TF-IDF value of every row in a new
# numeric column, using the TF-IDF table that belongs to that text column.
for text_column, score_column, tfidf_table in (
    ('minat', 'minat_score', df_minat),
    ('bakat', 'bakat_score', df_bakat),
    ('mapel', 'mapel_score', df_mapel),
):
    df[score_column] = [tfidf_mean(text, tfidf_table) for text in df[text_column]]

In [7]:
# Preview the table with the three new *_score columns appended.
df.head(n=5)

Unnamed: 0,no,prodi,minat,bakat,mapel,mtk,biologi,fisika,kimia,minat_score,bakat_score,mapel_score
0,0,akuntansi,"['minat', 'aktivias', 'hubung', 'tugas', 'ruti...","['bakat', 'aktivias', 'hubung', 'tugas', 'ruti...","['sejarah', 'geografi']",0,0,0,0,0.2859,0.309112,6.550802
1,1,teknik informatika,"['minat', 'aktivitas', 'laku', 'luar', 'ruang'...","['bakat', 'aktivitas', 'kait', 'musik', 'baik'...",['sejarah'],0,0,0,0,0.217867,0.325761,10.294118
2,2,teknik informatika,"['minat', 'aktivias', 'hubung', 'tugas', 'ruti...","['bakat', 'aktivias', 'hubung', 'tugas', 'ruti...","['olah', 'video']",0,0,0,0,0.198672,0.239341,87.5
3,3,akuntansi,"['minat', 'aktivias', 'hubung', 'tugas', 'ruti...","['bakat', 'aktivias', 'hubung', 'tugas', 'ruti...","['mtk', 'kimia', 'bahasa', 'indonesia']",0,0,0,0,0.2859,0.207605,0.881995
4,4,teknik informatika,"['minat', 'aktivitas', 'hubung', 'mesin', 'ala...","['bakat', 'aktivitas', 'hubung', 'mesin', 'ala...","['biologi', 'bahasa', 'inggris', 'bahasa', 'in...",0,0,0,0,0.092461,0.102315,0.743029


In [8]:
# The text columns are now represented by their *_score columns; drop them.
df = df.drop(columns=['minat', 'bakat', 'mapel'])
df.head(n=5)

Unnamed: 0,no,prodi,mtk,biologi,fisika,kimia,minat_score,bakat_score,mapel_score
0,0,akuntansi,0,0,0,0,0.2859,0.309112,6.550802
1,1,teknik informatika,0,0,0,0,0.217867,0.325761,10.294118
2,2,teknik informatika,0,0,0,0,0.198672,0.239341,87.5
3,3,akuntansi,0,0,0,0,0.2859,0.207605,0.881995
4,4,teknik informatika,0,0,0,0,0.092461,0.102315,0.743029


In [9]:
# Convert the grade columns to float, accepting a decimal comma ("7,5").
# Values are passed through str first so the conversion also works when a
# column already holds numbers (e.g. parsed as int by read_csv, or converted
# by a previous run of this cell); calling `.replace` on each raw value
# raises AttributeError for any non-string entry.
for subject in ['mtk', 'biologi', 'kimia', 'fisika']:
    df[subject] = (
        df[subject]
        .astype(str)
        .str.replace(",", ".", regex=False)
        .astype(float)
    )

In [10]:
# Generate clean dataset for training.
# index=False: the default RangeIndex carries no information, and writing it
# adds an extra "Unnamed: 0" column every time the file is read back.

df.to_csv("./data/data_train5_clean.csv", index=False)

## TRAINING

In [28]:
# Start the training section from the cleaned CSV on disk.
df = pd.read_csv(filepath_or_buffer="./data/data_train5_clean.csv")

In [29]:
# Preprocessing data
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Feature matrix: the four subject grades plus the three TF-IDF mean scores.
feature_columns = ['mtk', 'biologi', 'fisika', 'kimia', 'minat_score', 'bakat_score', 'mapel_score']
X = df[feature_columns]

# Target: `prodi` names encoded as integer class ids. `le` is kept so the
# ids can be mapped back to names with le.inverse_transform.
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['prodi'])

In [30]:
from sklearn.preprocessing import MinMaxScaler

# Hold out 30% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=21,
)

# Learn the min/max of each feature on the training rows only, then apply
# the same scaling to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [37]:
from sklearn.ensemble import RandomForestClassifier

# Fit the forest on the training split (fit returns the estimator itself),
# then report its accuracy on the data it was trained on.
classifier = RandomForestClassifier(
    max_depth=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
classifier.score(X_train, y_train)

1.0

In [38]:
# Sanity check: number of input features the forest saw during fit; it should
# equal the seven columns selected into X.
classifier.n_features_in_

7

In [39]:
# Predicted class ids for the held-out rows. These are the integers produced
# by the label encoder `le`, not the `prodi` names themselves.
classifier.predict(X_test)

array([45, 45, 45, 45, 40, 15, 42, 45, 45, 45,  2, 15, 45,  2, 45,  2,  2,
       45, 45, 45, 42,  2,  9, 42,  2, 45,  1,  2, 45, 48, 48, 40, 42, 48,
       42,  1, 15,  2, 20, 42, 46,  9,  2,  2,  2, 45, 48,  2,  2,  9, 45,
       48, 40])

In [40]:
# Accuracy on the held-out split. `score` is the estimator's built-in mean
# accuracy (the same call used for the training-set check) and avoids
# summing a NumPy boolean array element by element with Python's sum().
classifier.score(X_test, y_test)

0.18867924528301888