## SETTING UP DATASET

In [1]:
import pandas as pd
from utils.tfidf import *
import numpy as np

# Load the raw training table (comma-separated) and preview its first rows.
# NOTE(review): the output saved with this cell is a ParserError
# ("Expected 12 fields in line 5, saw 35") -- confirm the file's quoting and
# delimiter before relying on this load in a fresh kernel.
df = pd.read_csv("./data/data_train2.csv", sep=",")
df.head()

ParserError: Error tokenizing data. C error: Expected 12 fields in line 5, saw 35


In [60]:
# Read the three TF-IDF lookup tables (minat, bakat, mapel) in one pass;
# the order of the paths matches the order of the names on the left.
df_minat, df_bakat, df_mapel = map(
    pd.read_csv,
    (
        './data/tfidf_minat.csv',
        './data/tfidf_bakat.csv',
        './data/tfidf_mapel.csv',
    ),
)

In [61]:
# Snapshot of the raw (un-stripped) labels, kept under the same name in case
# another cell still refers to it.
prodi = df['prodi'].tolist()

# Trim surrounding whitespace from every label. The vectorised string
# accessor leaves missing values as NaN instead of raising AttributeError the
# way a per-element `p.strip()` does on a float NaN.
df['prodi'] = df['prodi'].str.strip()

In [62]:
def tfidf_mean(string, df):
    """Return the mean of the TF-IDF values that `evaluate_tfidf` yields.

    Parameters
    ----------
    string
        Text to score; forwarded unchanged to `evaluate_tfidf`.
    df
        TF-IDF lookup table; forwarded unchanged to `evaluate_tfidf`.

    Returns
    -------
    float
        Arithmetic mean of the values in the mapping returned by
        `evaluate_tfidf`, or 0.0 when that mapping holds no values.
    """
    tfidf_value = evaluate_tfidf(string, df)
    weights = list(tfidf_value.values())
    if not weights:
        # np.mean([]) emits a RuntimeWarning and returns NaN, which would then
        # leak into the *_score feature columns; an empty result scores 0.
        return 0.0
    return np.mean(weights)

In [63]:
# Score each text column against its own TF-IDF table: every cell value is
# passed to tfidf_mean together with the matching lookup frame.
df['minat_score'] = df['minat'].apply(tfidf_mean, args=(df_minat,))
df['bakat_score'] = df['bakat'].apply(tfidf_mean, args=(df_bakat,))
df['mapel_score'] = df['mapel'].apply(tfidf_mean, args=(df_mapel,))

In [64]:
df.head()

Unnamed: 0,no,prodi,minat,bakat,mapel,mtk,biologi,fisika,kimia,minat_score,bakat_score,mapel_score
0,0,akuntansi,"['minat', 'aktivias', 'hubung', 'tugas', 'ruti...","['bakat', 'aktivias', 'hubung', 'tugas', 'ruti...","['sejarah', 'geografi']",0,0,0,0,0.2859,0.309112,6.550802
1,1,teknik informatika,"['minat', 'aktivitas', 'laku', 'luar', 'ruang'...","['bakat', 'aktivitas', 'kait', 'musik', 'baik'...",['sejarah'],0,0,0,0,0.217867,0.325761,10.294118
2,2,teknik informatika,"['minat', 'aktivias', 'hubung', 'tugas', 'ruti...","['bakat', 'aktivias', 'hubung', 'tugas', 'ruti...","['olah', 'video']",0,0,0,0,0.198672,0.239341,87.5
3,3,akuntansi,"['minat', 'aktivias', 'hubung', 'tugas', 'ruti...","['bakat', 'aktivias', 'hubung', 'tugas', 'ruti...","['mtk', 'kimia', 'bahasa', 'indonesia']",0,0,0,0,0.2859,0.207605,0.881995
4,4,teknik informatika,"['minat', 'aktivitas', 'hubung', 'mesin', 'ala...","['bakat', 'aktivitas', 'hubung', 'mesin', 'ala...","['biologi', 'bahasa', 'inggris', 'bahasa', 'in...",0,0,0,0,0.092461,0.102315,0.743029


In [65]:
# The raw text columns have been reduced to their *_score counterparts,
# so remove them from the frame and preview the result.
df = df.drop(columns=['minat', 'bakat', 'mapel'])
df.head()

Unnamed: 0,no,prodi,mtk,biologi,fisika,kimia,minat_score,bakat_score,mapel_score
0,0,akuntansi,0,0,0,0,0.2859,0.309112,6.550802
1,1,teknik informatika,0,0,0,0,0.217867,0.325761,10.294118
2,2,teknik informatika,0,0,0,0,0.198672,0.239341,87.5
3,3,akuntansi,0,0,0,0,0.2859,0.207605,0.881995
4,4,teknik informatika,0,0,0,0,0.092461,0.102315,0.743029


In [66]:
# Convert the subject-score columns from decimal-comma text ("7,5") to floats.
# Going through `astype(str)` makes the cell safe to re-run (values that are
# already numeric are handled) and lets missing values come out as NaN instead
# of raising AttributeError; values that are not numbers still raise ValueError.
for score_col in ('mtk', 'biologi', 'kimia', 'fisika'):
    as_text = df[score_col].astype(str).str.strip().str.replace(",", ".", regex=False)
    # to_numeric may infer an integer dtype when every value is whole;
    # force float so the column dtype matches the float() conversion it replaces.
    df[score_col] = pd.to_numeric(as_text).astype(float)

In [75]:
# Generate clean dataset for training.
# index=False keeps the row index out of the file; writing it would come back
# as an extra unnamed column the next time the CSV is read with a plain
# pd.read_csv (the frame loaded above already carries such an 'Unnamed: 0').

df.to_csv("./data/data_train_clean.csv", index=False)

## TRAINING

In [67]:
# Preprocessing data
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Feature matrix: the four subject scores plus the three TF-IDF mean scores.
X = df.loc[:, ['mtk', 'biologi', 'fisika', 'kimia', 'minat_score', 'bakat_score', 'mapel_score']]

# Target: the `prodi` label, encoded as integer class ids by the LabelEncoder.
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['prodi'])

# Show the label vocabulary; the position in this array is the encoded id.
print(le.classes_)

['administrasi publik' 'agribisnis' 'akuntansi' 'akuntansi manajemen'
 'analis kesehatan' 'bahasa inggris' 'biomedical engineering'
 'divteknik informatika' 'ekonomi islam' 'ekonomi pembangunan' 'farmasi'
 'fisika' 'fkip ekonomi' 'gizi' 'gizi dan dietetika' 'hub international'
 'hukum' 'hukum keluarga islam' 'ilmu dan teknologi pangan' 'ilmu hukum'
 'ilmu informasi dan perpustakaan' 'ilmu kelautan' 'ilmu komunikasi'
 'ilmu politik' 'jtd' 'kebidanan' 'kedokteran' 'kedokteran hewan'
 'keperawatan' 'keprawatan' 'kesmas' 'management business' 'manajemen'
 'manajemen bisnis' 'manajemen pemasaran' 'manajemen rekayasa konstruksi'
 'manajemen transportasi udara' 'matematika' 'nautika' 'pend dokter hewan'
 'pendidikan agama islam' 'pendidikan bahasa inggris' 'pendidikan biologi'
 'pendidikan dokter umum pdu' 'pendidikan guru sekolah dasar'
 'pendidikan kimia' 'pendidikan seni tari dan musik'
 'pendidikan teknik mesin' 'perbankan dan keuangan' 'perbankan syariah'
 'perpajakan' 'peternakan' 'pg p

In [68]:
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21
)

# Fit the scaler on the training rows only, then apply that same
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [69]:
from sklearn.ensemble import RandomForestClassifier

# Train a depth-limited random forest using the entropy split criterion;
# fit() returns the estimator itself, so construction and training are chained.
classifier = RandomForestClassifier(
    criterion='entropy',
    max_depth=10,
    random_state=0,
).fit(X_train, y_train)

# Accuracy measured on the same rows the model was trained on.
classifier.score(X_train, y_train)

1.0

In [70]:
# Number of input features the fitted classifier recorded during fit.
classifier.n_features_in_

7

In [71]:
# Predicted labels for the held-out rows, as the integer ids produced by `le`.
classifier.predict(X_test)

array([64, 64, 64, 64, 13, 22, 60, 64, 64, 64,  9, 58, 64,  2, 64, 64,  2,
       64, 64, 64, 60,  3, 14, 61,  2, 64,  1,  2, 64, 69, 28, 58, 60, 69,
       60,  1, 22,  2, 28, 61, 65, 14,  3,  2,  2, 64, 22,  2, 64, 56, 64,
       69, 58])

In [73]:
# Held-out accuracy: the fraction of test rows whose predicted label id
# equals the true one (mean of the element-wise boolean comparison).
(classifier.predict(X_test) == y_test).mean()

0.1509433962264151