<h2 align="center">PREPROCESSING DATA</h2>

1. Data X menggunakan atribut yang bersifat numerik dan kategorik.
2. Data Y berupa data kelulusan (numerik), yaitu lama masa studi mahasiswa dalam bulan: tepat waktu (<= 48 bulan) dan tidak tepat waktu (> 48 bulan).


In [None]:
# Part 1 - Data Preprocessing

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
import seaborn as sns

# CSV column names, in file order (handed to read_csv through `names=`)
col_names = [
    'id_training', 'fakultas', 'program_studi', 'nim', 'nama',
    'tahun_angkatan', 'jenis_kelamin',
    'ips1', 'ips2', 'ips3', 'ips4', 'ips5', 'ips6', 'ipk',
    'sumber_biaya', 'ket_jalur_masuk', 'sekolah_asal',
    'lama_studi', 'status_kelulusan',
]

# Load the student (alumni) data file
# NOTE(review): `names=` is given without `header=`, so the first row of the
# file is read as data - confirm the CSV really has no header row.
csv_path = 'DATA_TRAINING_ALUMNI_LAMASTUDI 2014 - 2016 M.csv'
csvData = pd.read_csv(csv_path, names=col_names)
print(csvData)

In [None]:
# Data integration: discard the identifier/administrative columns and the
# status column; only the remaining columns are used from here on.
unused_columns = [
    'id_training',
    'nama',
    'fakultas',
    'program_studi',
    'nim',
    'tahun_angkatan',
    'status_kelulusan',
]
csvData = csvData.drop(columns=unused_columns)

print(csvData)

In [None]:
# DATA TRANSFORMATION
# Convert the categorical columns to numeric codes.
# A label that is missing from a mapping comes out of Series.map as NaN.
data2 = ['sumber_biaya']
data3 = ['jenis_kelamin']
data4 = ['ket_jalur_masuk']
data5 = ['sekolah_asal']


def biaya(x):
    """Encode funding source: "Reguler" -> 1, "Bidikmisi" -> 0."""
    codes = {"Reguler": 1, "Bidikmisi": 0}
    return x.map(codes)


def jeniskelamin(x):
    """Encode gender: "Perempuan" -> 1, "Laki-Laki" -> 0."""
    codes = {"Perempuan": 1, "Laki-Laki": 0}
    return x.map(codes)


def jalurmasuk(x):
    """Encode admission path: "SNMPTN" -> 0, "SBMPTN" -> 1, "SM" -> 2."""
    codes = {"SNMPTN": 0, "SBMPTN": 1, "SM": 2}
    return x.map(codes)


def asalsekolah(x):
    """Encode school of origin: "SMA" -> 0, "SMK" -> 1, "MA" -> 2."""
    codes = {"SMA": 0, "SMK": 1, "MA": 2}
    return x.map(codes)


# Each encoder is applied column-wise to its own single-column selection,
# so the four assignments are independent of one another.
for columns, encoder in (
    (data2, biaya),
    (data3, jeniskelamin),
    (data4, jalurmasuk),
    (data5, asalsekolah),
):
    csvData[columns] = csvData[columns].apply(encoder)

# Replace missing values (NaN) with the mean of the column they belong to.
# Notes:
#   missing_values = the marker that flags a value as missing; here NaN (np.nan)
#   strategy       = the fill value; here the column mean ('mean'); 'median',
#                    'most_frequent' (mode) or 'constant' are also available
#
# The frame is passed with one feature per column so every column gets its own
# mean, and the imputed values are written back into csvData. Previously the
# frame was flattened into a single column and the result was never stored,
# so X and y still contained NaN.
# NOTE(review): the imputer is fitted on all rows before the train/test split
# and also fills the target column - confirm this is intended.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
csvData = pd.DataFrame(
    imputer.fit_transform(csvData),
    columns=csvData.columns,
    index=csvData.index,
)

# Features: every column except the last. Target: the last column.
X = csvData.iloc[:, :-1]
y = csvData.iloc[:, -1]

print(X)
print(y)
print(len(X))

In [None]:
# Split the student data into training and test sets (80 : 20)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,  # fixed seed: the same split on every run
)

<h2 align="center">Multiple Linear Regression</h2>

In [None]:
# Data mining - fit a multiple linear regression (MLR) model with scikit-learn
from sklearn.linear_model import LinearRegression
model_mlr = LinearRegression().fit(X_train, y_train)

# MLR predictions for the held-out test rows
y_pred_mlr = model_mlr.predict(X_test)
print(y_pred_mlr)

<h2 align="center">Evaluasi Multiple Linear Regression</h2>

In [None]:
# Evaluation (forecast accuracy) of the MLR model on the 80:20 test set
from sklearn import metrics
mape_mlr = metrics.mean_absolute_percentage_error(y_test, y_pred_mlr) * 100
mse_mlr = metrics.mean_squared_error(y_test, y_pred_mlr)
print('Mean Absolute Percentage Error:', mape_mlr)
print('Mean Squared Error:', mse_mlr)
# RMSE is the square root of the MSE computed above
print('Root Mean Squared Error:', np.sqrt(mse_mlr))

In [None]:
# Fitted MLR parameters: coefficients (b) first, then the intercept (a)
for fitted_parameter in (model_mlr.coef_, model_mlr.intercept_):
    print(fitted_parameter)

<h2 align="center">Artificial Neural Network</h2>

In [None]:
# Data mining - fit an artificial neural network (ANN) regressor with scikit-learn
from sklearn.neural_network import MLPRegressor
mpl = MLPRegressor(
    hidden_layer_sizes=(10),  # note: (10) is the plain int 10, not a 1-tuple
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=1,
)
mpl.fit(X_train, y_train)

# ANN predictions for the held-out test rows
y_pred_ann = mpl.predict(X_test)
print(y_pred_ann)

<h2 align="center">Evaluasi Artificial Neural Network</h2>

In [None]:
# Evaluation (forecast accuracy) of the ANN model on the 80:20 test set
from sklearn import metrics
mape_ann = metrics.mean_absolute_percentage_error(y_test, y_pred_ann) * 100
mse_ann = metrics.mean_squared_error(y_test, y_pred_ann)
print('Mean Absolute Percentage Error:', mape_ann)
print('Mean Squared Error:', mse_ann)
# RMSE is the square root of the MSE computed above
print('Root Mean Squared Error:', np.sqrt(mse_ann))

In [None]:
# Coefficient of determination (R^2) of both models on the 80:20 test set.
# r2_score already returns R^2, so it is printed as-is: taking its square root
# would not match the "R2 score" label and yields NaN whenever R^2 is negative.
print('R2 score MLR:', metrics.r2_score(y_test, y_pred_mlr))
print('R2 score ANN:', metrics.r2_score(y_test, y_pred_ann))

<h2 align="center">PERBANDINGAN</h2>

In [None]:
# Side-by-side table of the MLR predictions, ANN predictions and actual values
comparison_columns = {
    'MLR': y_pred_mlr,
    'ANN': y_pred_ann,
    'Actual': y_test,
}
df_ann = pd.DataFrame(comparison_columns)
print(df_ann)

In [None]:
# KDE plot: actual values vs. MLR predictions vs. ANN predictions.
# `fill=True` is the current spelling of the deprecated `shade=True` option.
sns.kdeplot(data=y_test, fill=True, label="Data Actual")
sns.kdeplot(data=y_pred_mlr, fill=True, label="Hasil Prediksi MLR")
sns.kdeplot(data=y_pred_ann, fill=True, label="Hasil Prediksi ANN")
plt.legend();

In [None]:
# Line plot comparing the MLR and ANN predictions for the 80:20 test set
plt.plot(y_pred_mlr)
plt.plot(y_pred_ann)
plt.xlabel('jumlah data')
plt.ylabel('lama studi')
plt.title('perbandingan hasil prediksi ann dan mlr')
plt.grid(True)
plt.legend(labels=('MLR','ANN'))
# show must be called; the bare name `plt.show` is a no-op expression
plt.show()

In [None]:
# Line plot of the actual values together with the MLR and ANN predictions.
# The test target is written to CSV and read back; the reloaded frame has a
# fresh 0..n-1 row index (presumably so the actual values line up with the
# prediction arrays on the x axis - TODO confirm the file itself is needed).
data_actual = y_test
data_actual.to_csv("data_actual1.csv")
Data_actual = pd.read_csv('data_actual1.csv')
plt.plot(Data_actual['lama_studi'])
plt.plot(y_pred_mlr)
plt.plot(y_pred_ann)
plt.xlabel('jumlah data')
plt.ylabel('lama studi')
plt.title('perbandingan hasil prediksi actual, ann dan mlr')
plt.grid(True)
plt.legend(labels=('Actual','MLR','ANN'))
# show must be called; the bare name `plt.show` is a no-op expression
plt.show()

<h2 align="center">RASIO 70:30</h2>

In [None]:
# Split the student data into training and test sets (70 : 30)
from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X, y, test_size=0.3, random_state=1
)
for subset in (X_train1, y_train1, X_test1, y_test1):
    print(len(subset))

# Data mining - fit the MLR model with scikit-learn
# (this rebinds `model_mlr` to the model trained on the 70:30 split)
from sklearn.linear_model import LinearRegression
model_mlr = LinearRegression().fit(X_train1, y_train1)

# MLR predictions
y_pred_mlr1 = model_mlr.predict(X_test1)
print(y_pred_mlr1)

# Data mining - fit the ANN model with scikit-learn
# (this rebinds `mpl` to the network trained on the 70:30 split)
from sklearn.neural_network import MLPRegressor
mpl = MLPRegressor(
    hidden_layer_sizes=(10),
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=1,
)
mpl.fit(X_train1, y_train1)

# ANN predictions
y_pred_ann1 = mpl.predict(X_test1)
print(y_pred_ann1)

<h2 align="center">RASIO 60:40</h2>

In [None]:
# Split the student data into training and test sets (60 : 40)
from sklearn.model_selection import train_test_split
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, test_size=0.4, random_state=1
)
for subset in (X_train2, y_train2, X_test2, y_test2):
    print(len(subset))

# Data mining - fit the MLR model with scikit-learn
# (this rebinds `model_mlr` to the model trained on the 60:40 split)
from sklearn.linear_model import LinearRegression
model_mlr = LinearRegression().fit(X_train2, y_train2)

# MLR predictions
y_pred_mlr2 = model_mlr.predict(X_test2)
print(y_pred_mlr2)

# Data mining - fit the ANN model with scikit-learn
# (this rebinds `mpl` to the network trained on the 60:40 split)
from sklearn.neural_network import MLPRegressor
mpl = MLPRegressor(
    hidden_layer_sizes=(10),
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=1,
)
mpl.fit(X_train2, y_train2)

# ANN predictions
y_pred_ann2 = mpl.predict(X_test2)
print(y_pred_ann2)

<h2 align="center">RASIO 50:50</h2>

In [None]:
# Split the student data into training and test sets (50 : 50)
from sklearn.model_selection import train_test_split
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X, y, test_size=0.5, random_state=1
)
for subset in (X_train3, y_train3, X_test3, y_test3):
    print(len(subset))

# Data mining - fit the MLR model with scikit-learn
# (this rebinds `model_mlr` to the model trained on the 50:50 split)
from sklearn.linear_model import LinearRegression
model_mlr = LinearRegression().fit(X_train3, y_train3)

# MLR predictions
y_pred_mlr3 = model_mlr.predict(X_test3)
print(y_pred_mlr3)

# Data mining - fit the ANN model with scikit-learn
# (this rebinds `mpl` to the network trained on the 50:50 split)
from sklearn.neural_network import MLPRegressor
mpl = MLPRegressor(
    hidden_layer_sizes=(10),
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=1,
)
mpl.fit(X_train3, y_train3)

# ANN predictions
y_pred_ann3 = mpl.predict(X_test3)
print(y_pred_ann3)

<h2 align="center">PERBANDINGAN</h2>

In [None]:
# Evaluation (forecast accuracy) of the ANN model for every train:test ratio.
# Output is grouped by metric: MAPE for all ratios, then MSE, then RMSE.
from sklearn import metrics

# (ratio label, actual values, ANN predictions) for each split
ann_results = [
    ('80:20', y_test, y_pred_ann),
    ('70:30', y_test1, y_pred_ann1),
    ('60:40', y_test2, y_pred_ann2),
    ('50:50', y_test3, y_pred_ann3),
]

for ratio, actual, predicted in ann_results:
    print(f'{ratio} (Mean Absolute Percentage Error):',
          metrics.mean_absolute_percentage_error(actual, predicted)*100)

for ratio, actual, predicted in ann_results:
    print(f'{ratio} (Mean Squared Error):',
          metrics.mean_squared_error(actual, predicted))

# The last RMSE line used to be labelled '50;50'; every label now uses ':'
for ratio, actual, predicted in ann_results:
    print(f'{ratio} (Root Mean Squared Error):',
          np.sqrt(metrics.mean_squared_error(actual, predicted)))

In [None]:
# Evaluation (forecast accuracy) of the MLR model for every train:test ratio.
# Output is grouped by metric: MAPE for all ratios, then MSE, then RMSE.
from sklearn import metrics

# (ratio label, actual values, MLR predictions) for each split
mlr_results = [
    ('80:20', y_test, y_pred_mlr),
    ('70:30', y_test1, y_pred_mlr1),
    ('60:40', y_test2, y_pred_mlr2),
    ('50:50', y_test3, y_pred_mlr3),
]

for ratio, actual, predicted in mlr_results:
    print(f'{ratio} (Mean Absolute Percentage Error):',
          metrics.mean_absolute_percentage_error(actual, predicted)*100)

# The squared error is symmetric in its two arguments, so the conventional
# (actual, predicted) order gives the same value
for ratio, actual, predicted in mlr_results:
    print(f'{ratio} (Mean Squared Error):',
          metrics.mean_squared_error(actual, predicted))

for ratio, actual, predicted in mlr_results:
    print(f'{ratio} (Root Mean Squared Error):',
          np.sqrt(metrics.mean_squared_error(actual, predicted)))

<h2 align="center">Memuat dataset dan melakukan eksplorasi dasar untuk mendapatkan ide tentang dataset</h2>

In [None]:
# Basic exploration of the data frame: first rows, schema, shape,
# missing-value mask and counts, and summary statistics.
print(csvData.head())
# info() prints its own report and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output)
csvData.info()
print(csvData.shape)
print(csvData.isnull())
print(csvData.isnull().sum())
print(csvData.isna().sum())
print(csvData.describe())

In [None]:
# Row counts: full data set, the 80:20 train/test parts, and both prediction arrays
for collection in (csvData, X_train, y_train, X_test, y_test, y_pred_mlr, y_pred_ann):
    print(len(collection))

### Exploratory Data Analysis

In [None]:
# Distribution of `jenis_kelamin`: density histogram with a KDE overlay.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(csvData.jenis_kelamin, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Distribution of `ips1`: density histogram with a KDE overlay.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(csvData.ips1, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Distribution of `ips2`: density histogram with a KDE overlay.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(csvData.ips2, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Distribution of `ips3`: density histogram with a KDE overlay.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(csvData.ips3, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Distribution of `ips4`: density histogram with a KDE overlay.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(csvData.ips4, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Distribution of `ips5`: density histogram with a KDE overlay.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(csvData.ips5, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Distribution of `ips6`: density histogram with a KDE overlay.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(csvData.ips6, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Distribution of `ipk`: density histogram with a KDE overlay.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(csvData.ipk, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Distribution of `sumber_biaya`: density histogram with a KDE overlay.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(csvData.sumber_biaya, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Distribution of `ket_jalur_masuk`: density histogram with a KDE overlay.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(csvData.ket_jalur_masuk, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Distribution of `sekolah_asal`: density histogram with a KDE overlay.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(csvData.sekolah_asal, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Distribution of `lama_studi`: density histogram with a KDE overlay.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(csvData.lama_studi, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# One panel per listed column, each plotted against `lama_studi` on the y axis
panel_columns = [
    'jenis_kelamin',
    'ips1', 'ips2', 'ips3', 'ips4', 'ips5', 'ips6',
    'ipk',
    'sumber_biaya', 'ket_jalur_masuk', 'sekolah_asal',
]
sns.pairplot(
    csvData,
    x_vars=panel_columns,
    y_vars='lama_studi',
    height=7,
    aspect=0.7,
);

In [None]:
# Pairwise correlation table of the columns (pandas default method)
correlation_matrix = csvData.corr()
correlation_matrix

In [None]:
# Heatmap of the column correlation table, with each coefficient written in its cell
correlation_matrix = csvData.corr()
sns.heatmap(correlation_matrix, annot=True)

In [None]:
# Coefficient of determination (R^2) of both models for every train:test ratio.
# r2_score already returns R^2, so it is printed as-is: taking its square root
# would not match the "R2 score" label and yields NaN whenever R^2 is negative.
r2_inputs = [
    ('MLR 80:20', y_test, y_pred_mlr),
    ('ANN 80:20', y_test, y_pred_ann),
    ('MLR 70:30', y_test1, y_pred_mlr1),
    ('ANN 70:30', y_test1, y_pred_ann1),
    ('MLR 60:40', y_test2, y_pred_mlr2),
    ('ANN 60:40', y_test2, y_pred_ann2),
    ('MLR 50:50', y_test3, y_pred_mlr3),
    ('ANN 50:50', y_test3, y_pred_ann3),
]
for label, actual, predicted in r2_inputs:
    print(f'R2 score {label}:', metrics.r2_score(actual, predicted))

In [None]:
# Mean absolute error of both models for every train:test ratio
mae_inputs = [
    ('MLR 80:20', y_test, y_pred_mlr),
    ('ANN 80:20', y_test, y_pred_ann),
    ('MLR 70:30', y_test1, y_pred_mlr1),
    ('ANN 70:30', y_test1, y_pred_ann1),
    ('MLR 60:40', y_test2, y_pred_mlr2),
    ('ANN 60:40', y_test2, y_pred_ann2),
    ('MLR 50:50', y_test3, y_pred_mlr3),
    ('ANN 50:50', y_test3, y_pred_ann3),
]
for label, actual, predicted in mae_inputs:
    print(f'Mean Absolute Error {label}:', metrics.mean_absolute_error(actual, predicted))

In [None]:
# Full set of regression metrics for the MLR model on the 80:20 test set.
# Each entry is (printed label, name of the sklearn.metrics function); the
# function is looked up only when its turn comes, so evaluation stays
# strictly sequential.
mlr_metric_table = [
    ('explained_variance_score', 'explained_variance_score'),
    ('max_error', 'max_error'),
    ('mean_absolute_error', 'mean_absolute_error'),
    ('mean_squared_error', 'mean_squared_error'),
    ('mean_squared_log_error', 'mean_squared_log_error'),
    ('median_absolute_error', 'median_absolute_error'),
    ('mean_absolute_percentage_error', 'mean_absolute_percentage_error'),
    ('R2 score', 'r2_score'),
    ('mean_poisson_deviance', 'mean_poisson_deviance'),
    ('mean_gamma_deviance', 'mean_gamma_deviance'),
    ('mean_tweedie_deviance', 'mean_tweedie_deviance'),
    ('d2_tweedie_score', 'd2_tweedie_score'),
    ('mean_pinball_loss', 'mean_pinball_loss'),
    ('d2_pinball_score', 'd2_pinball_score'),
    ('d2_absolute_error_score', 'd2_absolute_error_score'),
]
for label, function_name in mlr_metric_table:
    print(f'{label} MLR 80:20:', getattr(metrics, function_name)(y_test, y_pred_mlr))

In [None]:
# Full set of regression metrics for the ANN model on the 80:20 test set.
# The values come from y_pred_ann, so the labels say "ANN" (they used to say
# "MLR", copied from the MLR cell).
# Each entry is (printed label, name of the sklearn.metrics function); the
# function is looked up only when its turn comes, so evaluation stays
# strictly sequential.
ann_metric_table = [
    ('explained_variance_score', 'explained_variance_score'),
    ('max_error', 'max_error'),
    ('mean_absolute_error', 'mean_absolute_error'),
    ('mean_squared_error', 'mean_squared_error'),
    ('mean_squared_log_error', 'mean_squared_log_error'),
    ('median_absolute_error', 'median_absolute_error'),
    ('mean_absolute_percentage_error', 'mean_absolute_percentage_error'),
    ('R2 score', 'r2_score'),
    ('mean_poisson_deviance', 'mean_poisson_deviance'),
    ('mean_gamma_deviance', 'mean_gamma_deviance'),
    ('mean_tweedie_deviance', 'mean_tweedie_deviance'),
    ('d2_tweedie_score', 'd2_tweedie_score'),
    ('mean_pinball_loss', 'mean_pinball_loss'),
    ('d2_pinball_score', 'd2_pinball_score'),
    ('d2_absolute_error_score', 'd2_absolute_error_score'),
]
for label, function_name in ann_metric_table:
    print(f'{label} ANN 80:20:', getattr(metrics, function_name)(y_test, y_pred_ann))