In [2]:
# Import modules
# Pickle
import pickle
# Numpy, Pandas
import numpy as np
import pandas as pd
# sklearn
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the raw survey data (df) and the separately stored cleaned data (df_clean)
df = pd.read_csv("data/students_adaptability_level_online_education.csv")
df_clean = pd.read_csv("data/clean.csv")

# Columns removed before modelling:
#   - 'Load-shedding', 'Self Lms', 'Institution Type': their meaning was unclear
#   - 'Education Level', 'Device', 'IT Student': deliberately left out
#   - 'Unnamed: 0': unnecessary
# Afterwards, discard every record that still contains a NaN value.
df = df.drop(
    ['Load-shedding', 'Self Lms', 'Institution Type',
     'Education Level', 'Device', 'IT Student', 'Unnamed: 0'],
    axis=1,
).dropna(axis=0)

# Exclude the age groups that do not fit our purpose ('01-05', '21-25', '26-30')
df = df[(df['Age'] != '01-05') & (df['Age'] != '21-25') & (df['Age'] != '26-30')]

# Renumber the remaining rows 0..n-1
df = df.reset_index(drop=True)

In [4]:
# Function : preprocessing for the existing dataset
# Input : raw data / Output : processed data
def preprocess(datas):
    """Label-encode every column of ``datas`` and standardize the features.

    Parameters
    ----------
    datas : pandas.DataFrame
        Raw data. Must contain an 'Adaptivity Level' column (the target);
        every other column is treated as a feature.

    Returns
    -------
    pandas.DataFrame
        The standardized feature columns followed by the label-encoded
        (unscaled) 'Adaptivity Level' column, on the same index as ``datas``.

    Raises
    ------
    KeyError
        If ``datas`` has no 'Adaptivity Level' column.
    """
    target = 'Adaptivity Level'

    # Label Encoding -- operate on the argument itself, not on a module-level
    # frame, so the function works for whatever DataFrame is passed in.
    ldf = datas.copy()
    le = LabelEncoder()
    for feature in list(ldf.columns):
        ldf[feature] = le.fit_transform(ldf[feature].values)

    # Scaling -> StandardScaler (features only; the target stays as encoded labels)
    X_ldf = ldf.drop([target], axis=1)
    scaled_values = StandardScaler().fit_transform(X_ldf)
    # Keep the input's index so the concat below lines up row-for-row even
    # when ``datas`` does not carry a default 0..n-1 index.
    scaled_df = pd.DataFrame(scaled_values,
                             columns=list(X_ldf.columns),
                             index=X_ldf.index)

    # Make complete scaled dataframe
    scaled_df = pd.concat([scaled_df, ldf[target]], axis=1)

    return scaled_df


In [5]:
# Function : preprocessing for user input data
# Input : raw data / Output : processed data
def preprocess_input(data):
    """Encode and scale user-supplied rows the same way as the clean dataset.

    The rows in ``data`` are appended below the feature columns of the
    module-level ``df_clean``; each column of the combined table is
    label-encoded, the whole table is standardized, and the last row is
    returned.

    Parameters
    ----------
    data : list of rows (or anything ``pd.DataFrame`` accepts) whose values
        are given in the order Gender, Age, Location, Financial Condition,
        Internet Type, Network Type, Class Duration.

    Returns
    -------
    numpy.ndarray
        1-D array holding the encoded and scaled values of the last row.
    """
    feature_names = ['Gender', 'Age', 'Location', 'Financial Condition',
                     'Internet Type', 'Network Type', 'Class Duration']

    # User rows as a DataFrame with the expected columns
    user_rows = pd.DataFrame(data, columns=feature_names)

    # Feature part of the clean data (drop returns a new frame, df_clean is untouched).
    # NOTE(review): the remaining columns are relabelled positionally below, so
    # this assumes they appear in the same order as feature_names -- confirm.
    reference = df_clean.drop(
        columns=['Education Level', 'IT Student', 'Device', 'Adaptivity Level']
    ).to_numpy()

    # Stack the user rows underneath the reference rows
    combined = pd.DataFrame(np.append(reference, user_rows, axis=0),
                            columns=feature_names)

    # Encoding: one encoder, refitted for each column in turn
    encoder = LabelEncoder()
    for name in feature_names:
        combined[name] = encoder.fit_transform(combined[name].values)

    # Scaling is fitted on the combined table; the user input is the final row
    return StandardScaler().fit_transform(combined)[-1]


In [6]:
# Preprocessing: run the loaded DataFrame through preprocess()
# (label encoding + standard scaling); the result is reused by the cells below.
scaled_df = preprocess(df)

In [7]:
# Split X,y: feature matrix and target vector as plain NumPy arrays
X = scaled_df.drop(columns=['Adaptivity Level']).values
y = scaled_df['Adaptivity Level'].values

# Split the dataset 75% for training and 25% for testing.
# random_state pins the shuffle so the split (and every metric computed from
# it) is identical after Restart & Run All; stratify=y keeps the class
# proportions the same in both parts, which matters because the classes are
# imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.75, test_size=0.25,
    shuffle=True, random_state=42, stratify=y,
)

In [8]:
# Function : model evaluation (classification report)
def model_evaluate(model):
    """Print a classification report for ``model`` on the held-out test set.

    Uses the module-level ``X_test`` / ``y_test`` arrays; ``model`` only needs
    a ``predict`` method. Nothing is returned.
    """
    report = classification_report(y_test, model.predict(X_test))
    print(report)

In [9]:
# Pipeline (scaler -> decision tree), fitted on the training split.
# X_train / y_train come from the train/test split cell above, so X and y are
# not recomputed here.
# NOTE: the features were already standardized by preprocess(); the scaler
# step is kept so the pickled pipeline keeps the same structure.
# random_state makes the fitted tree reproducible: scikit-learn permutes the
# candidate features at every split, so results can differ between runs
# without it.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', tree.DecisionTreeClassifier(criterion='entropy', random_state=42)),
])
pipe.fit(X_train, y_train)

model_evaluate(pipe)

              precision    recall  f1-score   support

           0       0.83      0.56      0.67       194
           1       0.83      0.80      0.82      1125
           2       0.81      0.87      0.84      1424

    accuracy                           0.82      2743
   macro avg       0.82      0.74      0.78      2743
weighted avg       0.82      0.82      0.82      2743



In [10]:
# Round-trip the fitted pipeline through a pickle file
pipeline_path = 'pipeline.pickle'

# Save: serialize the fitted pipeline to disk
with open(pipeline_path, 'wb') as out_file:
    pickle.dump(pipe, out_file)

# Load: read it back (only unpickle files you created or otherwise trust)
with open(pipeline_path, 'rb') as in_file:
    loaded_pipe = pickle.load(in_file)

# Check: evaluate the reloaded pipeline on the test set
model_evaluate(loaded_pipe)

              precision    recall  f1-score   support

           0       0.83      0.56      0.67       194
           1       0.83      0.80      0.82      1125
           2       0.81      0.87      0.84      1424

    accuracy                           0.82      2743
   macro avg       0.82      0.74      0.78      2743
weighted avg       0.82      0.82      0.82      2743



In [11]:
# Function : predict result
def predict(model, data):
    """Predict the adaptivity level for one comma-separated record.

    Parameters
    ----------
    model : fitted estimator with a ``predict`` method (e.g. the loaded pipeline).
    data : str
        Seven comma-separated values in the column order expected by
        ``preprocess_input``: Gender, Age, Location, Financial Condition,
        Internet Type, Network Type, Class Duration.
        Whitespace around each value is ignored.

    Returns
    -------
    list of dict
        One ``{'pred:': <int class>, 'label': <str>}`` entry per prediction.

    Raises
    ------
    ValueError
        If ``data`` does not contain exactly seven fields.
    """
    expected_fields = 7

    # Split into fields; strip stray spaces so "Boy, 11-15" is not encoded as
    # a brand-new, unseen category.
    fields = [field.strip() for field in data.split(',')]
    if len(fields) != expected_fields:
        raise ValueError(
            "expected %d comma-separated values, got %d: %r"
            % (expected_fields, len(fields), data)
        )

    # Preprocessing (takes a list of rows, returns the encoded/scaled row)
    preprocessed_row = preprocess_input([fields])
    predictions = model.predict(preprocessed_row.reshape(1, -1))

    # Assumes the target classes are exactly High / Low / Moderate, which a
    # LabelEncoder numbers in sorted order -- TODO confirm against the data.
    pred_to_label = {0: 'High', 1: 'Low', 2: 'Moderate'}

    # Make a list of result.
    # NOTE(review): the key 'pred:' carries a trailing colon, which looks
    # unintended; it is kept as-is so existing consumers keep working.
    return [
        {'pred:': int(pred), 'label': pred_to_label[int(pred)]}
        for pred in predictions
    ]

# Test: send one sample record through the reloaded pipeline and print the result.
# The values follow the column order used by preprocess_input:
# Gender, Age, Location, Financial Condition, Internet Type, Network Type, Class Duration
if __name__=="__main__":
    data = "Boy,11-15,No,Rich,Mobile Data,4G,03-06"
    predictions = predict(loaded_pipe, data)
    print(predictions)

[{'pred:': 0, 'label': 'High'}]
