In [None]:
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
from matplotlib import cm # Colomaps
import seaborn as sns
from sklearn import tree
import plotly.express as px

# Classifier algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#train test split
from sklearn.model_selection import train_test_split

# Model evaluation
from sklearn import metrics

# Load Data

In [None]:
# Raw-GitHub URL of the students' online-education adaptability CSV
file_name = 'https://raw.githubusercontent.com/ArunaAbesekara/MachineLearning/main/students_adaptability_level_online_education.csv'

# Load CSV File
data = pd.read_csv(file_name)
# Preview 5 randomly chosen rows (no seed is passed, so the rows differ between runs)
data.sample(5)

Unnamed: 0,Gender,Age,Education Level,Institution Type,IT Student,Location,Load-shedding,Financial Condition,Internet Type,Network Type,Class Duration,Self Lms,Device,Adaptivity Level
292,Girl,1-5,School,Non Government,No,Yes,Low,Mid,Mobile Data,4G,1-3,No,Mobile,Moderate
280,Boy,26-30,University,Government,No,No,Low,Poor,Mobile Data,4G,0,No,Mobile,Low
974,Boy,21-25,University,Government,No,Yes,Low,Mid,Mobile Data,3G,1-3,Yes,Mobile,Low
913,Girl,1-5,School,Non Government,No,Yes,Low,Poor,Mobile Data,4G,1-3,No,Mobile,Moderate
744,Girl,16-20,College,Government,No,No,High,Mid,Mobile Data,3G,1-3,No,Mobile,Moderate


# Inspect Data

In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

(1205, 14)

In [None]:
# Column dtypes as parsed by read_csv
data.dtypes

Gender                 object
Age                    object
Education Level        object
Institution Type       object
IT Student             object
Location               object
Load-shedding          object
Financial Condition    object
Internet Type          object
Network Type           object
Class Duration         object
Self Lms               object
Device                 object
Adaptivity Level       object
dtype: object

In [None]:
# Summary statistics for every column (text and numeric), one row per column
data.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Gender,1205.0,2.0,Boy,663.0,,,,,,,
Age,1205.0,6.0,21-25,374.0,,,,,,,
Education Level,1205.0,3.0,School,530.0,,,,,,,
Institution Type,1205.0,2.0,Non Government,823.0,,,,,,,
IT Student,1205.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Location,1205.0,2.0,Yes,935.0,,,,,,,
Load-shedding,1205.0,2.0,Low,1004.0,,,,,,,
Financial Condition,1205.0,3.0,Mid,878.0,,,,,,,
Internet Type,1205.0,2.0,Mobile Data,695.0,,,,,,,
Network Type,1205.0,3.0,4G,775.0,,,,,,,


In [None]:
# Count missing values per column
data.isna().sum()

Gender                 0
Age                    0
Education Level        0
Institution Type       0
IT Student             0
Location               0
Load-shedding          0
Financial Condition    0
Internet Type          0
Network Type           0
Class Duration         0
Self Lms               0
Device                 0
Adaptivity Level       0
dtype: int64

## Data visualization

In [None]:
# Constant column of ones: summing it over any group of rows gives that group's row count
data['weight'] =1

In [None]:
# px.histogram counts the rows in each Gender category and stacks the counts by
# adaptivity level, drawing one rectangle per (category, level) instead of one per row.
fig = px.histogram(data, x="Gender", color="Adaptivity Level", title="Does Gender increase adaptivity of the students?")
fig.show()

In [None]:
# px.histogram counts the rows in each Age category and stacks the counts by
# adaptivity level, drawing one rectangle per (category, level) instead of one per row.
fig = px.histogram(data, x="Age", color="Adaptivity Level", title="Does Age increase adaptivity of the students?")
fig.show()

In [None]:
# px.histogram counts the rows in each Education Level category and stacks the counts by
# adaptivity level, drawing one rectangle per (category, level) instead of one per row.
fig = px.histogram(data, x="Education Level", color="Adaptivity Level", title="Does Education Level increase adaptivity of the students?")
fig.show()

In [None]:
# px.histogram counts the rows in each Institution Type category and stacks the counts by
# adaptivity level, drawing one rectangle per (category, level) instead of one per row.
fig = px.histogram(data, x="Institution Type", color="Adaptivity Level", title="Does Institution Type increase adaptivity of the students?")
fig.show()

In [None]:
# px.histogram counts the rows in each IT Student category and stacks the counts by
# adaptivity level, drawing one rectangle per (category, level) instead of one per row.
fig = px.histogram(data, x="IT Student", color="Adaptivity Level", title="Does IT Student increase adaptivity of the students?")
fig.show()

In [None]:
# px.histogram counts the rows in each Financial Condition category and stacks the counts by
# adaptivity level, drawing one rectangle per (category, level) instead of one per row.
fig = px.histogram(data, x="Financial Condition", color="Adaptivity Level", title="Does Financial Condition increase adaptivity of the students?")
fig.show()
 

In [None]:
# px.histogram counts the rows in each Class Duration category and stacks the counts by
# adaptivity level, drawing one rectangle per (category, level) instead of one per row.
fig = px.histogram(data, x="Class Duration", color="Adaptivity Level", title="Does Class Duration increase adaptivity of the students?")
fig.show()


# Data Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder numbers the distinct labels of a column in sorted order, so the
# integer codes are identifiers for the categories, not a meaningful ranking.
# The same encoder object is refitted for every column; once this cell has run,
# LE only remembers the mapping of the last column fitted ('Adaptivity Level').
LE = LabelEncoder()
data['Gender_code'] = LE.fit_transform(data['Gender'])
data['Age_code'] = LE.fit_transform(data['Age'])
data['Education_Level_code'] = LE.fit_transform(data['Education Level'])
data['Institution_code'] = LE.fit_transform(data['Institution Type'])

# Yes/No columns become 1/0 flags written to new columns (sources left untouched).
data['IT_code'] = np.where(data['IT Student']== 'Yes',1,0)
data['in_town'] = np.where(data['Location']== 'Yes',1,0)

data['Financial_ID'] = LE.fit_transform(data['Financial Condition'])
data['Internet_code'] = LE.fit_transform(data['Internet Type'])
data['Network_code'] = LE.fit_transform(data['Network Type'])
data['duration_code'] = LE.fit_transform(data['Class Duration'])

# 'Self Lms' is overwritten in place, so an already-converted 1 must also count
# as "yes": otherwise re-running this cell would compare the integers with 'Yes'
# and silently turn the whole column into zeros.
data['Self Lms'] = np.where(data['Self Lms'].isin(['Yes', 1]), 1, 0)

data['Device_type'] = LE.fit_transform(data['Device'])

# Target variable: integer class code for each adaptivity level.
data['Adaptivity'] = LE.fit_transform(data['Adaptivity Level'])




In [None]:
# Re-check dtypes after encoding: the newly added *_code columns should be integer
data.dtypes

Gender                  object
Age                     object
Education Level         object
Institution Type        object
IT Student               int64
Location                object
Load-shedding           object
Financial Condition     object
Internet Type           object
Network Type            object
Class Duration          object
Self Lms                 int64
Device                  object
Adaptivity Level        object
Gender_code              int64
Age_code                 int64
Education_Level_code     int64
Institution_code         int64
in_town                  int64
Financial_ID             int64
Internet_code            int64
Network_code             int64
duration_code            int64
Device_type              int64
Adaptivity               int64
dtype: object

In [None]:
# Summary statistics of the numeric columns, one row per column
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Self Lms,1205.0,0.174274,0.379502,0.0,0.0,0.0,0.0,1.0
weight,1205.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Gender_code,1205.0,0.449793,0.497679,0.0,0.0,0.0,1.0,1.0
Age_code,1205.0,2.122822,1.210359,0.0,1.0,2.0,3.0,5.0
Education_Level_code,1205.0,1.19668,0.722437,0.0,1.0,1.0,2.0,2.0
Institution_code,1205.0,0.682988,0.465506,0.0,0.0,1.0,1.0,1.0
IT_code,1205.0,0.252282,0.434503,0.0,0.0,0.0,1.0,1.0
in_town,1205.0,0.775934,0.417139,0.0,1.0,1.0,1.0,1.0
Financial_ID,1205.0,0.341909,0.605302,0.0,0.0,0.0,1.0,2.0
Internet_code,1205.0,0.423237,0.494277,0.0,0.0,0.0,1.0,1.0


In [None]:
# List every column now present (original columns plus the encoded ones)
data.columns

Index(['Gender', 'Age', 'Education Level', 'Institution Type', 'IT Student',
       'Location', 'Load-shedding', 'Financial Condition', 'Internet Type',
       'Network Type', 'Class Duration', 'Self Lms', 'Device',
       'Adaptivity Level', 'weight', 'Gender_code', 'Age_code',
       'Education_Level_code', 'Institution_code', 'IT_code', 'in_town',
       'Financial_ID', 'Internet_code', 'Network_code', 'duration_code',
       'Device_type', 'Adaptivity'],
      dtype='object')

In [None]:
# Encoded predictor candidates, followed by the encoded target as the last column
feature_columns = [
    'Gender_code',
    'Age_code',
    'Education_Level_code',
    'Institution_code',
    'IT_code',
    'in_town',
    'Financial_ID',
    'duration_code',
    'Device_type',
    'Self Lms',
]
target_column = 'Adaptivity'

df = data[feature_columns + [target_column]]

In [None]:
# Pairwise correlation between all columns of the modelling frame (DataFrame.corr defaults)
correlation_matrix = df.corr()
correlation_matrix

Unnamed: 0,Gender_code,Age_code,Education_Level_code,Institution_code,IT_code,in_town,Financial_ID,duration_code,Device_type,Self Lms,Adaptivity
Gender_code,1.0,-0.130395,-0.197743,0.10691,-0.233284,-0.034231,-0.01741,-0.035407,0.140691,-0.138331,0.033426
Age_code,-0.130395,1.0,0.414985,-0.216818,0.320066,-0.187269,0.028792,0.065048,-0.171172,0.246289,-0.230317
Education_Level_code,-0.197743,0.414985,1.0,0.116403,0.415967,0.063676,0.083509,0.165127,-0.278548,0.326259,-0.032694
Institution_code,0.10691,-0.216818,0.116403,1.0,0.104185,0.326813,0.013586,0.348251,-0.096857,0.096722,0.162387
IT_code,-0.233284,0.320066,0.415967,0.104185,1.0,0.009697,-0.025075,0.228662,-0.386776,0.453429,0.049282
in_town,-0.034231,-0.187269,0.063676,0.326813,0.009697,1.0,0.056958,0.216953,-0.070397,-0.036443,0.107414
Financial_ID,-0.01741,0.028792,0.083509,0.013586,-0.025075,0.056958,1.0,0.023791,-0.020968,-0.064362,-0.274643
duration_code,-0.035407,0.065048,0.165127,0.348251,0.228662,0.216953,0.023791,1.0,-0.133096,0.247624,0.222075
Device_type,0.140691,-0.171172,-0.278548,-0.096857,-0.386776,-0.070397,-0.020968,-0.133096,1.0,-0.210852,0.065729
Self Lms,-0.138331,0.246289,0.326259,0.096722,0.453429,-0.036443,-0.064362,0.247624,-0.210852,1.0,0.022179


In [None]:
# Correlation of every column with the target
correlation_matrix['Adaptivity']

Gender_code             0.033426
Age_code               -0.230317
Education_Level_code   -0.032694
Institution_code        0.162387
IT_code                 0.049282
in_town                 0.107414
Financial_ID           -0.274643
duration_code           0.222075
Device_type             0.065729
Self Lms                0.022179
Adaptivity              1.000000
Name: Adaptivity, dtype: float64

In [None]:
# Columns used as model inputs, and the column to predict
X_variables = ['Age_code', 'Financial_ID', 'duration_code', 'in_town']
y_varibale = 'Adaptivity'

# Plain NumPy arrays for scikit-learn
X = df.loc[:, X_variables].to_numpy()
y = df.loc[:, y_varibale].to_numpy()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, random_state=42, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts

print(f"Train sample size = {len(X_train)}")
print(f"Test sample size  = {len(X_test)}")

Train sample size = 843
Test sample size  = 362


# Logistic Regression Model

In [None]:
# Logistic regression with scikit-learn's default settings.
# NOTE: the name `model` is reused for every classifier in this notebook.
model = LogisticRegression()

In [None]:
# Train Model: fit the logistic regression on the training split
model.fit(X_train, y_train)

LogisticRegression()

In [None]:
# Predict on Testing Data

# Class probabilities: one column per class
class_probs = model.predict_proba(X_test)

# Prediction
y_pred = model.predict(X_test)

# Confidence of the prediction = highest class probability in each row.
# The target has three classes, so taking column 1 alone (the probability of
# class 1) would be meaningless for rows predicted as class 0 or class 2.
y_pred_prob = class_probs.max(axis=1)

# Results table
test_result = pd.DataFrame(data={'y_act':y_test, 'y_pred':y_pred, 'y_pred_prob':y_pred_prob})
test_result.sample(5)

Unnamed: 0,y_act,y_pred,y_pred_prob
198,2,2,0.119781
235,1,1,0.885886
238,1,1,0.402516
233,1,1,0.823789
335,1,2,0.119781


In [None]:
# Confusion table: actual classes as rows, predicted classes as columns, with totals
cfm = pd.crosstab(index=test_result['y_act'], columns=test_result['y_pred'], margins=True)
cfm

y_pred,0,1,2,All
y_act,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,14,4,21,39
1,1,79,74,154
2,6,21,142,169
All,21,104,237,362


In [None]:
# Share of test rows whose predicted class equals the actual class
acuracy_lgr = metrics.accuracy_score(y_true=test_result['y_act'], y_pred=test_result['y_pred'])
acuracy_lgr

0.649171270718232

In [None]:
# Per-class precision averaged with class-support weights.
# (pos_label only applies to average='binary', so it is not passed here.)
precision_lgr = metrics.precision_score(y_true=test_result['y_act'], y_pred=test_result['y_pred'], average='weighted')
precision_lgr

0.6746910309309778

In [None]:
# Per-class F1 averaged with class-support weights
f1_score_lgr = metrics.f1_score(y_true=test_result['y_act'], y_pred=test_result['y_pred'], average='weighted')
f1_score_lgr

0.637366923447107

# Decision Tree Classifier

In [None]:
# Decision tree limited to depth 10 that only splits nodes holding at least 20 samples;
# the fixed random_state makes the fitted tree repeatable.
model = DecisionTreeClassifier(max_depth=10, min_samples_split=20, random_state=0)

In [None]:
# Train Model: fit the decision tree on the training split
model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=10, min_samples_split=20, random_state=0)

In [None]:
# Class probabilities: one column per class
class_probs = model.predict_proba(X_test)

# Prediction
y_pred = model.predict(X_test)

# Confidence of the prediction = highest class probability in each row.
# The target has three classes, so taking column 1 alone (the probability of
# class 1) would be meaningless for rows predicted as class 0 or class 2.
y_pred_prob = class_probs.max(axis=1)

# Results table
test_result = pd.DataFrame(data={'y_act':y_test, 'y_pred':y_pred, 'y_pred_prob':y_pred_prob})
test_result.sample(5)

Unnamed: 0,y_act,y_pred,y_pred_prob
104,1,1,1.0
75,2,2,0.113636
45,2,1,0.525773
353,2,2,0.0
245,1,2,0.113636


In [None]:
# Confusion table: actual classes as rows, predicted classes as columns, with totals
cfm = pd.crosstab(index=test_result['y_act'], columns=test_result['y_pred'], margins=True)
cfm

y_pred,0,1,2,All
y_act,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,19,0,20,39
1,4,92,58,154
2,5,32,132,169
All,28,124,210,362


In [None]:
# Share of test rows whose predicted class equals the actual class
acuracy_dt = metrics.accuracy_score(y_true=test_result['y_act'], y_pred=test_result['y_pred'])
acuracy_dt

0.6712707182320442

In [None]:
# Per-class precision averaged with class-support weights.
# (pos_label only applies to average='binary', so it is not passed here.)
precision_dt = metrics.precision_score(y_true=test_result['y_act'], y_pred=test_result['y_pred'], average='weighted')
precision_dt

0.6821848664612877

In [None]:
# Per-class F1 averaged with class-support weights
f1_score_dt = metrics.f1_score(y_true=test_result['y_act'], y_pred=test_result['y_pred'], average='weighted')
f1_score_dt

0.6678667877395655

# Random Forest Classifier

In [None]:
# 500-tree random forest. random_state pins the bootstrap samples and feature
# draws, so the reported metrics and the fitted model are the same on every run
# (the train/test split and the decision tree are already seeded).
model = RandomForestClassifier(n_estimators=500, random_state=42)

In [None]:
# Train Model: fit the random forest on the training split
model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=500)

In [None]:
# Class probabilities: one column per class
class_probs = model.predict_proba(X_test)

# Prediction
y_pred = model.predict(X_test)

# Confidence of the prediction = highest class probability in each row.
# The target has three classes, so taking column 1 alone (the probability of
# class 1) would be meaningless for rows predicted as class 0 or class 2.
y_pred_prob = class_probs.max(axis=1)

# Results table
test_result = pd.DataFrame(data={'y_act':y_test, 'y_pred':y_pred, 'y_pred_prob':y_pred_prob})
test_result.sample(5)

Unnamed: 0,y_act,y_pred,y_pred_prob
16,1,2,0.112864
336,2,1,0.526617
208,1,1,1.0
355,1,1,1.0
240,2,0,0.186085


In [None]:
# Share of test rows whose predicted class equals the actual class
acuracy_rf = metrics.accuracy_score(y_true=test_result['y_act'], y_pred=test_result['y_pred'])
acuracy_rf

0.6850828729281768

In [None]:
# Per-class precision averaged with class-support weights.
# (pos_label only applies to average='binary', so it is not passed here.)
precision_rf = metrics.precision_score(y_true=test_result['y_act'], y_pred=test_result['y_pred'], average='weighted')
precision_rf

0.6926449487277669

In [None]:
# Per-class F1 averaged with class-support weights
f1_score_rf = metrics.f1_score(y_true=test_result['y_act'], y_pred=test_result['y_pred'], average='weighted')
f1_score_rf

0.6829984616750125

# Compare and Select best model

In [None]:
# One row per model: [label, accuracy, weighted precision, weighted F1]
model_labels = ['LR', 'DT', 'RF']
accuracies = [acuracy_lgr, acuracy_dt, acuracy_rf]
precisions = [precision_lgr, precision_dt, precision_rf]
f1_scores = [f1_score_lgr, f1_score_dt, f1_score_rf]

metrics_comp = [list(row) for row in zip(model_labels, accuracies, precisions, f1_scores)]

# Side-by-side comparison table
pd.DataFrame(metrics_comp, columns=['Model', 'accuracy', 'precision', 'f1_score'])

Unnamed: 0,Model,accuracy,precision,f1_score
0,LR,0.649171,0.674691,0.637367
1,DT,0.671271,0.682185,0.667867
2,RF,0.685083,0.692645,0.682998


# Save best Model 

In [None]:
import joblib
# `model` is whichever estimator was assigned last; when the notebook is run
# top to bottom that is the Random Forest trained above.
save_file = 'Online_Adopting_rf_model.joblib'
# Pass the path instead of an open() handle: joblib opens and closes the file
# itself, whereas a handle created inline was never closed.
joblib.dump(model, save_file)
