<a href="https://colab.research.google.com/github/Ethioware/ML/blob/main/Motivation_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler as ss
from imblearn.over_sampling import RandomOverSampler
import copy
import seaborn as sns

## **Data Pre-processing**

In [None]:
# Load the raw survey responses (the first research round).
df = pd.read_csv("data.csv")
# Drop unwilling participants: keep only rows with at least 3 non-missing values.
df = df.dropna(thresh=3)
# Drop the columns that are not used as model inputs.
df = df.drop(['barriers_to_use', 'likely_to_use'], axis=1)
# Binary encoding: each column becomes 1 when the answer equals the listed
# string and 0 for every other answer.
# NOTE(review): a missing answer compares unequal, so it is encoded as 0 here
# and is never reached by the mean-fill below -- confirm that is intended.
df['lesson_utility'] = (df['lesson_utility'] == "Yes").astype(int)
df['use_edtech'] = (df['use_edtech'] == "Yes").astype(int)
df['public'] = (df['public'] == "Public (government)").astype(int)
# NOTE(review): the original comment said "1 if 3 and above", but only the
# literal answer "more than 3" is mapped to 1 -- confirm against the survey options.
df['n_smartphones'] = (df['n_smartphones'] == "more than 3").astype(int)
df['Why_use'] = (df['Why_use'] == "Improve my academic performance").astype(int)  # 1 = uses it for grades
df['app_demographic'] = (df['app_demographic'] == "Social media").astype(int)  # 1 = social media
# Fill the remaining missing values with the column mean. numeric_only=True keeps
# this working on pandas >= 2.0, where DataFrame.mean() raises a TypeError if any
# non-numeric column is present instead of silently skipping it.
df = df.fillna(df.mean(numeric_only=True))
# Round to whole numbers so mean-filled cells land back on the integer answer scale.
df = df.round(0)

## **Data Processing**

**Oversample**

In [None]:
# Visualize the pairwise correlations between the columns as a heatmap (currently disabled)
# data = df.corr()
# data = data.round(2)
# fig, ax = plt.subplots(figsize=(12,8))
# sns.heatmap(data,annot=True,ax=ax)

In [None]:
# Split a frame into features and target, optionally oversample the minority classes, then standardize the features
def scale(dataframe, oversample=False, random_state=None, scaler=None):
    """Split a frame into features/target, optionally oversample, and standardize.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The first column is used as the target; every remaining column is a feature.
    oversample : bool, default False
        When True, minority classes are randomly resampled with replacement
        (RandomOverSampler) before scaling.
    random_state : int or None, default None
        Seed passed to RandomOverSampler so the resampling is reproducible.
        None keeps the original, unseeded behavior.
    scaler : fitted scaler or None, default None
        An already-fitted scaler to reuse; only its ``transform`` is called.
        Passing the scaler fitted on the training split lets validation/test
        data be standardized with the training statistics. When None, a fresh
        StandardScaler is fitted on the data passed in (the original behavior).

    Returns
    -------
    data : numpy.ndarray
        Scaled features with the target appended as the last column.
    X : numpy.ndarray
        Scaled feature matrix.
    y : numpy.ndarray
        Target vector (resampled when ``oversample`` is True).
    """
    X = dataframe.iloc[:, 1:].values  # every column after the first one
    y = dataframe.iloc[:, 0].values   # the first column is the target

    if oversample:
        ros = RandomOverSampler(random_state=random_state)
        X, y = ros.fit_resample(X, y)

    if scaler is None:
        # No scaler supplied: fit a new StandardScaler on this data.
        X = ss().fit_transform(X)
    else:
        # Reuse the caller's fitted scaler without refitting it.
        X = scaler.transform(X)

    # Reshape y into a column so it can be stacked to the right of the features.
    data = np.hstack((X, np.reshape(y, (-1, 1))))
    return data, X, y

**Train**

In [None]:
# Split the dataset into train (60%), validation (20%) and test (20%).
# The shuffle is seeded so a Restart & Run All reproduces the same split, and
# the frame is sliced with iloc because np.split on a DataFrame goes through
# DataFrame.swapaxes, which emits a FutureWarning on recent pandas.
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(0.6 * len(shuffled))
val_end = int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

# Only the training split is oversampled; validation and test keep their
# original class distribution so the reported metrics are not inflated.
# NOTE(review): scale() is called once per split here, so each split is
# standardized independently of the others.
train, X_train, y_train = scale(train, oversample=True)
val, X_val, y_val = scale(val, oversample=False)
test, X_test, y_test = scale(test, oversample=False)

In [None]:
 # visualize the accuracy during training

def acc(data):
    """Plot the training and validation accuracy curves of a training run.

    `data` must expose a `history` mapping that contains the 'accuracy' and
    'val_accuracy' series (one value per epoch).
    """
    for metric in ('accuracy', 'val_accuracy'):
        # One line per metric; the metric name doubles as the legend label.
        plt.plot(data.history[metric], label=metric)
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.grid(True)
    plt.show()

    # Neural Network
import tensorflow as tf
# Imported here so this cell also runs on a fresh kernel (the report helper was
# previously only imported by later cells).
from sklearn.metrics import classification_report as cr

# The target takes several distinct values (the classification reports in this
# notebook show classes 1.0-5.0), so this is a multi-class problem. A single
# sigmoid unit trained with MSE can only emit values in (0, 1) and cannot
# represent those classes. Map every label to a 0-based class index and use a
# softmax output instead.
class_labels = np.unique(np.concatenate([y_train, y_val, y_test]))
y_train_idx = np.searchsorted(class_labels, y_train)
y_val_idx = np.searchsorted(class_labels, y_val)

# Take the input width from the data instead of hard-coding it.
n_features = X_train.shape[1]

nn_model = tf.keras.Sequential([
    tf.keras.Input(shape=(n_features,)),  # one input per feature column
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),  # dropout to reduce overfitting
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(len(class_labels), activation='softmax')  # one probability per class
])

nn_model.compile(
    optimizer=tf.keras.optimizers.Adam(0.01),
    loss='sparse_categorical_crossentropy',  # integer class indices as targets
    metrics=['accuracy'],
)

# Validate on the dedicated validation split: validation_split would instead
# carve rows out of the oversampled training array, which contains duplicated
# rows, so the validation curve would not measure generalization.
# verbose must be an integer (0 = silent), not the string '0'.
history = nn_model.fit(
    X_train, y_train_idx,
    epochs=100,
    batch_size=32,
    validation_data=(X_val, y_val_idx),
    verbose=0,
)
acc(history)  # plot the accuracy curves of the training run above

# Evaluate this network's own predictions, mapped back to the original labels.
y_pred = class_labels[np.argmax(nn_model.predict(X_test, verbose=0), axis=1)]
print(cr(y_test, y_pred))  # per-class precision / recall / f1 on the test split

In [None]:
    # Support Vector Machines

from sklearn.svm import SVC
from sklearn.metrics import classification_report as cr

# Fit a support-vector classifier with scikit-learn's default settings on the
# (oversampled, scaled) training split.
svm_model = SVC().fit(X_train, y_train)

# Predict the held-out test split and print per-class precision / recall / f1.
y_pred = svm_model.predict(X_test)
print(cr(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         1
         2.0       0.00      0.00      0.00         5
         3.0       0.07      0.25      0.11         4
         4.0       0.58      0.54      0.56        13
         5.0       0.60      0.30      0.40        10

    accuracy                           0.33        33
   macro avg       0.25      0.22      0.21        33
weighted avg       0.42      0.33      0.35        33



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### K-Nearest Neighbors

In [None]:
# Try several algorithms, tune their parameters, and keep the one that performs best

    # K-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.metrics import classification_report as cr

# n_neighbors is the number of nearest training samples that vote on each
# prediction (it is not a feature count).
knn_model = knn(n_neighbors=8).fit(X_train, y_train)

# Predict the held-out test split and print per-class precision / recall / f1.
y_pred = knn_model.predict(X_test)
print(cr(y_test, y_pred))


              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         1
         2.0       0.00      0.00      0.00         0
         3.0       0.60      0.25      0.35        12
         4.0       0.80      0.40      0.53        10
         5.0       0.67      0.40      0.50        10

    accuracy                           0.33        33
   macro avg       0.41      0.21      0.28        33
weighted avg       0.66      0.33      0.44        33



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Naive Bayes

In [None]:

    # Naive Bayes
from sklearn.naive_bayes import GaussianNB as nb
from sklearn.metrics import classification_report as cr

# Fit a Gaussian Naive Bayes classifier on the training split.
nb_model = nb().fit(X_train, y_train)

# Predict the held-out test split and print per-class precision / recall / f1.
y_pred = nb_model.predict(X_test)
print(cr(y_test, y_pred))


              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         1
         2.0       0.00      0.00      0.00         0
         3.0       0.64      0.75      0.69        12
         4.0       0.43      0.30      0.35        10
         5.0       0.60      0.30      0.40        10

    accuracy                           0.45        33
   macro avg       0.33      0.27      0.29        33
weighted avg       0.55      0.45      0.48        33



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Logistic Regression

In [None]:
   # Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report as cr

# Fit a logistic-regression classifier with default settings on the training split.
lg_model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out test split and print per-class precision / recall / f1.
y_pred = lg_model.predict(X_test)
print(cr(y_test, y_pred))


              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         1
         2.0       0.00      0.00      0.00         0
         3.0       0.54      0.58      0.56        12
         4.0       0.50      0.10      0.17        10
         5.0       0.43      0.30      0.35        10

    accuracy                           0.33        33
   macro avg       0.29      0.20      0.22        33
weighted avg       0.48      0.33      0.36        33



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Support Vector Machines

### Neural Network

### Linear Regression

In [None]:
    # Linear Regression
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares regressor on the training split.
lr_model = LinearRegression().fit(X_train, y_train)

# acc() reads a `.history` mapping of per-epoch metrics, which this fitted
# estimator does not provide, so the plot call stays disabled:
#acc(lr_model)

# For a regressor, score() returns R^2 (coefficient of determination), not a
# classification accuracy; a negative value means the fit is worse than always
# predicting the mean of y_test.
lr_model.score(X_test, y_test)

-0.8790271081167174