In [6]:
pip install scikit-mlm

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install --upgrade scikit-learn




In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import re

In [25]:
# Load your dataset
data = pd.read_csv('dataset.csv',low_memory=False)

# Removing anonymous and indicative features
anonymous_indicative_features = ['course_id_DI', 'userid_DI']
data.drop(columns=anonymous_indicative_features, inplace=True)

# Handling missing values before conversion
data.dropna(subset=['expected_hours_week'], inplace=True)

# Convert 'expected_hours_week' to numeric
def extract_numeric_hours(value):
    # Extract numeric information from the string
    numeric_value = re.findall(r'\d+', value)
    if numeric_value:
        return float(numeric_value[0])
    else:
        return None

data['expected_hours_week'] = data['expected_hours_week'].apply(extract_numeric_hours)

# Drop rows with missing values after conversion
data.dropna(inplace=True)

# Encoding nominal features
nominal_features = ['discipline', 'primary_reason', 'learner_type', 'LoE_DI']
for feature in nominal_features:
    label_encoder = LabelEncoder()
    data[feature] = label_encoder.fit_transform(data[feature])

# Normalizing features
features_to_normalize = ['expected_hours_week', 'course_length', 'nevents', 'ndays_act', 'ncontent', 'nforum_posts']
scaler = MinMaxScaler()
data[features_to_normalize] = scaler.fit_transform(data[features_to_normalize])

# Features to keep
X = data[['discipline', 'registered', 'explored', 'grade', 'grade_reqs', 'completed_%', 
          'course_reqs', 'primary_reason', 'learner_type', 'expected_hours_week', 
          'LoE_DI','nevents', 'ndays_act', 
          'ncontent', 'nforum_posts', 'course_length']]


# Set 'viewed' as the target value
y = data['viewed']
data.drop(columns=['viewed'], inplace=True)

In [26]:
data

Unnamed: 0,discipline,registered,explored,grade,grade_reqs,completed_%,course_reqs,primary_reason,learner_type,expected_hours_week,LoE_DI,nevents,ndays_act,ncontent,nforum_posts,course_length
24,1,1,0,0.000,1,0.067,1,5,1,0.142857,0,0.008684,0.142857,0.311828,0.015873,0.003448
26,5,1,0,0.828,1,0.158,1,5,6,0.000000,0,0.009037,0.142857,1.000000,0.000000,0.100000
27,1,1,0,0.000,1,0.027,1,5,4,0.142857,0,0.001763,0.032967,0.075269,0.031746,0.003448
29,1,1,0,0.000,1,0.013,1,4,2,0.142857,0,0.001670,0.010989,0.000000,0.000000,0.003448
31,1,1,0,0.000,1,0.067,1,4,1,0.142857,0,0.005307,0.043956,0.311828,0.000000,0.003448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324108,5,1,0,0.547,1,0.083,1,4,0,0.428571,1,0.001763,0.054945,1.000000,0.047619,0.020690
324259,5,1,0,0.000,1,0.032,1,4,1,0.142857,3,0.001058,0.021978,0.526882,0.000000,0.100000
324378,5,1,0,0.168,1,0.035,1,4,1,0.000000,3,0.002820,0.065934,0.182796,0.111111,0.244828
324901,5,1,0,0.475,1,0.083,1,10,2,0.000000,4,0.005752,0.032967,1.000000,0.000000,0.020690


In [27]:
from sklearn.model_selection import train_test_split
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [28]:
#linear regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the linear regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Predict on the testing data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

# If you want to print the coefficients and intercept:
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)


Mean Squared Error: 3.642380935398059e-05
R-squared: 0.0
Coefficients: [-6.92048159e-05  1.30104261e-16  4.07441108e-04  1.59005890e-03
 -2.58473798e-16  5.00872383e-03 -8.04478012e-17  2.38057907e-04
 -8.26288745e-04 -2.80760760e-03 -1.28203672e-04  7.74196909e-02
  1.04293089e-02 -5.41568063e-03  6.21452692e-04  2.47414235e-03]
Intercept: 1.0017190541553844


In [29]:
#logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the logistic regression model
logistic_model = LogisticRegression()

# Fit the model on the training data
logistic_model.fit(X_train, y_train)

# Predict on the testing data
y_pred_logistic = logistic_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_logistic)
print("Accuracy:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_logistic))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logistic))

# If you want to print the coefficients and intercept:
print("Coefficients:", logistic_model.coef_)
print("Intercept:", logistic_model.intercept_)


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       278

    accuracy                           1.00       278
   macro avg       1.00      1.00      1.00       278
weighted avg       1.00      1.00      1.00       278

Confusion Matrix:
[[278]]
Coefficients: [[-0.56457904  0.01860456  0.02927537 -0.02533437  0.01860456  0.01689584
   0.01860456  0.14930201 -0.51896753 -0.20545744 -0.0549134   0.00611368
   0.08326588 -0.2542707   0.0391464   0.02097162]]
Intercept: [10.27656256]




In [33]:
#decision tree regressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the Decision Tree Regressor model
decision_tree_model = DecisionTreeRegressor(random_state=42)

# Fit the model on the training data
decision_tree_model.fit(X_train, y_train)

# Predict on the testing data
y_pred_decision_tree = decision_tree_model.predict(X_test)

# Evaluate the model
mse_decision_tree = mean_squared_error(y_test, y_pred_decision_tree)
r2_decision_tree = r2_score(y_test, y_pred_decision_tree)

print("Decision Tree Regressor - Mean Squared Error:", mse_decision_tree)
print("Decision Tree Regressor - R-squared:", r2_decision_tree)


Decision Tree Regressor - Mean Squared Error: 0.0
Decision Tree Regressor - R-squared: 1.0
