In [None]:
!pip install Faker # Install the faker library

import numpy as np
import pandas as pd
from faker import Faker # Import the Faker class after installation
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Collecting Faker
  Downloading Faker-30.3.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-30.3.0-py3-none-any.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.8 MB[0m [31m29.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m40.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker
Successfully installed Faker-30.3.0


**STEP 1: GENERATE SYNTHETIC DATA USING FAKER**

In [None]:
# Seed every random source used for data generation so that
# "Restart Kernel -> Run All" reproduces the same synthetic dataset
# (and therefore the same model metrics further down).
SEED = 42
random.seed(SEED)   # drives random.choice / randint / uniform below
Faker.seed(SEED)    # drives the shared generator behind every Faker instance

# Initialize Faker
fake = Faker()

# Define available options
# Degree programmes a student can be enrolled in
courses = ['Computer Science', 'Mechanical Engineering', 'Civil Engineering', 'Electrical Engineering']
# Topic vocabulary shared by student interests and study-material subjects
interests = ['AI', 'Blockchain', 'Environmental Science', 'Robotics', 'Data Science']
# Difficulty labels assigned to study materials
difficulty_levels = ['Easy', 'Medium', 'Hard']

In [None]:
# Generate Student Data
# Build all 100 student records in a single pass. Each record draws, in order:
# a unique ID in 1-1000, a fake name, a course, a study year (1-4),
# one interest topic and an average quiz score in [50, 100].
student_data = [
    {
        'student_id': fake.unique.random_int(min=1, max=1000),
        'name': fake.name(),
        'course': random.choice(courses),
        'year': random.randint(1, 4),
        'interests': random.choice(interests),
        'average_quiz_score': random.uniform(50, 100),
    }
    for _ in range(100)
]

# One row per student; preview the first few rows
student_df = pd.DataFrame(student_data)
student_df.head()

Unnamed: 0,student_id,name,course,year,interests,average_quiz_score
0,102,Joshua Obrien,Electrical Engineering,4,Data Science,73.556284
1,787,Jamie Stewart,Electrical Engineering,3,Environmental Science,83.224891
2,398,Randy Madden,Civil Engineering,2,Blockchain,67.466833
3,745,Terri Crawford,Mechanical Engineering,3,AI,99.76831
4,55,Steven Olson,Civil Engineering,2,Blockchain,67.971398


In [None]:
# Generate Study Material Data
# Build all 50 study-material records in a single pass. Each record draws, in
# order: a unique ID in 1-500, a short fake title, a subject (taken from the
# same topic list as student interests), a difficulty label and a popularity
# score in [1, 100].
material_data = [
    {
        'material_id': fake.unique.random_int(min=1, max=500),
        'title': fake.sentence(nb_words=5),
        'subject': random.choice(interests),
        'difficulty_level': random.choice(difficulty_levels),
        'popularity_score': random.uniform(1, 100),
    }
    for _ in range(50)
]

# One row per study material; preview the first few rows
material_df = pd.DataFrame(material_data)
material_df.head()

Unnamed: 0,material_id,title,subject,difficulty_level,popularity_score
0,234,Parent candidate suddenly.,Data Science,Medium,82.7022
1,279,Surface somebody middle water crime.,Environmental Science,Hard,69.546209
2,368,Newspaper window every politics.,Robotics,Easy,47.949286
3,169,Check fine voice.,Environmental Science,Easy,27.113816
4,348,Discussion set it culture car claim.,Data Science,Easy,67.480794


In [None]:
# Generate Engagement Data
# Extract the ID pools once as plain Python lists. random.choice() picks an
# element via seq[i] with an integer i; on a pandas Series that is a *label*
# lookup, which only works while the index is the default 0..n-1 range and
# breaks (KeyError / wrong row) as soon as the frame is filtered or re-indexed.
student_ids = student_df['student_id'].tolist()
material_ids = material_df['material_id'].tolist()

engagement_data = []
for _ in range(200):  # 200 random student-material interactions
    engagement_data.append({
        'student_id': random.choice(student_ids),
        'material_id': random.choice(material_ids),
        'logins': random.randint(1, 10),  # Number of logins
        'videos_watched': random.randint(1, 5),
        'time_spent': random.uniform(10, 120),  # Time spent on platform in minutes
        'quiz_score': random.uniform(50, 100),  # Quiz score for the material
        'completed': random.choice([0, 1]),  # Whether the student completed the material
        'rating': random.randint(1, 5)  # Rating from 1 to 5 (we'll use it as a proxy for engagement)
    })

# One row per interaction; preview the first few rows
engagement_df = pd.DataFrame(engagement_data)
engagement_df.head()

Unnamed: 0,student_id,material_id,logins,videos_watched,time_spent,quiz_score,completed,rating
0,204,353,6,5,45.092769,89.264885,1,1
1,717,312,5,2,52.550046,53.016963,0,5
2,873,286,1,1,17.837586,90.73202,1,1
3,388,359,2,5,43.806342,50.294897,1,4
4,463,289,2,2,88.959196,97.029047,0,4


In [None]:
# Step 2: Preprocessing and Encoding

# Encode student interests and material subjects with ONE shared vocabulary.
# Both columns are drawn from `interests`, so the encoder is fitted once on
# that list and only *transform* is applied to each column. Re-fitting the
# encoder per column would silently assign different integer codes to the same
# topic whenever a topic happens to be missing from one of the two tables.
le_interests = LabelEncoder()
le_interests.fit(interests)
student_df['interests_encoded'] = le_interests.transform(student_df['interests'])
material_df['subject_encoded'] = le_interests.transform(material_df['subject'])

# Merge engagement data with student and material data
# (inner joins: one row per engagement, enriched with student and material attributes)
merged_data = pd.merge(engagement_df, student_df, on='student_id')
merged_data = pd.merge(merged_data, material_df, on='material_id')

# Select features for training
features = ['year', 'average_quiz_score', 'interests_encoded', 'subject_encoded', 'difficulty_level', 'popularity_score']
# Take an explicit copy so that the column assignment below modifies X itself
# rather than a slice of merged_data (avoids pandas' SettingWithCopyWarning).
X = merged_data[features].copy()

# Encode difficulty_level as an ordinal number (Easy < Medium < Hard)
difficulty_mapping = {'Easy': 1, 'Medium': 2, 'Hard': 3}
X['difficulty_level'] = X['difficulty_level'].map(difficulty_mapping)

# Use the rating (1-5) as the target: ratings of 4 or 5 count as "engaged" (1), the rest as "not engaged" (0)
y = merged_data['rating'].apply(lambda x: 1 if x >= 4 else 0)

# Step 3: Train-Test Split (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train a RandomForestClassifier model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 5: Predict and Evaluate the Model
y_pred = rf_model.predict(X_test)

# Evaluate the performance on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Output evaluation metrics
# NOTE: every feature and the rating are generated independently at random,
# so there is no real signal to learn here; the scores are a pipeline smoke
# test rather than a statement about model quality.
evaluation_metrics = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1
}

evaluation_metrics

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['difficulty_level'] = X['difficulty_level'].map(difficulty_mapping)  # Apply the mapping to the feature column


{'accuracy': 0.5,
 'precision': 0.25,
 'recall': 0.21428571428571427,
 'f1_score': 0.23076923076923078}

In [None]:
import numpy as np
import pandas as pd
from faker import Faker
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE



# Step 3: Success Prediction Model
# Define a target variable based on the completion of the entire pathway:
# every engagement row receives its student's mean `completed` value, i.e. the
# fraction of that student's engagements that were completed.
engagement_df['completed_pathway'] = engagement_df.groupby('student_id')['completed'].transform('mean')

# Features for the model (per-engagement activity measures)
features = ['logins', 'videos_watched', 'time_spent', 'quiz_score']
X = engagement_df[features]
y = (engagement_df['completed_pathway'] > 0.5).astype(int)  # Predict if they complete more than 50% of the pathway

# Train-test split FIRST, so the test set contains only real, untouched rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle class imbalance with SMOTE on the TRAINING fold only.
# Oversampling before the split leaks information: synthetic points
# interpolated from future test rows end up in the training data, and the test
# set itself contains synthetic rows, which makes the metrics over-optimistic.
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

# Train a Logistic Regression model for predicting pathway completion
logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(X_resampled, y_resampled)

# Predictions and evaluation on the held-out (non-resampled) test set
y_pred = logreg_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Output evaluation metrics
evaluation_metrics = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1
}

evaluation_metrics


{'accuracy': 0.5909090909090909,
 'precision': 0.6,
 'recall': 0.5454545454545454,
 'f1_score': 0.5714285714285714}