Syntetic Data Generator

In [20]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification, make_regression
import random

# Set a random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Function to generate synthetic data with relatable features for dyslexia platform
def generate_synthetic_data(num_samples=1000):
    # Generate classification data (performance level prediction)
    X_class, y_class = make_classification(n_samples=num_samples, n_features=5, n_classes=2, random_state=42)  # Binary classification
    
    # Generate regression data for reading fluency (assuming this is important in your dyslexia platform)
    X_reg, y_reg = make_regression(n_samples=num_samples, n_features=5, noise=0.1, random_state=42)
    
    # Create synthetic student data
    student_data = pd.DataFrame(X_class, columns=['reading_speed', 'task_difficulty', 'reading_comprehension', 'engagement_level', 'error_rate'])
    
    # Task completion time correlated with reading fluency and mood/engagement
    task_completion_time = X_reg[:, 0] + np.random.normal(0, 0.5, num_samples)  # Adding noise to completion time
    
    # Mood/engagement score influenced by success rate (simulating how mood/engagement impacts learning)
    mood_engagement_score = (y_class * 2) + np.random.randint(1, 6, num_samples)  # Making mood score related to performance level
    
    # Success rate based on task completion time and mood/engagement
    success_rate = 0.7 * (1 - task_completion_time / np.max(task_completion_time)) + 0.3 * (mood_engagement_score / 10)  # A weighted sum
    
    # Create the DataFrame with the new correlations
    student_data['task_completion_time'] = task_completion_time
    student_data['mood_engagement_score'] = mood_engagement_score
    student_data['success_rate'] = success_rate
    
    # Add the target columns for classification and regression
    student_data['performance_level'] = y_class  # Classification target (performance level)
    student_data['reading_fluency_score'] = y_reg + np.random.normal(0, 1, num_samples)  # Regression target with noise
    
    return student_data

# Generate the synthetic data
num_samples = 1000
synthetic_data = generate_synthetic_data(num_samples)

# Save the synthetic data as a CSV file
synthetic_data.to_csv('synthetic_dyslexia_data.csv', index=False)

# Optionally, print the first few rows of the data to check
synthetic_data.head(20)


Unnamed: 0,reading_speed,task_difficulty,reading_comprehension,engagement_level,error_rate,task_completion_time,mood_engagement_score,success_rate,performance_level,reading_fluency_score
0,-0.439643,0.542547,-0.82242,0.401366,-0.85484,2.304901,4,0.323022,0,74.778482
1,2.822231,-2.480859,-1.147691,-2.101131,3.040278,-0.868324,3,0.977226,1,-42.526773
2,1.618386,-1.369478,-2.084113,-1.179659,1.613602,1.399851,5,0.548167,1,-24.710152
3,1.659048,-0.615202,1.112688,-0.835098,-0.272205,0.65826,7,0.768067,1,-87.606834
4,1.849824,-1.679456,-0.926698,-1.402509,2.123129,-2.18048,5,1.320151,1,-86.37957
5,0.077111,0.23716,0.584413,0.087275,-0.661753,-0.332678,3,0.861731,0,-38.939408
6,-0.388754,0.563365,0.014273,0.394581,-0.960128,1.403124,1,0.427461,0,5.115695
7,2.040959,-0.508362,-1.776235,-0.909452,-0.941722,0.617742,3,0.656804,1,72.312183
8,1.052441,-0.048267,-0.917503,-0.367493,-1.007995,1.35389,3,0.498077,1,60.962256
9,0.763016,-0.15837,0.886887,-0.324968,-0.429449,0.453781,2,0.662157,0,-16.477455


Train a Classification Model

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset
synthetic_data = pd.read_csv('synthetic_dyslexia_data.csv')

# Features and target variable
features = ['reading_speed', 'task_difficulty', 'reading_comprehension', 'engagement_level', 'error_rate', 'task_completion_time', 'mood_engagement_score', 'success_rate']
target = 'performance_level'

X = synthetic_data[features]
y = synthetic_data[target]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model (Random Forest Classifier in this case)
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))




Accuracy: 94.50%
              precision    recall  f1-score   support

           0       0.93      0.96      0.94        97
           1       0.96      0.93      0.95       103

    accuracy                           0.94       200
   macro avg       0.95      0.95      0.94       200
weighted avg       0.95      0.94      0.95       200



Train a Regression Model

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Features and target variable
features = ['reading_speed', 'task_difficulty', 'reading_comprehension', 'engagement_level', 'error_rate', 'task_completion_time', 'mood_engagement_score', 'success_rate']
target = 'reading_fluency_score'

X = synthetic_data[features]
y = synthetic_data[target]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model (Random Forest Regressor)
model = RandomForestRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Mean Squared Error: 3266.53
R^2 Score: 0.12


Prediction with Classification Model

In [16]:
# Example: Making predictions for a new student 
# features = ['reading_speed', 'task_difficulty', 'reading_comprehension', 'engagement_level', 'error_rate', 'task_completion_time', 
# 'mood_engagement_score', 'success_rate']
new_student = np.array([[3.5, 0.6, 7, 8, 0.1, 3.0, 6, 0.85]])  # Example features for a new student
predicted_performance_level = model.predict(new_student)
print(f"Predicted Performance Level: {predicted_performance_level[0]}")

Predicted Performance Level: 1




Prediction with Regression Model

In [19]:

# Example: Making predictions for a new student
# features = ['reading_speed', 'task_difficulty', 'reading_comprehension', 'engagement_level', 'error_rate', 
# 'task_completion_time', 'mood_engagement_score', 'success_rate']
new_student = np.array([[3.5, 0.6, 7, 8, 0.1, 3.0, 6, 0.85]])  # Example features for a new student
predicted_reading_fluency = model.predict(new_student)
print(f"Predicted Reading Fluency: {predicted_reading_fluency[0]:.2f}")


Predicted Reading Fluency: 0.83




Save Model

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Generate synthetic data (or load your dataset)
synthetic_data = pd.read_csv('synthetic_dyslexia_data.csv')

# Features and target variable for classification (performance_level)
features = ['reading_speed', 'task_difficulty', 'reading_comprehension', 'engagement_level', 'error_rate', 'task_completion_time', 'mood_engagement_score', 'success_rate']
target_class = 'performance_level'

# Features and target variable for regression (reading_fluency_score)
target_reg = 'reading_fluency_score'

X = synthetic_data[features]
y_class = synthetic_data[target_class]
y_reg = synthetic_data[target_reg]

# Split the data into training and test sets
X_train, X_test, y_train_class, y_test_class, y_train_reg, y_test_reg = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42)

# Initialize and train the models
classification_model = RandomForestClassifier(random_state=42)
classification_model.fit(X_train, y_train_class)

regression_model = RandomForestRegressor(random_state=42)
regression_model.fit(X_train, y_train_reg)

# Now the models are trained and ready for saving
print("Models trained successfully.")


Models trained successfully.


In [3]:
import pickle

In [8]:
model_path = "classification_model.pkl"
with open(model_path, "wb") as f:
    pickle.dump(classification_model, f)
print("Model saved successfully.")

Model saved successfully.


In [9]:
model_path = "regression_model.pkl"
with open(model_path, "wb") as f:
    pickle.dump(regression_model, f)
print("Model saved successfully.")

Model saved successfully.


In [25]:
# Example: Making a prediction for classification
new_student_classification_data = np.array([[3.5, 0.6, 7, 8, 0.1, 3.0, 6, 0.85]])  # Example features
classification_prediction = classification_model.predict(new_student_classification_data)

# Example: Making a prediction for regression
new_student_regression_data = np.array([[3.5, 0.6, 7, 8, 0.1, 3.0, 6, 0.85]])  # Example features
regression_prediction = regression_model.predict(new_student_regression_data)

print(f"Classification prediction: {classification_prediction}")
print(f"Regression prediction: {regression_prediction}")


Classification prediction: [1]
Regression prediction: [0.83248512]




Connect ML to webapp

In [None]:
import json
import sys
import pandas as pd
from sklearn.externals import joblib

data = json.loads(sys.argv[1])
model = joblib.load('classification_model.pkl')

predictions = model.predict(pd.DataFrame(data))
print(json.dumps({'labels': ['A', 'B'], 'values': predictions.tolist()}))


In [None]:
import json
import sys
import numpy as np
from sklearn.externals import joblib

# Load models
regression_model = joblib.load('regression_model.pkl')

# Get input data
input_data = json.loads(sys.argv[1])
features = np.array(input_data['featureData'])

# Make predictions
predicted_values = regression_model.predict(features)

# Return as JSON
output = {
    'timeLabels': ['Week 1', 'Week 2', 'Week 3'],
    'predictedValues': predicted_values.tolist(),
}
print(json.dumps(output))


Test the prediction on frontend

In [1]:
import requests

url = "http://127.0.0.1:5000/predict"
headers = {"Content-Type": "application/json"}
data = {"features": [-0.439643, 0.542547,	-0.822420,	0.401366,	-0.854840,	2.304901,	4,	0.323022]}

response = requests.post(url, json=data, headers=headers)
print("Response:", response.json())


Response: {'error': '4 columns passed, passed data had 8 columns'}


In [11]:
import pickle

model_path = "model/classification_model.pkl"
with open(model_path, "rb") as file:
    model = pickle.load(file)

print(type(model))


<class 'sklearn.ensemble._forest.RandomForestClassifier'>
