In [12]:
import numpy as np

X = np.load("../data/language/processed/X_features.npy")
y = np.load("../data/language/processed/y_labels.npy")

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (154, 7)
y shape: (154,)


Train Test Split

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

print("Train:", X_train.shape)
print("Test:", X_test.shape)


Train: (115, 7)
Test: (39, 7)


Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

lr = LogisticRegression(
    max_iter=1000,
    class_weight="balanced"
)

lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

print("Classification Report (Language Model)")
print(classification_report(y_test, y_pred))
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))


Classification Report (Language Model)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00        38

    accuracy                           1.00        39
   macro avg       1.00      1.00      1.00        39
weighted avg       1.00      1.00      1.00        39

Confusion Matrix
[[ 1  0]
 [ 0 38]]


In [16]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression

base_model = LogisticRegression(class_weight="balanced", max_iter=1000)

calibrated_model = CalibratedClassifierCV(
    base_model,
    method="sigmoid",   # Platt scaling
    cv=2
)

calibrated_model.fit(X_train, y_train)


ValueError: Requesting 2-fold cross-validation but provided less than 2 examples for at least one class.

SVM

In [5]:
from sklearn.svm import SVC

svm = SVC(kernel="rbf", probability=True)
svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)

print("SVM Results")
print(classification_report(y_test, y_pred_svm))


SVM Results
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.97      1.00      0.99        38

    accuracy                           0.97        39
   macro avg       0.49      0.50      0.49        39
weighted avg       0.95      0.97      0.96        39



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Random Forest Results")
print(classification_report(y_test, y_pred_rf))


Random Forest Results
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00        38

    accuracy                           1.00        39
   macro avg       1.00      1.00      1.00        39
weighted avg       1.00      1.00      1.00        39



In [7]:
from sklearn.metrics import confusion_matrix

print("Confusion Matrix (Logistic Regression)")
print(confusion_matrix(y_test, y_pred_rf))
#Proves no normal detected , Everything flagged dyslexic


Confusion Matrix (Logistic Regression)
[[ 1  0]
 [ 0 38]]


In [8]:
#Using stratified cross validation 
#Using stratified k-fold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
recalls = []

for train_idx, test_idx in skf.split(X, y):
    X_tr, X_te = X[train_idx], X[test_idx]
    y_tr, y_te = y[train_idx], y[test_idx]

    model = LogisticRegression(
        max_iter=1000,
        class_weight="balanced"
    )
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    recalls.append(recall_score(y_te, y_pred))

print("Cross-validated recall:", sum(recalls)/len(recalls))



Cross-validated recall: 1.0




Feature Importance

In [9]:
feature_names = [
    "spelling_error_rate",
    "non_word_ratio",
    "phonetic_error_ratio",
    "repetition_score",
    "avg_word_length",
    "word_count",
    "rare_word_ratio"
]

for name, coef in zip(feature_names, lr.coef_[0]):
    print(f"{name:25s}: {coef:.4f}")


spelling_error_rate      : 0.0037
non_word_ratio           : 0.0037
phonetic_error_ratio     : 0.0000
repetition_score         : 0.0000
avg_word_length          : 0.0041
word_count               : -0.0060
rare_word_ratio          : 0.0037


In [10]:
import joblib
import os

os.makedirs("../models", exist_ok=True)

joblib.dump(lr, "../models/language_risk_model.pkl")
print("Language risk model saved")


Language risk model saved
