In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.tree  import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import  RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report as cl_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [61]:
import numpy as np
import pandas as pd
import cv2
import random

CANVAS = 280
FINAL = 28
SAMPLES_PER_DIGIT = 200

def draw_stroke(img, points, thickness):
    for i in range(len(points) - 1):
        cv2.line(img, points[i], points[i + 1], 255, thickness, cv2.LINE_AA)

def jitter(point, scale=8):
    return (
        int(point[0] + np.random.randn() * scale),
        int(point[1] + np.random.randn() * scale)
    )

def digit_paths(d):
    c = CANVAS // 2

    if d == 0:
        return [[
            (c-40, c-60), (c+40, c-60),
            (c+60, c), (c+40, c+60),
            (c-40, c+60), (c-60, c),
            (c-40, c-60)
        ]]

    if d == 1:
        return [[(c, c-70), (c, c+70)]]

    if d == 2:
        return [
            [(c-50, c-60), (c+50, c-60), (c+40, c-10)],
            [(c+40, c-10), (c-40, c+60), (c+50, c+60)]
        ]

    if d == 3:
        return [
            [(c-40, c-60), (c+40, c-60)],
            [(c+40, c-60), (c+40, c)],
            [(c+40, c), (c-40, c)],
            [(c+40, c), (c+40, c+60)],
            [(c-40, c+60), (c+40, c+60)]
        ]

    if d == 4:
        return [
            [(c-50, c), (c+50, c)],
            [(c+30, c-70), (c+30, c+70)]
        ]

    if d == 5:
        return [
            [(c+50, c-60), (c-40, c-60), (c-40, c)],
            [(c-40, c), (c+40, c), (c+40, c+60), (c-40, c+60)]
        ]

    if d == 6:
        return [[
            (c+40, c-50), (c-40, c),
            (c-40, c+50), (c+40, c+50),
            (c+40, c)
        ]]

    if d == 7:
        return [[(c-50, c-60), (c+50, c-60), (c, c+70)]]

    if d == 8:
        return [
            [(c-40, c-60), (c+40, c-60), (c+40, c), (c-40, c), (c-40, c-60)],
            [(c-40, c), (c+40, c), (c+40, c+60), (c-40, c+60), (c-40, c)]
        ]

    if d == 9:
        return [[
            (c-40, c+20), (c+40, c-40),
            (c+40, c-70), (c-40, c-70)
        ]]

def generate_digit(d):
    img = np.zeros((CANVAS, CANVAS), dtype=np.uint8)
    thickness = random.randint(14, 18)

    for path in digit_paths(d):
        pts = [jitter(p) for p in path]
        draw_stroke(img, pts, thickness)

    angle = random.uniform(-15, 15)
    M = cv2.getRotationMatrix2D((CANVAS//2, CANVAS//2), angle, 1)
    img = cv2.warpAffine(img, M, (CANVAS, CANVAS))

    img = cv2.resize(img, (FINAL, FINAL), interpolation=cv2.INTER_AREA)
    return img.flatten()

data = []
labels = []

for d in range(10):
    for _ in range(SAMPLES_PER_DIGIT):
        data.append(generate_digit(d))
        labels.append(d)

df = pd.DataFrame(data, columns=[f"p{i}" for i in range(784)])
df["digit"] = labels

In [45]:
df.sample(3)

Unnamed: 0,p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,...,p775,p776,p777,p778,p779,p780,p781,p782,p783,digit
966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
1497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
280,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [28]:
X, y = df.drop("digit", axis = 1), df["digit"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size= .7, random_state=42, shuffle=True)

## Model Training

In [35]:
lr_model = LogisticRegressionCV(
                cv = 3,
                Cs = [0.3, 0.5, 1 ,2], 
                penalty="l2", 
                solver="lbfgs", 
                multi_class="auto", 
                max_iter=800, 
                scoring="f1_macro", 
                n_jobs= -1, 
                refit = True
)

lr_model.fit(X_train, y_train)

train_pred, test_pred = lr_model.predict(X_train), lr_model.predict(X_test)

train_report, test_report = cl_report(y_train, train_pred), cl_report(y_test, test_pred)

print(test_report)



              precision    recall  f1-score   support

           0       1.00      1.00      1.00        52
           1       0.99      0.99      0.99        69
           2       1.00      0.97      0.99        72
           3       0.98      1.00      0.99        55
           4       1.00      1.00      1.00        54
           5       0.98      1.00      0.99        49
           6       1.00      1.00      1.00        57
           7       0.98      0.98      0.98        62
           8       1.00      1.00      1.00        64
           9       1.00      1.00      1.00        66

    accuracy                           0.99       600
   macro avg       0.99      0.99      0.99       600
weighted avg       0.99      0.99      0.99       600



In [36]:
print(train_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       148
           1       1.00      1.00      1.00       131
           2       1.00      1.00      1.00       128
           3       1.00      1.00      1.00       145
           4       1.00      1.00      1.00       146
           5       1.00      1.00      1.00       151
           6       1.00      1.00      1.00       143
           7       1.00      1.00      1.00       138
           8       1.00      1.00      1.00       136
           9       1.00      1.00      1.00       134

    accuracy                           1.00      1400
   macro avg       1.00      1.00      1.00      1400
weighted avg       1.00      1.00      1.00      1400



###  Mixing Created data with the past data for giving more grip to the model

In [62]:
df_real = pd.read_csv("data/real_streamlit_samples.csv")

df_real.shape

(605, 785)

In [63]:
df_real.insert(len(df.columns)-1, "digit", df_real.pop("label"))

  df_real.insert(len(df.columns)-1, "digit", df_real.pop("label"))


In [66]:
df = pd.concat([df, df_real], ignore_index=True)

df.sample(3)

Unnamed: 0,p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,...,p775,p776,p777,p778,p779,p780,p781,p782,p783,digit
2536,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
2521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
1328,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6


In [68]:
X, y = df.drop("digit", axis = 1), df["digit"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size= 0.8, random_state=42, shuffle=True)

## Retraining the model using new combination of data

In [69]:
lr_model = LogisticRegressionCV(
                cv = 3,
                Cs = [0.3, 0.5, 1 ,2], 
                penalty="l2", 
                solver="lbfgs", 
                multi_class="auto", 
                max_iter=800, 
                scoring="f1_macro", 
                n_jobs= -1, 
                refit = True
)

lr_model.fit(X_train, y_train)

train_pred, test_pred = lr_model.predict(X_train), lr_model.predict(X_test)

train_report, test_report = cl_report(y_train, train_pred), cl_report(y_test, test_pred)

print(test_report)



              precision    recall  f1-score   support

           0       1.00      1.00      1.00        36
           1       1.00      1.00      1.00        42
           2       1.00      1.00      1.00        54
           3       1.00      1.00      1.00        46
           4       1.00      1.00      1.00        58
           5       1.00      0.98      0.99        46
           6       1.00      1.00      1.00        63
           7       1.00      1.00      1.00        68
           8       0.98      1.00      0.99        53
           9       1.00      1.00      1.00        55

    accuracy                           1.00       521
   macro avg       1.00      1.00      1.00       521
weighted avg       1.00      1.00      1.00       521



In [70]:
print(train_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       164
           1       1.00      0.99      0.99       180
           2       0.99      1.00      0.99       174
           3       1.00      1.00      1.00       186
           4       1.00      1.00      1.00       199
           5       1.00      1.00      1.00       218
           6       1.00      1.00      1.00       248
           7       1.00      1.00      1.00       221
           8       1.00      1.00      1.00       247
           9       1.00      1.00      1.00       247

    accuracy                           1.00      2084
   macro avg       1.00      1.00      1.00      2084
weighted avg       1.00      1.00      1.00      2084



In [71]:
import joblib

joblib.dump(lr_model, "digit_lr_model.pkl")

['digit_lr_model.pkl']