In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import numpy as np
from sklearn.naive_bayes import GaussianNB
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
transform = transforms.Compose([ #normalized data, can be done without normalized data but threshold will need to be changed
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)

In [5]:
def regions(image, size): #divide image into non overlapping regions of specified size
    image.numpy()
    regions = []
    for i in range(0, 28, size): 
        for j in range(0, 28, size):
            region = image[0, i:i+size, j:j+size]
            regions.append(region)
    return regions

def encode(region, threshold): # Currently very inefficient, need to get rid of for loops but works for now
    for i in range(len(region)):
        for j in range(len(region[0])):
            region[i][j] = 0 if region[i][j]< threshold else 1
    binary = ''.join(map(str, region.numpy().flatten().astype(int).tolist())) #bunch of mumbo jumbo to 'unfold' the region and turn it into one binary string
    value = int(binary, 2) #convert binary to decimal
    return value

In [30]:
region_size = 7 # for 4x4 region max binary value is 65535
threshold = -0.5

encoded_images = []
for image, label in train_dataset: #with the for loops, took my laptop around 30 min to run on 4x4
    r = regions(image, region_size)
    for i in range(len(r)):
        r[i] = encode(r[i], threshold)
    encoded_images.append(r) #in theory this encodes all the regions and stores them. each image is its own list of decimal regions
    
for i in range(len(encoded_images)):
    encoded_images[i] = [x for x in encoded_images[i] if x != 0] #remove any regions that are just 0 (all black/no important features according to threshold)
    


In [31]:
text_data = [' '.join(map(str,encoded_images[i])) for i in range(len(encoded_images))]
print(len(text_data))
print(text_data[2])

60000
2 129 8256 26596060923662 4433217897375 283691179835392 62089926279168 35182224605184 557662341367310 62057474949120


In [32]:
text_data = [' '.join(map(str,encoded_images[i])) for i in range(len(encoded_images))] #joins the regions together into one string instead of a list of strings

traindata = {'Document':text_data, 'Label':train_dataset.targets.tolist()} #dataframe of images and labels
traindf = pd.DataFrame(traindata)
print('LENGTH: ', len(traindf))

groupeddf = traindf.groupby('Label', as_index=False).agg({'Document': ''.join}) #dataframe of documents grouped by class (all documents of one class are combined into a mega document)
print('LENGTH: ', len(groupeddf))
nums = groupeddf['Document']
labels = groupeddf['Label']


LENGTH:  60000
LENGTH:  10


In [33]:
groupeddf.head()

Unnamed: 0,Label,Document
0,0,466814 4640475700784 562948714020993 283691315...
1,1,911 1 67009669708912 13299112724382 4255283140...
2,2,1662 13717922250753 557677373785982 16771 1372...
3,3,143 16383 64 137404593733647 562917504712702 2...
4,4,2 129 8256 26596060923662 4433217897375 283691...


In [34]:
#cvectorizer = CountVectorizer()
#count = cvectorizer.fit_transform(features)
#nums = cvectorizer.get_feature_names_out()

ctfidf, features = BERTopic()._c_tf_idf(groupeddf, fit=True) #class based TFIDF
ctfidf_array = ctfidf.toarray()

for idx, topic in enumerate(groupeddf['Label']): #print the top 10 words for each class
    print(f"Top words for Class {topic}:")
    top_indices = ctfidf_array[idx].argsort()[-10:][::-1]
    top_words = [features[i] for i in top_indices]
    print(top_words)
    
#For whatever reason it pulls in the same number as the highest rated number for all classes
#Can we just scrap those or is it a broader issue and are the rest of the numbers wrong?
#Same thing happened when the model was broken and was only 1s, it had all 1s preceded by the class number
#Here it is all 0s preceded by the class number
#also maybe need to go even bigger on the region based on the fact that the highest rated number for 3 classes is a white brick of all 1s



Top words for Class 0:
['4398046511104', '64', '13228499271680', '562932505116672', '129', '562949684985856', '4432406249472', '13298030395777', '283673999966208', '562949416550400']
Top words for Class 1:
['4432676798593', '425536972664928', '129', '496459801442416', '13298030395779', '4432674684928', '13298024054784', '425528314036224', '16513', '4432406249472']
Top words for Class 2:
['4398046511104', '64', '13194139533312', '4432406249472', '13228499271680', '562915593682944', '8256', '8288', '30786325577728', '558551906910208']
Top words for Class 3:
['64', '16383', '4398046511104', '562949684985856', '4432406249472', '13228499271680', '15484', '96', '12400', '8288']
Top words for Class 4:
['281474976710656', '64', '4432674684928', '8256', '283673999966208', '4432676782080', '4432406249472', '12384', '129', '13298030346240']
Top words for Class 5:
['4398046511104', '16383', '422212465065984', '127', '4432406249472', '64', '281474976710656', '129', '492581209243648', '16513']
Top w

### THOUGHTS?
Optimize encoding  
Create dictionary of top 10 or so words per class  
Use that as the training data in the naive bayes classifier  
Encode the testing data, remove all 0s  
Remove more data?
Classify

In [4]:
vectorizer = TfidfVectorizer() #individual document tfidf instead of class based tfidf
tfidf_matrix = vectorizer.fit_transform(text_data)

tfidf_array = tfidf_matrix.toarray()

feature_names = vectorizer.get_feature_names_out()


filtered_images = []
for i, image in enumerate(encoded_images):
    filtered_image = [
        int(feature_names[j])  # Convert feature name (string) back to integer
        for j in range(len(feature_names))
        if tfidf_array[i, j] >= 0.5  # Keep only values with TF-IDF >= 0.5
    ]
    filtered_images.append(filtered_image)

train_dataset = [filtered_images, train_dataset[:][[1]]]

