In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, make_scorer
import pandas as pd
from bertopic import BERTopic
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


### Grid search is all the way at the bottom, load the data up here then I think you can just skip to the bottom mega brick of code that has grid search and run that and it works.
  
  
  Currently using multinomial Naive Bayes as based on my preliminary research it seemed best for what we were doing however I could potentially be wrong about that so we could always switch it to something else

In [3]:
transform = transforms.Compose([ #normalized data, can be done without normalized data but threshold will need to be changed
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

In [5]:
def regions(image, size): #divide image into non overlapping regions of specified size
    image.numpy()
    regions = []
    for i in range(0, 28, size): 
        for j in range(0, 28, size):
            region = image[0, i:i+size, j:j+size]
            regions.append(region)
    return regions

def encode(region, threshold):
    region = torch.where(region < threshold, 0, 1)  # Vectorized thresholding
    binary_str = ''.join(map(str, region.flatten().int().tolist()))  # Convert to binary
    return int(binary_str, 2)  # Convert binary to decimal

In [96]:
region_size = 4 # for 4x4 region max binary value is 65535
threshold = -0.5

encoded_images = []
for image, label in train_dataset: #with the for loops, took my laptop around 30 min to run on 4x4
    r = regions(image, region_size)
    for i in range(len(r)):
        r[i] = encode(r[i], threshold)
    encoded_images.append(r) #in theory this encodes all the regions and stores them. each image is its own list of decimal regions
    
for i in range(len(encoded_images)):
    encoded_images[i] = [x for x in encoded_images[i] if x != 0] #remove any regions that are just 0 (all black/no important features according to threshold)

In [97]:
text_data = [' '.join(map(str,encoded_images[i])) for i in range(len(encoded_images))] #joins the regions together into one string instead of a list of strings

df = {'Document':text_data, 'Label':train_dataset.targets.tolist()} #dataframe of images and labels
df = pd.DataFrame(df)
print('LENGTH: ', len(df))

groupeddf = df.groupby('Label', as_index=False).agg({'Document': ''.join}) #dataframe of documents grouped by class (all documents of one class are combined into a mega document)
print('LENGTH: ', len(groupeddf))
nums = groupeddf['Document']
labels = groupeddf['Label']

LENGTH:  60000
LENGTH:  10


In [98]:
groupeddf.head()

Unnamed: 0,Label,Document
0,0,311 61439 12 4407 65500 64896 52430 19 64716 6...
1,1,887 2184 17 65518 14335 51200 4403 61132 30719...
2,2,63 1791 14194 65216 13175 3 2047 32751 311 652...
3,3,55 2047 4095 2252 28672 61440 62335 52424 887 ...
4,4,76 3276 1 52428 273 52360 4369 52431 31 255 16...


In [99]:
#cvectorizer = CountVectorizer()
#count = cvectorizer.fit_transform(features)
#nums = cvectorizer.get_feature_names_out()
ctfidf, features = BERTopic()._c_tf_idf(groupeddf, fit=True) #class based TFIDF
ctfidf_array = ctfidf.toarray()

score_threshold = 0.03

topdata = []
topweights = []
scaled_top_words = []
for idx, topic in enumerate(groupeddf['Label']): #print the top 10 words for each class
    print(f"Top words for Class {topic}:")
    top_indices = [i for i in range(len(features)) if ctfidf_array[idx][i] > score_threshold]
    top_words = [features[i] for i in top_indices]
    top_weights = [ctfidf_array[idx][i] for i in top_indices]
    topdata.append(top_words)
    topweights.append(top_weights)
    print(top_words)
    print(top_weights)
    
    expanded_words = []
    for word, weight in zip(top_words, top_weights):
        count = round(weight * 1000)
        expanded_words.extend([word] * count)

    # Append expanded words list for this class
    scaled_top_words.append(expanded_words)
    
#For whatever reason it pulls in the same number as the highest rated number for all classes
#Can we just scrap those or is it a broader issue and are the rest of the numbers wrong?
#Same thing happened when the model was broken and was only 1s, it had all 1s preceded by the class number
#Here it is all 0s preceded by the class number
#also maybe need to go even bigger on the region based on the fact that the highest rated number for 3 classes is a white brick of all 1s

Top words for Class 0:
['65535', '32768', '30583', '34952', '65534', '52428', '13107', '4369', '4095', '32767']
[0.054236074980597265, 0.03819115522705638, 0.0371329964032447, 0.03636803701866773, 0.03367587900251205, 0.032597379521376556, 0.031827154556204384, 0.031385897211588146, 0.030283610631406856, 0.03019481598182678]
Top words for Class 1:
['30583', '34952', '13107', '34816', '65535', '32768', '13175', '4403', '61132', '17']
[0.13020339861054825, 0.09340019731559018, 0.06718764045033856, 0.05533996791573236, 0.05267887757797038, 0.048416603798303176, 0.043771310575540935, 0.04353944867440237, 0.043515719627089125, 0.042677337087875154]
Top words for Class 2:
['65535', '65520', '15', '65280', '32768', '4096', '12', '32767', '255', '61440']
[0.06041848293700579, 0.04309225396078819, 0.038523567441857344, 0.036908608170267095, 0.03130764882374524, 0.03115215974522641, 0.026969546389008423, 0.026936917223232507, 0.026153947507166084, 0.024963010483831014]
Top words for Class 3:
['4

In [100]:
top_weights=[]
for idx, topic in enumerate(groupeddf['Label']): #print the top 10 words for each class
    print(f"Top weights for Class {topic}:")
    top_weights.append(ctfidf_array[idx].argsort()[-10:][::-1])
    print(top_weights)

Top weights for Class 0:
[array([18398,  5907,  4856,  7135, 18345, 11305,   777,  8710,  8490,
        5901], dtype=int64)]
Top weights for Class 1:
[array([18398,  5907,  4856,  7135, 18345, 11305,   777,  8710,  8490,
        5901], dtype=int64), array([ 4856,  7135,   777,  6684, 18398,  5907,   881,  8737, 14607,
        1773], dtype=int64)]
Top weights for Class 2:
[array([18398,  5907,  4856,  7135, 18345, 11305,   777,  8710,  8490,
        5901], dtype=int64), array([ 4856,  7135,   777,  6684, 18398,  5907,   881,  8737, 14607,
        1773], dtype=int64), array([18398, 18232,  1368, 17626,  5907,  8505,   126,  5901,  2939,
       15359], dtype=int64)]
Top weights for Class 3:
[array([18398,  5907,  4856,  7135, 18345, 11305,   777,  8710,  8490,
        5901], dtype=int64), array([ 4856,  7135,   777,  6684, 18398,  5907,   881,  8737, 14607,
        1773], dtype=int64), array([18398, 18232,  1368, 17626,  5907,  8505,   126,  5901,  2939,
       15359], dtype=int64), array

### Naive Bayes for text data
multinomial focuses on the frequency of a word?  
bernoulli focuses on if a word appears?  

In [101]:
encoded_test = []
for image, label in test_dataset: #with the for loops, took my laptop around 30 min to run on 4x4
    r = regions(image, region_size)
    for i in range(len(r)):
        r[i] = encode(r[i], threshold)
    encoded_test.append(r) #in theory this encodes all the regions and stores them. each image is its own list of decimal regions
    
for i in range(len(encoded_test)):
    encoded_test[i] = [x for x in encoded_test[i] if x != 0] #remove any regions that are just 0 (all black/no important features according to threshold)
    
text_test = [' '.join(map(str,encoded_test[i])) for i in range(len(encoded_test))]

In [102]:
top_data = [' '.join(map(str,topdata[i])) for i in range(len(topdata))]
traindata = pd.DataFrame({'Features':top_data, 'Class': labels})
X_train = traindata['Features']
y_train = traindata['Class']
X_test = text_test
y_test = test_dataset.targets.tolist()

In [110]:
top_data_scaled = [' '.join(map(str,scaled_top_words[i])) for i in range(len(scaled_top_words))]
traindata_scaled = pd.DataFrame({'Features':top_data_scaled, 'Class': labels})
X_train_scaled = traindata_scaled['Features']
y_train_scaled = traindata_scaled['Class']
X_test = text_test
y_test = test_dataset.targets.tolist()

In [112]:
print(len(X_train[0]))
print(len(X_train_scaled[0]))

57
2068


In [104]:
vectorizer = CountVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_train_vectors_scaled = vectorizer.fit_transform(X_train_scaled)
X_test_vectors = vectorizer.transform(X_test)

In [None]:
model = MultinomialNB()
model2 = BernoulliNB()
model.fit(X_train_vectors, y_train)
model2.fit(X_train_vectors, y_train)
y_pred = model.predict(X_test_vectors)
y_pred2 = model2.predict(X_test_vectors)
accuracy = accuracy_score(y_test, y_pred)
accuracy2 = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {accuracy * 100:.2f}%\n") #7x7 19-22%, 14x14 ~12-13%, 4x4 21-22% (-0.5 Binarization Threshold) (TOP 10 WORDS NO THRESHOLD)
print(f"Accuracy: {accuracy2 * 100:.2f}%\n")

Accuracy: 22.18%

Accuracy: 21.57%



In [None]:
model = MultinomialNB()
model2 = BernoulliNB()
model.fit(X_train_vectors_scaled, y_train)
model2.fit(X_train_vectors_scaled, y_train)
y_pred = model.predict(X_test_vectors)
y_pred2 = model2.predict(X_test_vectors)
accuracy = accuracy_score(y_test, y_pred)
accuracy2 = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {accuracy * 100:.2f}%\n") #7x7 19-22%, 14x14 ~12-13%, 4x4 21-22% (-0.5 Binarization Threshold) (TOP 10 WORDS NO THRESHOLD)
print(f"Accuracy: {accuracy2 * 100:.2f}%\n")

Accuracy: 22.18%

Accuracy: 21.57%



In [107]:
# vectorizer = TfidfVectorizer() #individual document tfidf instead of class based tfidf
# tfidf_matrix = vectorizer.fit_transform(text_data)

# tfidf_array = tfidf_matrix.toarray()

# feature_names = vectorizer.get_feature_names_out()


# filtered_images = []
# for i, image in enumerate(encoded_images):
#     filtered_image = [
#         int(feature_names[j])  # Convert feature name (string) back to integer
#         for j in range(len(feature_names))
#         if tfidf_array[i, j] >= 0.5  # Keep only values with TF-IDF >= 0.5
#     ]
#     filtered_images.append(filtered_image)

# #train_dataset = [filtered_images, train_dataset[:][[1]]]

KeyboardInterrupt: 

In [4]:

# Function to perform encoding, used by the grid search
def process_data(region_size, threshold, score_threshold, dataset):
    def pad_image(image, region_size):
        height, width = image.shape[1], image.shape[2]
        pad_height = (region_size - (height % region_size)) % region_size
        pad_width = (region_size - (width % region_size)) % region_size
        padded_image = torch.nn.functional.pad(image, (0, pad_width, 0, pad_height), mode='constant', value=0)
        return padded_image

    def regions(image, size):
        padded_image = pad_image(image, size)
        regions = []
        for i in range(0, padded_image.shape[1], size):
            for j in range(0, padded_image.shape[2], size):
                region = padded_image[0, i:i+size, j:j+size]
                regions.append(region)
        return regions

    def encode(region, threshold):
        region = torch.where(region < threshold, 0, 1)  # Vectorized thresholding
        binary_str = ''.join(map(str, region.flatten().int().tolist()))  # Convert to binary
        return int(binary_str, 2)  # Convert binary to decimal

    encoded_images = []
    for image, label in dataset:
        r = regions(image, region_size)
        for i in range(len(r)):
            r[i] = encode(r[i], threshold)
        encoded_images.append(r)

    # Remove empty regions (regions where no features were detected based on the threshold)
    for i in range(len(encoded_images)):
        encoded_images[i] = [x for x in encoded_images[i] if x != 0]  # Remove any regions that are just 0 (all black/no important features)

    text_data = [' '.join(map(str, encoded_images[i])) for i in range(len(encoded_images))]
    return text_data

# Wrapper function for the model and parameter search
def model_with_params(region_size, threshold, score_threshold, train_dataset, test_dataset):
    # Process data for both training and testing
    X_train = process_data(region_size, threshold, score_threshold, train_dataset)
    X_test = process_data(region_size, threshold, score_threshold, test_dataset)

    # Vectorize the data
    vectorizer = CountVectorizer()
    X_train_vectors = vectorizer.fit_transform(X_train)
    X_test_vectors = vectorizer.transform(X_test)

    # Train the model
    model = MultinomialNB()
    model.fit(X_train_vectors, train_dataset.targets.tolist())

    # Predict and calculate accuracy
    y_pred = model.predict(X_test_vectors)
    accuracy = accuracy_score(test_dataset.targets.tolist(), y_pred)
    
    return accuracy

# GridSearchCV wrapper for optimization
def grid_search(train_dataset, test_dataset):
    # Define the parameter grid for region_size, threshold, and score_threshold
    param_grid = {
        'region_size': [5, 6, 7], 
        'threshold': [-0.4, -0.3, -0.2, -0.1],  
        'score_threshold': [0.009, .008, .007, 0.01, 0.02]  
    }

    # Custom evaluation function for GridSearchCV
    def score_fn(model, X, y):
        # Make predictions and calculate accuracy
        y_pred = model.predict(X)
        return accuracy_score(y, y_pred)

    # Create a custom scorer function for GridSearchCV
    grid_search = GridSearchCV(estimator=None, param_grid=param_grid, scoring=make_scorer(score_fn), cv=3, n_jobs=-1)

    # Perform grid search
    best_params = None
    best_score = -1
    for region_size in param_grid['region_size']:
        for threshold in param_grid['threshold']:
            for score_threshold in param_grid['score_threshold']:
                # Train the model with the current parameters
                score = model_with_params(region_size, threshold, score_threshold, train_dataset, test_dataset)
                if score > best_score:
                    best_score = score
                    best_params = {
                        'region_size': region_size,
                        'threshold': threshold,
                        'score_threshold': score_threshold
                    }

    return best_params, best_score

# Example usage of the grid search
best_params, best_score = grid_search(train_dataset, test_dataset)

# Print the best parameters and score
print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")
#print out all runs going forward to identify the effects of parameters

#~60 min to run, 71.2% accuracy, 6, -0.3, .01
# 'region_size': [4, 5, 6, 7, 8, 9, 10?], 
# 'threshold': [-0.3, -0.5, -0.7]?,  
# 'score_threshold': [.01, .02, .03]?  


#~160 min to run, 72.04% accuracy, 6, -0.4, .009
# 'region_size': [5, 6, 7], 
# 'threshold': [-0.4, -0.3, -0.2, -0.1],  
# 'score_threshold': [0.009, .008, .007, 0.01, 0.02]  

Best Parameters: {'region_size': 6, 'threshold': -0.4, 'score_threshold': 0.009}
Best Score: 0.7204


### Thoughts
Experiment with bernoullis model? maybe a different version as well  
Also consider scaling the words for the multinomial model? may be worth a shot but based on my previous experiments with just the top 10 words it didnt really make a difference, but it may make a difference now with the ctfidf score threshold since each class will have a different amount of features within them and it wont just be exactly 10 words in each class getting scaled.

### Things to test
Outputs for each individual run to see what the individual parameters do  
Test overlappping layers  
Test regular TFIDF  
Test different versions of Naive Bayes  
Test other datasets  
Also explain ctfidf in presentation