# Step1: Import Librarys

In [103]:
from tensorflow.keras.datasets import imdb

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Step2: Loading the IMDB dataset




## Step2.1: Load dataset into train and test sets

In [158]:
Limited = 7000
(train_data, train_label), (test_data, test_label) = imdb.load_data(num_words=Limited)

## Step2.2: Showing the mapped integear words

In [105]:
word_index = imdb.get_word_index()
reverse_word_index = dict(
    [(value, key) for (key, value) in word_index.items()])

In [106]:
for doc_index in range(0,10):
  decoded_review = " ".join([reverse_word_index.get(i - 3, "?") for i in train_data[doc_index]])
  print(f'[{doc_index}]: {decoded_review}')

[0]: ? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big ? for the whole film but these children are amazing and should be praised for what they have done don't you thin

In [107]:
for doc_index in range(0,10):
  print(f'[{doc_index}] \t label:{train_label[doc_index]} \tdoc length:{str(len(train_data[doc_index])).ljust(4)}   doc[0:20]:{train_data[doc_index][0:20]}')

[0] 	 label:1 	doc length:218    doc[0:20]:[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25]
[1] 	 label:0 	doc length:189    doc[0:20]:[1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 4369, 5012, 134, 26, 4, 715, 8, 118, 1634, 14]
[2] 	 label:0 	doc length:141    doc[0:20]:[1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5974, 54, 61, 369, 13, 71, 149, 14]
[3] 	 label:1 	doc length:550    doc[0:20]:[1, 4, 2, 2, 33, 2804, 4, 2040, 432, 111, 153, 103, 4, 1494, 13, 70, 131, 67, 11, 61]
[4] 	 label:0 	doc length:147    doc[0:20]:[1, 249, 1323, 7, 61, 113, 10, 10, 13, 1637, 14, 20, 56, 33, 2401, 18, 457, 88, 13, 2626]
[5] 	 label:0 	doc length:43     doc[0:20]:[1, 778, 128, 74, 12, 630, 163, 15, 4, 1766, 2, 1051, 2, 32, 85, 156, 45, 40, 148, 139]
[6] 	 label:1 	doc length:123    doc[0:20]:[1, 6740, 365, 1234, 5, 1156, 354, 11, 14, 5327, 6638, 7, 1016, 2, 5940, 356, 44, 4, 1349, 500]
[7] 	 label:0 	doc length:562    doc[0:20]:[1, 4, 2, 716, 4, 65, 7, 4, 689, 

In [108]:
train_label[0:20]

array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1])

## Step2.3: Crop Data

In [159]:
# length before
print("length before crop")
print(len(train_data))
print(len(train_label))
print(len(test_data))
print(len(test_label))

rows_to_keep = Limited

train_data = train_data[:rows_to_keep]
test_data = test_data[:rows_to_keep]
train_label = train_label[:rows_to_keep]
test_label = test_label[:rows_to_keep]

# length After
print("length after crop")
print(len(train_data))
print(len(train_label))
print(len(test_data))
print(len(test_label))

length before crop
25000
25000
25000
25000
length after crop
7000
7000
7000
7000


# Step3: Create Word Counting Table (Data)

In [110]:
df = pd.DataFrame({'data': train_data, 'label': train_label})

df = df.explode('data')

word_counts_data = df.groupby(['data', 'label']).size().unstack(fill_value=0)

word_counts_data

label,0,1
data,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3451,3549
2,61455,69595
4,45140,49817
5,20629,25585
6,22475,23797
...,...,...
6995,9,7
6996,4,9
6997,3,9
6998,7,15


# Step4: Calculate Probabilities Parameters (Data)

In [111]:
total_count_group0 = word_counts_data[0].sum()
total_count_group1 = word_counts_data[1].sum()

flattened_data = [number for sublist in train_data for number in sublist]
unique_words_count = len(set(flattened_data))

print("total count group 0 : " + str(total_count_group0))
print("total count group 1 : " + str(total_count_group1))
print("unique words count : " + str(unique_words_count))

total count group 0 : 819481
total count group 1 : 863750
unique words count : 6998


# Step5: Create Probabilities Table (Data)

In [112]:
probabilities_table = word_counts_data.copy()

probabilities_table += 1

probabilities_table[0] /= (total_count_group0 + unique_words_count)
probabilities_table[1] /= (total_count_group1 + unique_words_count)

probabilities_table

label,0,1
data,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.004177,0.004077
2,0.074359,0.079927
4,0.054618,0.057213
5,0.024961,0.029384
6,0.027195,0.027331
...,...,...
6995,0.000012,0.000009
6996,0.000006,0.000011
6997,0.000005,0.000011
6998,0.000010,0.000018


# Step6: Create Confusion Matrix

## Step6.1: Create initial Matrix

In [113]:
confusion_matrix = pd.DataFrame({
    'Data': test_data,
    'Label': test_label
})

confusion_matrix

Unnamed: 0,Data,Label
0,"[1, 591, 202, 14, 31, 6, 717, 10, 10, 2, 2, 5,...",0
1,"[1, 14, 22, 3443, 6, 176, 7, 5063, 88, 12, 267...",1
2,"[1, 111, 748, 4368, 1133, 2, 2, 4, 87, 1551, 1...",1
3,"[1, 13, 1228, 119, 14, 552, 7, 20, 190, 14, 58...",0
4,"[1, 40, 49, 85, 84, 1040, 146, 6, 783, 254, 43...",1
...,...,...
6995,"[1, 51, 70, 30, 301, 44, 31, 7, 4, 833, 2, 162...",1
6996,"[1, 4, 20, 2100, 19, 2, 2, 5, 2234, 2, 9, 22, ...",1
6997,"[1, 13, 161, 66, 79, 14, 20, 88, 146, 24, 49, ...",0
6998,"[1, 14, 9, 6, 2812, 5436, 2, 22, 31, 15, 186, ...",1


## Step6.2: Add Pridicted Column

In [114]:
def calculate_predicted_label(row, probabilities_table):
    labels = row['Data']

    group0_product = 1
    group1_product = 1

    for label in labels:
        group0_prob = probabilities_table.loc[label, 0]
        group1_prob = probabilities_table.loc[label, 1]

        group0_product *= group0_prob
        group1_product *= group1_prob

    test_label_series = pd.Series(test_label)
    counts = test_label_series.value_counts()

    if label == 0:
        group0_product *= (counts[0]/Limited)
    elif label == 1:
        group1_product *= (counts[1]/Limited)

    predicted_label = 0 if group0_product > group1_product else 1

    return predicted_label

confusion_matrix['predicted_label'] = confusion_matrix.apply(lambda row: calculate_predicted_label(row, probabilities_table), axis=1)

confusion_matrix

Unnamed: 0,Data,Label,predicted_label
0,"[1, 591, 202, 14, 31, 6, 717, 10, 10, 2, 2, 5,...",0,0
1,"[1, 14, 22, 3443, 6, 176, 7, 5063, 88, 12, 267...",1,1
2,"[1, 111, 748, 4368, 1133, 2, 2, 4, 87, 1551, 1...",1,1
3,"[1, 13, 1228, 119, 14, 552, 7, 20, 190, 14, 58...",0,1
4,"[1, 40, 49, 85, 84, 1040, 146, 6, 783, 254, 43...",1,1
...,...,...,...
6995,"[1, 51, 70, 30, 301, 44, 31, 7, 4, 833, 2, 162...",1,1
6996,"[1, 4, 20, 2100, 19, 2, 2, 5, 2234, 2, 9, 22, ...",1,1
6997,"[1, 13, 161, 66, 79, 14, 20, 88, 146, 24, 49, ...",0,1
6998,"[1, 14, 9, 6, 2812, 5436, 2, 22, 31, 15, 186, ...",1,1


## Step6.3: Calculate the result of Confusion Matrix

In [115]:
confusion_matrix['result'] = ''
conditions = [
    (confusion_matrix['Label'] == 1) & (confusion_matrix['predicted_label'] == 1),  # True Positive (TP)
    (confusion_matrix['Label'] == 0) & (confusion_matrix['predicted_label'] == 0),  # True Negative (TN)
    (confusion_matrix['Label'] == 0) & (confusion_matrix['predicted_label'] == 1),  # False Positive (FP)
    (confusion_matrix['Label'] == 1) & (confusion_matrix['predicted_label'] == 0)   # False Negative (FN)
]

choices = ['TP', 'TN', 'FP', 'FN']

confusion_matrix['result'] = np.select(conditions, choices, default='')

confusion_matrix

Unnamed: 0,Data,Label,predicted_label,result
0,"[1, 591, 202, 14, 31, 6, 717, 10, 10, 2, 2, 5,...",0,0,TN
1,"[1, 14, 22, 3443, 6, 176, 7, 5063, 88, 12, 267...",1,1,TP
2,"[1, 111, 748, 4368, 1133, 2, 2, 4, 87, 1551, 1...",1,1,TP
3,"[1, 13, 1228, 119, 14, 552, 7, 20, 190, 14, 58...",0,1,FP
4,"[1, 40, 49, 85, 84, 1040, 146, 6, 783, 254, 43...",1,1,TP
...,...,...,...,...
6995,"[1, 51, 70, 30, 301, 44, 31, 7, 4, 833, 2, 162...",1,1,TP
6996,"[1, 4, 20, 2100, 19, 2, 2, 5, 2234, 2, 9, 22, ...",1,1,TP
6997,"[1, 13, 161, 66, 79, 14, 20, 88, 146, 24, 49, ...",0,1,FP
6998,"[1, 14, 9, 6, 2812, 5436, 2, 22, 31, 15, 186, ...",1,1,TP


# Step7: Calculate Measurments

In [116]:
TP = confusion_matrix[confusion_matrix['result'] == 'TP'].shape[0]
FP = confusion_matrix[confusion_matrix['result'] == 'FP'].shape[0]
TN = confusion_matrix[confusion_matrix['result'] == 'TN'].shape[0]
FN = confusion_matrix[confusion_matrix['result'] == 'FN'].shape[0]

precision = TP / (TP + FP) if TP + FP!= 0 else 0
recall = TP / (TP + FN) if TP + FN!= 0 else 0
accuracy = (TP + TN) / (TP + TN + FP + FN) if TP + TN + FP + FN!= 0 else 0

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")

Precision: 0.5280718639717678
Recall: 0.9550333623440673
Accuracy: 0.5575714285714286


# Step8: Calculate Gaussian Naive Bayes with Model

## Step8.1: Calclate the Confusion Matrix

In [160]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score

nb = GaussianNB()

vectorizer = CountVectorizer(binary=True)
train_data_matrix = vectorizer.fit_transform([" ".join([str(i) for i in review]) for review in train_data])
test_data_matrix = vectorizer.transform([" ".join([str(i) for i in review]) for review in test_data])

nb.fit(train_data_matrix.toarray(), train_label)

predicted_labels = nb.predict(test_data_matrix.toarray())

cm = confusion_matrix(test_label, predicted_labels)
results = np.where(test_label == predicted_labels, 'TN', np.where(predicted_labels == 1, 'FP', 'FN'))

data = {'Data': test_data, 'Label': test_label, 'predicted_label': predicted_labels, 'result': results}
confusion_matrix_df = pd.DataFrame(data)

confusion_matrix_df

Unnamed: 0,Data,Label,predicted_label,result
0,"[1, 591, 202, 14, 31, 6, 717, 10, 10, 2, 2, 5,...",0,0,TN
1,"[1, 14, 22, 3443, 6, 176, 7, 5063, 88, 12, 267...",1,1,TN
2,"[1, 111, 748, 4368, 1133, 2, 2, 4, 87, 1551, 1...",1,1,TN
3,"[1, 13, 1228, 119, 14, 552, 7, 20, 190, 14, 58...",0,0,TN
4,"[1, 40, 49, 85, 84, 1040, 146, 6, 783, 254, 43...",1,0,FN
...,...,...,...,...
6995,"[1, 51, 70, 30, 301, 44, 31, 7, 4, 833, 2, 162...",1,0,FN
6996,"[1, 4, 20, 2100, 19, 2, 2, 5, 2234, 2, 9, 22, ...",1,1,TN
6997,"[1, 13, 161, 66, 79, 14, 20, 88, 146, 24, 49, ...",0,0,TN
6998,"[1, 14, 9, 6, 2812, 5436, 2, 22, 31, 15, 186, ...",1,0,FN


## Step8.2: Calculate Measurments

In [161]:
precision = precision_score(test_label, predicted_labels)
recall = recall_score(test_label, predicted_labels)
accuracy = accuracy_score(test_label, predicted_labels)

print("\nPrecision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)


Precision: 0.7789203084832905
Recall: 0.5274151436031331
Accuracy: 0.6935714285714286
