In [2]:
# Simon Dudtschak
# 20166103
#
# Based on: SOM-CNN by Ashraf Neisari
# Available: https://github.com/Ashsari/spam_review_detection

In [3]:
import pandas as pd
import numpy as np
import random
import time

from preprocessing import Preprocessing
from Clustering_words_SOM import Clustering
from Constructing_images_classification import Classify
import matplotlib.pyplot as plt

from SRD_minisom import MiniSom

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import collections
from itertools import chain

In [4]:
# Draw a fresh seed for this run and apply it to both Python's and NumPy's
# global generators.
rand = random.randint(0,10000)
np.random.seed(rand)
random.seed(rand)
# Report the seed: it is chosen at random, so without this line there is no
# record of which value produced a given set of results.
print("Random seed:", rand)

# Start of the wall-clock measurement for the whole notebook run.
globalTime = time.perf_counter()

In [5]:
##############################################################
##                      PRE-PROCESSING                      ##
##############################################################
print("\nPRE-PROCESSING\n")

# Load the dataset; its first column is used as the review text.
#@EXTENSION
dataset = pd.read_csv('final_mfrc_data.csv')
reviews = dataset.iloc[:,0]

# Split every comma-separated "annotation" entry into its individual tokens
# (one list per row).
list_words = [row.split(',') for row in dataset["annotation"]]
# Flatten once, after all rows are collected. Doing this inside the loop
# rebuilt the full list on every row and left flat_list undefined for an
# empty dataset.
flat_list = list(chain.from_iterable(list_words))
# Distinct annotation tokens that occur anywhere in the dataset.
unique_words = set(flat_list)


PRE-PROCESSING



In [10]:
# Encode the class column (fourth column of the dataset) as integer ids, then
# expand those ids into one-hot rows with six positions each.
le = LabelEncoder()
y = le.fit_transform(dataset.iloc[:, 3])
label = to_categorical(y, num_classes=6)
label

array([[0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.]], dtype=float32)

In [14]:
# Pretrained GloVe word-embedding file and the dimension of its vectors.
embedding_dim = 300
filepath = 'glove.6B.300d.txt'

# Clean and tokenize the reviews.
print("Cleaning")
vocabulary, numWords, review_vocab, cleaned_reviews = Preprocessing.cleaning_reviews(
    reviews
)

# Drop duplicate words from the vocabulary.
# NOTE(review): the resulting order is whatever order the set iterates in;
# confirm whether later steps rely on a stable word order between runs.
vocabulary = [*set(vocabulary)]

Cleaning


In [16]:
# Build the embedding matrix for the vocabulary extracted from the reviews;
# each row is meant to hold the embedding vector of the corresponding word.
print("Embedding")
embedding_matrix = Preprocessing.create_embedding_matrix(
    filepath,
    vocabulary,
    embedding_dim,
)

# Collect the words the pretrained dictionary does not cover, so that they can
# be excluded later on.
nonzero_elements, not_foundWords, not_foundIndex = Preprocessing.not_found_words(
    embedding_matrix,
    vocabulary,
)


Embedding
The percentage of the covered words in the pretrained dictionary:  0.8740046287216099
number of not found words is:  3212


In [18]:
# Rescale every feature (embedding component) into the [0, 1] range.
print("Scaling")
sc = MinMaxScaler(feature_range=(0, 1))
X = sc.fit_transform(embedding_matrix)

Scaling


In [22]:
# Sanity check on the scaled embedding matrix: the second dimension should
# equal embedding_dim.
X.shape

(25493, 300)

In [28]:
# Calculate the TF-IDF of the vocabulary words from the cleaned reviews.
# NOTE(review): in the recorded run this call failed with a MemoryError while
# allocating an 11.6 GiB float64 array of shape (61226, 25457). The
# frequency-image step needs df_TF_IDF and missed_tfidfW, so it cannot run
# until this succeeds.
df_TF_IDF, missed_tfidfW = Preprocessing.words_frequency(cleaned_reviews, vocabulary)



MemoryError: Unable to allocate 11.6 GiB for an array with shape (61226, 25457) and data type float64

In [None]:
##########################################################
##                      CLUSTERING                      ##
##########################################################
print("\nCLUSTERING\n")

# Side length g of the SOM grid; both map dimensions use the same value.
#@EXTENSION
g = 50
X_of_map = y_of_map = g

# The SOM input length is the dimension of the embedding vectors.
input_len = X.shape[1]

# Neighborhood radius (sigma) s and learning rate l for the SOM.
s, l = 1, 1

# Create the SOM object that is trained on the word vectors prepared in the
# pre-processing step, then initialize the weights assigned to its grid cells.
som = MiniSom(
    x=X_of_map,
    y=y_of_map,
    input_len=input_len,
    sigma=s,
    learning_rate=l,
    random_seed=8,
)
som.random_weights_init(X)

# Train the SOM.
print("SOM Training")
#@EXTENSION
som.train_batch(data=X, num_iteration=1000)

# Separate the ham and spam reviews in the dataset.
print("Seperate")
ham, spam = Clustering.separate_ham_spam(reviews, y)

# Get the vocabulary lists used in the ham and in the spam reviews, keeping
# only unique words in each.
print("Cleaning")
vocabulary_ham, numWords_ham, review_vocab_ham, cleaned_reviews_ham = (
    Preprocessing.cleaning_reviews(ham)
)
vocabulary_spam, numWords_spam, review_vocab_spam, cleaned_reviews_spam = (
    Preprocessing.cleaning_reviews(spam)
)
vocabulary_ham = list(set(vocabulary_ham))
vocabulary_spam = list(set(vocabulary_spam))

# Words that appear in both the ham and the spam vocabularies.
matches = list(set(vocabulary_spam) & set(vocabulary_ham))

In [1]:
# Fetch the winning nodes of the word vectors from the SOM grid map.
# NOTE(review): this cell needs `som` and `X` from the clustering and scaling
# steps. The recorded execution ran it first on a fresh kernel and it failed
# with "NameError: name 'som' is not defined"; run the earlier cells first.
print("Calculate Win Map")
mappings = som.win_map(X)

Calculate Win Map


NameError: name 'som' is not defined

In [None]:
##############################################################
##                      CLASSIFICATION                      ##
##############################################################
print("\nCLASSIFICATION\n")

# Relate the vocabulary words to the SOM cells recorded in the win map.
map_cells = Classify.words_win_cel(mappings, vocabulary)

# Construct the review images based on the feature word density.
print("Create Density Image")
dense_img = Classify.create_dens_img(
    review_vocab,
    map_cells,
    vocabulary,
    not_foundWords,
    X_of_map,
    y_of_map,
)

In [None]:
# Construct the review images based on the feature word frequency.
print("Create Frequency Image")
frequency_img = Classify.create_freq_img(
    review_vocab,
    map_cells,
    vocabulary,
    missed_tfidfW,
    not_foundWords,
    X_of_map,
    y_of_map,
    df_TF_IDF,
)

# Reshape the density and frequency images so they are ready for the CNN.
review_images = Classify.reshape_img(dense_img, frequency_img)

# Hold out 20% of the images and labels for testing (fixed split seed).
X_train, X_test, y_train, y_test = train_test_split(
    review_images,
    label,
    test_size=0.2,
    random_state=252,
)

print("Create Model")
model = Classify.create_model(g, X_train, y_train, X_test, y_test)

In [None]:
# Report the model's accuracy on the training and the testing split.
loss, tr_accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(tr_accuracy)) # 
loss, te_accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(te_accuracy)) # 
print('\n')

# Predicted class = index of the highest score in each prediction row.
# The scores must not be thresholded at 0.5 first: with several classes a row
# often has no score above 0.5, the thresholded row becomes all zeros, and
# argmax then silently reports class 0 for that sample.
pred = model.predict(X_test)
predict_classes = np.argmax(pred, axis=1)
# y_test rows are one-hot, so argmax recovers the integer class ids.
expected_classes = np.argmax(y_test, axis = 1)
# Print the classification report.
evaluation = classification_report(expected_classes, predict_classes)
print(evaluation)

# Compute the ROC value from the confusion matrix of the result.
# NOTE(review): the labels are one-hot encoded with six classes; confirm that
# Classify.ROC_Val supports a confusion matrix with more than two classes.
matrix = confusion_matrix(expected_classes, predict_classes)
ROC_value, fprx, tpry = Classify.ROC_Val(matrix)

# Keep the start time in globalTime and store the duration separately, so that
# re-running this cell still reports a correct total.
total_time = time.perf_counter() - globalTime
print("Total Time: ", total_time)