In [1]:
import numpy as np

def normalize_data(train_data, test_data, type=None):
    """Optionally normalize a train/test pair of feature arrays.

    Parameters
    ----------
    train_data : array-like
        Training features; with ``type="standard"`` the mean and standard
        deviation are estimated from this array only.
    test_data : array-like
        Test features; scaled with the statistics of ``train_data``.
    type : {None, "standard"}, optional
        ``None`` returns both inputs unchanged. ``"standard"`` subtracts the
        per-feature (axis 0) training mean and divides by the per-feature
        training standard deviation.

    Returns
    -------
    tuple
        ``(train_data, test_data)`` after the requested normalization.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported values.
    """
    if type is None:
        return train_data, test_data

    if type == "standard":
        train_data = np.asarray(train_data, dtype=float)
        test_data = np.asarray(test_data, dtype=float)

        # Statistics come from the training set only; scaling the test set
        # with its own mean/std would put the two sets on different scales
        # and leak test information into preprocessing.
        mean = train_data.mean(axis=0)
        std = train_data.std(axis=0)
        # A constant feature has std == 0; divide by 1 instead so the column
        # becomes all zeros rather than NaN/inf.
        std = np.where(std == 0, 1.0, std)

        return (train_data - mean) / std, (test_data - mean) / std

    # Previously an unknown type fell through and returned None, which made
    # the caller fail later on tuple unpacking with an unrelated error.
    raise ValueError(f"Unsupported normalization type: {type!r}")


In [29]:
# Load the four saved arrays from the working directory, in a fixed order.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code -- only use it on .npy files from a trusted source.
training_sentences, test_sentences, training_labels, test_labels = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in (
        'training_sentences.npy',
        'test_sentences.npy',
        'training_labels.npy',
        'test_labels.npy',
    )
)

In [61]:
class BagOfWords:
    """Bag-of-words encoder: maps tokens to column indices and counts them.

    ``vocabulary`` is a dict from token to the column index it occupies in
    the matrices returned by ``get_features``. Indices are assigned in order
    of first appearance, starting at 0.
    """

    def __init__(self):
        self.vocabulary = dict()

    def buildVocabulary(self, data):
        """Rebuild ``self.vocabulary`` from scratch using ``data``.

        Parameters
        ----------
        data : iterable
            Iterable of sentences, each itself an iterable of hashable tokens
            (presumably pre-tokenized word lists -- confirm against the data).
        """
        self.vocabulary = dict()
        for sentence in data:
            for word in sentence:
                # len() is evaluated before the insert, so a new word gets the
                # next free index; words already present keep their index.
                self.vocabulary.setdefault(word, len(self.vocabulary))

    def get_features(self, data):
        """Return the token-count matrix of ``data``.

        Parameters
        ----------
        data : sequence
            Sequence of sentences (must support ``len``), each an iterable of
            tokens.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(len(data), len(self.vocabulary))`` where
            entry ``[i, j]`` is how many times the token with index ``j``
            occurs in sentence ``i``. Tokens missing from the vocabulary are
            ignored.
        """
        freq = np.zeros((len(data), len(self.vocabulary)))
        for i, sentence in enumerate(data):
            for word in sentence:
                column = self.vocabulary.get(word)
                if column is not None:
                    freq[i, column] += 1

        return freq
                
        

In [82]:
# Fit the vocabulary on the training sentences only, then encode both splits
# as count matrices and pass them through normalize_data (called without a
# type, so the default is used).
bow = BagOfWords()
bow.buildVocabulary(training_sentences)
training_features, test_features = normalize_data(
    bow.get_features(training_sentences),
    bow.get_features(test_sentences),
)

In [83]:
from sklearn.svm import SVC

# Train a linear-kernel SVM on the training features; fit() returns the
# fitted estimator, so construction and training are chained.
svc = SVC(kernel='linear').fit(training_features, training_labels)

# Mean accuracy on the held-out test split (displayed as the cell output).
svc.score(test_features, test_labels)

0.9847826086956522

In [84]:

# Words in vocabulary order. BagOfWords assigns indices in insertion order,
# so feature_names[j] is the word counted in feature column j.
feature_names = np.array(list(bow.vocabulary.keys()))
coef = svc.coef_[0]

# Sort once (ascending weight) and reuse the ordering for both ends.
weight_order = coef.argsort()
top_negative_words = feature_names[weight_order[:10]]
top_positive_words = feature_names[weight_order[-10:][::-1]]

# For a binary linear SVC, positive weights push the decision towards
# svc.classes_[1] and negative weights towards svc.classes_[0].
# NOTE(review): the labels below assume classes_[1] is the spam class; the
# previous labels were the other way round, although the recorded output put
# spam-like tokens at the positive end -- confirm against training_labels.
print('Primele 10 cuvinte negative (non-spam):', top_negative_words)
print('Primele 10 cuvinte pozitive (spam):', top_positive_words)

Primele 10 cuvinte negative (spam): ['&lt#&gt' 'him' 'Oh' 'Waiting' 'Alright' 'me' 'Lmaonice' 'always' 'It'
 'goal']
Primele 10 cuvinte pozitive (non-spam): ['84484' 'ringtoneking' 'REAL' 'won' '85233' 'FREE>RingtoneReply'
 'httptms' 'widelivecomindex' 'Txt' 'For']
