In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix

In [2]:
# preluat din lab4
class Bag_of_words:
    """Bag-of-words encoder over pre-tokenised documents.

    A document is any iterable of tokens. The vocabulary maps each distinct
    token to a column index, assigned in order of first appearance.
    """

    def __init__(self):
        # token -> column index
        self.vocabulary = {}
        # number of entries in the vocabulary (= number of feature columns)
        self.vocabulary_length = 0

    def build_vocabulary(self, data):
        """Add every unseen token found in ``data`` to the vocabulary.

        Parameters
        ----------
        data : iterable of iterables of tokens
            The documents to scan. Tokens already present keep their index;
            new tokens receive the next free index.
        """
        for tokens in data:
            for token in tokens:
                # len() is evaluated before insertion, so a new token gets the
                # next free index and an existing token is left untouched.
                self.vocabulary.setdefault(token, len(self.vocabulary))
        self.vocabulary_length = len(self.vocabulary)

    def get_features(self, data):
        """Return the token-count matrix of ``data``.

        Parameters
        ----------
        data : sequence of iterables of tokens
            The documents to encode (``len(data)`` must be defined).

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(len(data), self.vocabulary_length)``.
            Row ``i`` describes document ``i``; column ``j`` holds how many
            times the token with index ``j`` occurs in it. Tokens that are not
            in the vocabulary are ignored.
        """
        counts = np.zeros((len(data), self.vocabulary_length))
        for row, tokens in enumerate(data):
            for token in tokens:
                column = self.vocabulary.get(token)
                if column is not None:
                    counts[row, column] += 1
        return counts

In [3]:
# preluat din lab4
def accuracy_score(true_labels, predicted_labels):
    """Return the fraction of positions where the two label sequences agree.

    Parameters
    ----------
    true_labels, predicted_labels : array-like
        Label sequences of identical shape. Lists, tuples, NumPy arrays and
        pandas Series are accepted; they are compared position by position.

    Returns
    -------
    numpy.floating
        Share of matching positions, in ``[0, 1]`` (``nan`` for empty input).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Coerce to arrays so that `==` is element-wise; on plain lists it would
    # yield a single bool, which has no `.mean()`.
    true_labels = np.asarray(true_labels)
    predicted_labels = np.asarray(predicted_labels)
    # Refuse shapes that differ: broadcasting e.g. (n, 1) against (n,) would
    # silently compare every label with every prediction.
    if true_labels.shape != predicted_labels.shape:
        raise ValueError(
            "true_labels and predicted_labels must have the same shape, got "
            f"{true_labels.shape} and {predicted_labels.shape}"
        )
    return (true_labels == predicted_labels).mean()

In [4]:
# preluat din lab4
def normalize_data(train_data, test_data, type=None):
    """Optionally normalise two feature matrices with the same Normalizer.

    Parameters
    ----------
    train_data, test_data
        Feature matrices. The normaliser is fitted on ``train_data`` and then
        applied to both.
    type : {'l1', 'l2'} or None
        Norm passed to ``preprocessing.Normalizer``. Any other value disables
        normalisation. (The name shadows the ``type`` builtin inside this
        function; it is kept because callers pass it by keyword.)

    Returns
    -------
    tuple
        ``(train, test)``: the transformed matrices, or the inputs themselves
        (unchanged) when no normalisation was requested.
    """
    chosen_norm = next((name for name in ('l1', 'l2') if type == name), None)

    # Guard clause: nothing recognised was requested, hand the data back as-is.
    if chosen_norm is None:
        print("No scaling was performed. Raw data is returned.")
        return (train_data, test_data)

    normalizer = preprocessing.Normalizer(norm=chosen_norm)
    normalizer.fit(train_data)
    normalized_train = normalizer.transform(train_data)
    normalized_test = normalizer.transform(test_data)
    return (normalized_train, normalized_test)

In [5]:
# Training samples: tab-separated file without a header row.
# The separator is written as the escape '\t' rather than an invisible literal
# tab, so it survives editors that convert tabs to spaces.
train_sample_stuff = pd.read_csv('data/train_samples.txt', sep='\t', header=None)
# Keep only the texts (column 1); column 0 holds the sample id.
train_samples = train_sample_stuff[1]

train_label_stuff = pd.read_csv('data/train_labels.txt', sep='\t', header=None)
# Keep only the labels (column 1); column 0 holds the sample id.
train_labels = train_label_stuff[1]

# Split every training text on whitespace. Bag_of_words consumes these token
# lists to build the vocabulary and to count token occurrences.
train_list = [sample.split() for sample in train_samples]

In [8]:
# Validation samples and labels: tab-separated files without a header row.
# The separator is written as the escape '\t' rather than an invisible literal
# tab, so it survives editors that convert tabs to spaces.
validation_sample_stuff = pd.read_csv('data/validation_samples.txt', sep='\t', header=None)
# Column 1 holds the text; column 0 is not used here.
validation_samples = validation_sample_stuff[1]
validation_label_stuff = pd.read_csv('data/validation_labels.txt', sep='\t', header=None)
# Column 1 holds the label; column 0 is not used here.
validation_labels = validation_label_stuff[1]
# Split every validation text on whitespace into a list of tokens.
validation_list = [sample.split() for sample in validation_samples]

In [11]:
# Test samples: tab-separated file without a header row.
# The separator is written as the escape '\t' rather than an invisible literal
# tab, so it survives editors that convert tabs to spaces.
test_sample_stuff = pd.read_csv('data/test_samples.txt', sep='\t', header=None)
# Column 1 holds the text to classify.
test_samples = test_sample_stuff[1]
# Split every test text on whitespace into a list of tokens.
test_list = [sample.split() for sample in test_samples]

In [13]:
# Build the vocabulary from the training texts only: every distinct token is
# given the next free index, in order of first appearance.
bow_model = Bag_of_words()
bow_model.build_vocabulary(train_list) 

In [14]:
# Encode each split as a (documents x vocabulary) matrix of token counts,
# always against the vocabulary built from the training set.
train_features, validation_features, test_features = (
    bow_model.get_features(documents)
    for documents in (train_list, validation_list, test_list)
)

In [15]:
# Normalisation was used in some (unsuccessful) experiments; the raw count
# features are used as-is below. The normalised variant is kept for reference:
# scaled_train_data, scaled_validation_data = normalize_data(train_features, validation_features, type='l2')
scaled_train_data, scaled_validation_data = train_features, validation_features

In [None]:
# Support Vector Classifier (RBF kernel, C=12) fitted on the training set.
# fit() returns the estimator itself, so construction and training are chained.
svm_model = svm.SVC(kernel='rbf', C=12).fit(scaled_train_data, train_labels)

# Predict on the validation set, whose true labels are known, to measure how
# well the model performs.
predicted_labels_svm = svm_model.predict(scaled_validation_data)
model_accuracy_svm = accuracy_score(
    np.asarray(validation_labels),
    predicted_labels_svm,
)
print(" => accuracy: ", model_accuracy_svm * 100)

# Predict the labels of the test set as well.
# NOTE(review): test_features is passed without going through the
# normalisation step. That matches training only while normalisation stays
# disabled; if it is re-enabled, the test features must be transformed too.
predicted_test_labels_svm = svm_model.predict(test_features)

In [None]:
# WRITE THE PREDICTED LABELS TO THE SUBMISSION TEXT FILE

# Column 0 of the test file holds the sample ids.
test_ids = test_sample_stuff[0]

# x is the list of rows written to the submission file: a header row followed
# by one [id, predicted label] row per test sample.
x = [['id', 'label']]
x.extend(
    [test_ids[i], predicted_test_labels_svm[i]]
    for i in range(len(test_sample_stuff))
)

np.savetxt('submisii/sample_submission.txt', x, delimiter=',', newline='\n', fmt='%s')