In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import nltk
from nltk import word_tokenize
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

In [51]:
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier

### Učitavanje prethodno sačuvanih podatka

In [8]:
data = pd.read_csv('csv/data_filtered.csv')
print(data.shape)

(159412, 8)


In [9]:
balanced_data = pd.read_csv('csv/data_balanced.csv')
print(balanced_data.shape)

(31503, 8)


###  Podela na trening i test skup 
### Izdvajanje atributa - feature engineering
https://www.youtube.com/watch?v=YyOuDi-zSiI

https://medium.com/technovators/machine-learning-based-multi-label-text-classification-9a0e17f88bb4

Koristićemo TF-IDF reprezentaciju

Funkcija koja vrši podelu na trening i test skup, i nakon toga kreira vokabular i transformiše tekst:

In [10]:
def split_and_vectorize(data, max_features, name):
    data_text = data['comment_text'].values.astype('U')
    X = data.drop(labels=['id'], axis=1)
    y = data.drop(labels = ['id', 'comment_text'], axis=1)
    
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.33, random_state=42)
    
    vectorizer = TfidfVectorizer(strip_accents='unicode', tokenizer=word_tokenize, analyzer='word', ngram_range=(1,3), norm='l2', max_features = max_features)
   
    X_train_vec = vectorizer.fit_transform(X_train['comment_text'])
    X_test_vec = vectorizer.transform(X_test['comment_text'])

   # pickle.dump(vectorizer, open('tfidf_vectorizer_' + name + '.pickle', "wb"))
    return X_train_vec, X_test_vec, y_train, y_test

In [12]:
X_train, X_test, y_train, y_test = split_and_vectorize(data, 10000, 'data')

In [13]:
X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = split_and_vectorize(balanced_data, 20000, 'balanced')

In [14]:
vectorizer_data = pickle.load(open("tfidf_vectorizer_data.pickle", "rb"))
vectorizer_balanced = pickle.load(open("tfidf_vectorizer_balanced.pickle", "rb"))

In [19]:
features1 = vectorizer_data.get_feature_names()
features1[2020:2030]

['blog ,',
 'blood',
 'bloody',
 'bloody fool',
 'bloody fool bloody',
 'blow',
 'blp',
 'blue',
 'bnp',
 'board']

In [16]:
features2 = vectorizer_balanced.get_feature_names()
features2[2020:2025]

['abuse admin', 'abuse power', 'abusive', 'ac', 'academ']

## Igradnja modela

Postoje tri pristupa kada je u pitanju multilaberna klasifikacija

### Transformacija problema
where we divide the multi-label problem into one or more conventional single-label problems

- **Binary Relevance**: treats each label as a separate single class classification

- **Classifier Chains**: In this, the first classifier is trained just on the input data and then each next classifier is trained on the input space and all the previous classifiers in the chain.

- **Label Powerset**:we transform the problem into a multi-class problem with one multi-class classifier is trained on all unique label combinations found in the training data.

### Adaptacija problema
Some classification algorithms/models like (knn, Decision trees) have been adapted to the multi-label task, without requiring problem transformations.

### Ansabli


Funkcija koja računa metrike za napravljen model i daje izveštaj i matricu konfuzije:

In [29]:
def get_metrics(y_test, y_predicted, name):
    accuracy = metrics.accuracy_score(y_test, y_predicted)
    precision = metrics.precision_score(y_test, y_predicted, average='weighted', labels=np.unique(y_predicted))
    recall = metrics.recall_score(y_test, y_predicted, average='weighted', labels=np.unique(y_predicted))
    f1 = metrics.f1_score(y_test, y_predicted, average='weighted', labels=np.unique(y_predicted))
    auc = metrics.roc_auc_score(y_test, y_predicted)
    hamming_loss = metrics.hamming_loss(y_test, y_predicted)

    print(name + ':\n')
    
    print('Tačnost: {}'.format(accuracy))
    print('Preciznost: {}'.format(precision))
    print('Odziv: {}'.format(recall))
    print('F1 mera: {}'.format(f1))
    print('AUC vrednost:{}'.format(auc))
    print('Haming vrednost: {}'.format(hamming_loss))
    
    report = metrics.classification_report(y_test, y_predicted)
    confusion_matrix = multilabel_confusion_matrix(y_test, y_predicted)
    print('\nIzvestaj:\n {}'.format(report))
    print('\n')
    print('Matrica konfuzije:\n {}'.format(confusion_matrix))

In [21]:


clf = OneVsRestClassifier(LinearSVC(), n_jobs=1)

In [22]:
clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=LinearSVC(), n_jobs=1)

In [23]:
y_predicted = clf.predict(X_test)

In [30]:
get_metrics(y_test, y_predicted, 'All data')

All data:

Tačnost: 0.917804052769646
Preciznost: 0.8306549461122665
Odziv: 0.6268122427062824
F1 mera: 0.7129711238500114
AUC vrednost:0.7172989596547032
Haming vrednost: 0.018651231165012862

Izvestaj:
               precision    recall  f1-score   support

           0       0.87      0.67      0.75      5066
           1       0.48      0.23      0.31       521
           2       0.89      0.70      0.79      2795
           3       0.61      0.18      0.28       156
           4       0.80      0.56      0.66      2647
           5       0.62      0.29      0.39       453

   micro avg       0.84      0.61      0.71     11638
   macro avg       0.71      0.44      0.53     11638
weighted avg       0.83      0.61      0.70     11638
 samples avg       0.06      0.05      0.05     11638



Matrica konfuzije:
 [[[47020   520]
  [ 1685  3381]]

 [[51954   131]
  [  400   121]]

 [[49570   241]
  [  827  1968]]

 [[52432    18]
  [  128    28]]

 [[49581   378]
  [ 1157  1490]]

 [[520

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
clf1 = OneVsRestClassifier(LinearSVC(), n_jobs=1)

In [36]:
clf1.fit(X_train_balanced, y_train_balanced)

OneVsRestClassifier(estimator=LinearSVC(), n_jobs=1)

In [37]:
y_predicted_balanced = clf1.predict(X_test_balanced)

In [38]:
get_metrics(y_test_balanced, y_predicted_balanced, 'Balanced data')

Balanced data:

Tačnost: 0.6341862254713351
Preciznost: 0.8353661823381402
Odziv: 0.7825462381037889
F1 mera: 0.8046278960498456
AUC vrednost:0.7265086488906104
Haming vrednost: 0.0819545979222778

Izvestaj:
               precision    recall  f1-score   support

           0       0.87      0.84      0.86      5046
           1       0.48      0.24      0.32       523
           2       0.86      0.75      0.80      2767
           3       0.63      0.25      0.36       149
           4       0.74      0.62      0.68      2631
           5       0.62      0.29      0.39       448

   micro avg       0.82      0.71      0.76     11564
   macro avg       0.70      0.50      0.57     11564
weighted avg       0.81      0.71      0.75     11564
 samples avg       0.38      0.35      0.35     11564



Matrica konfuzije:
 [[[ 4727   623]
  [  811  4235]]

 [[ 9742   131]
  [  400   123]]

 [[ 7280   349]
  [  699  2068]]

 [[10225    22]
  [  112    37]]

 [[ 7196   569]
  [  997  1634]]

 [

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
# Chain sa SVM nije uspeo zbog memoriije



#clf_chain = ClassifierChain(LinearSVC())
#clf_chain.fit(X_train, y_train)

In [44]:
clf_balanced_chain = ClassifierChain(LinearSVC())
clf_balanced_chain.fit(X_train_balanced, y_train_balanced)

ClassifierChain(classifier=LinearSVC(), require_dense=[True, True])

In [45]:
y_predicted = clf_balanced_chain.predict(X_test_balanced)

In [53]:
rfClassifier = RandomForestClassifier(n_jobs=-1)
rfClassifier.fit(X_train_balanced, y_train_balanced)
rfPreds = rfClassifier.predict(X_test_balanced)

In [54]:
get_metrics(y_test_balanced, rfPreds, 'Random forest balanced')

Random forest balanced:

Tačnost: 0.6217776067718354
Preciznost: 0.8640832457540458
Odziv: 0.6922248159454121
F1 mera: 0.7569885813911984
AUC vrednost:0.6681784987749627
Haming vrednost: 0.08742144414518405

Izvestaj:
               precision    recall  f1-score   support

           0       0.90      0.76      0.82      5046
           1       0.51      0.07      0.12       523
           2       0.89      0.68      0.78      2767
           3       0.53      0.06      0.11       149
           4       0.80      0.52      0.63      2631
           5       0.81      0.09      0.16       448

   micro avg       0.87      0.62      0.72     11564
   macro avg       0.74      0.36      0.44     11564
weighted avg       0.85      0.62      0.70     11564
 samples avg       0.35      0.30      0.31     11564



Matrica konfuzije:
 [[[ 4931   419]
  [ 1228  3818]]

 [[ 9837    36]
  [  486    37]]

 [[ 7403   226]
  [  872  1895]]

 [[10239     8]
  [  140     9]]

 [[ 7417   348]
  [ 1272  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
