In [1]:
import numpy as np
import pandas as pd
import sys
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
#Extracting data and preprocessing it
data = pd.read_excel("azeri_news.xlsx")
data["News_Article"] = data["Title"] + data["News_Article"] #I decided to concatenate both columns, cause we can use information
                                                            #from the title and article and they will be equally useful 
                                                            #in training our model(maybe title will be more useful)
data.head()

Unnamed: 0,Category,Title,News_Article
0,Maraqlı,Naviforce Sport Saat 2016 ilə zövqlərin ahəngi,Naviforce Sport Saat 2016 ilə zövqlərin ahəngi...
1,Maraqlı,"Sinir ,oynaq , sinir bel ağrılarına 3 gündə son !","Sinir ,oynaq , sinir bel ağrılarına 3 gündə so..."
2,Maraqlı,Dəyərindən qat-qat aşağı qiymətə Mənzil,Dəyərindən qat-qat aşağı qiymətə MənzilDəyərin...
3,İdman,2024 və 2028-ci il olimpiadalarının keçiriləcə...,2024 və 2028-ci il olimpiadalarının keçiriləcə...
4,Dünya,Türkiyədə zəlzələ,Türkiyədə zəlzələ Türkiyədə daha bir zəlzələ ...


In [3]:
#Splitting data(10% test, 90% train)
X_train, X_test, Y_train, Y_test = train_test_split(data['News_Article'], data['Category'], test_size = 0.1)

In [4]:
#Encoding labels
Encoder = LabelEncoder()
Y_train = Encoder.fit_transform(Y_train)
Y_test = Encoder.fit_transform(Y_test)
print(Y_train.shape)
print(Y_test.shape)

(45000,)
(5000,)


In [5]:
count_vect = CountVectorizer(max_features = 1000)
count_vect.fit(data['News_Article'])
X_train_counts = count_vect.transform(X_train)
print(X_train_counts.shape)
X_test_counts = count_vect.transform(X_test)
print(X_test_counts.shape)

(45000, 1000)
(5000, 1000)


In [6]:
tfidf_vect = TfidfVectorizer(max_features = 1000)
tfidf_vect.fit(data['News_Article'])
X_train_tfidf = tfidf_vect.transform(X_train)
print(X_train_tfidf.shape)
X_test_tfidf = tfidf_vect.transform(X_test)
print(X_test_tfidf.shape)
#X_train_tfidf.shape

(45000, 1000)
(5000, 1000)


In [7]:
classifier_counts = LinearSVC(max_iter = 2000)
classifier_counts.fit(X_train_counts, Y_train)
predictions_SVM_counts = classifier_counts.predict(X_test_counts)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_counts, Y_test)*100)
print("For test data:", accuracy_score(Y_test, predictions_SVM_counts))
print(confusion_matrix(Y_test, predictions_SVM_counts))
print(classification_report(Y_test, predictions_SVM_counts))

SVM Accuracy Score ->  80.47999999999999
For test data: 0.8048
[[1087  186    4   24   22   16]
 [ 199 1218   18   46   54   55]
 [  12   33   97    6    0    6]
 [  54   39    6  485    4   26]
 [  13   32    2    6  581    1]
 [  25   59    6   21    1  556]]
              precision    recall  f1-score   support

           0       0.78      0.81      0.80      1339
           1       0.78      0.77      0.77      1590
           2       0.73      0.63      0.68       154
           3       0.82      0.79      0.81       614
           4       0.88      0.91      0.90       635
           5       0.84      0.83      0.84       668

    accuracy                           0.80      5000
   macro avg       0.81      0.79      0.80      5000
weighted avg       0.80      0.80      0.80      5000





In [8]:
classifier_tfidf = LinearSVC(max_iter = 2000)
classifier_tfidf.fit(X_train_tfidf, Y_train)
predictions_SVM_tfidf = classifier_tfidf.predict(X_test_tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_tfidf, Y_test)*100)
print("For test data:", accuracy_score(Y_test, predictions_SVM_tfidf)) 
print(confusion_matrix(Y_test, predictions_SVM_tfidf))
print(classification_report(Y_test, predictions_SVM_tfidf))

SVM Accuracy Score ->  81.3
For test data: 0.813
[[1097  169    3   29   22   19]
 [ 202 1220    8   49   56   55]
 [  12   40   88   11    0    3]
 [  46   30    5  506    5   22]
 [  11   25    2    6  589    2]
 [  20   55    3   25    0  565]]
              precision    recall  f1-score   support

           0       0.79      0.82      0.80      1339
           1       0.79      0.77      0.78      1590
           2       0.81      0.57      0.67       154
           3       0.81      0.82      0.82       614
           4       0.88      0.93      0.90       635
           5       0.85      0.85      0.85       668

    accuracy                           0.81      5000
   macro avg       0.82      0.79      0.80      5000
weighted avg       0.81      0.81      0.81      5000



In [9]:
clf = MLPClassifier(hidden_layer_sizes=(30,30,30), max_iter=200, alpha=0.0001,
                     solver='adam', verbose=True,  random_state=22,tol=0.000000001)

In [10]:
clf.fit(X_train_counts, Y_train)

Iteration 1, loss = 0.88493550
Iteration 2, loss = 0.54891348
Iteration 3, loss = 0.48867748
Iteration 4, loss = 0.45367481
Iteration 5, loss = 0.42726627
Iteration 6, loss = 0.40281871
Iteration 7, loss = 0.38352201
Iteration 8, loss = 0.36496659
Iteration 9, loss = 0.34795978
Iteration 10, loss = 0.33324766
Iteration 11, loss = 0.31935038
Iteration 12, loss = 0.30530593
Iteration 13, loss = 0.29091647
Iteration 14, loss = 0.28004116
Iteration 15, loss = 0.26690954
Iteration 16, loss = 0.25938958
Iteration 17, loss = 0.24812894
Iteration 18, loss = 0.23738909
Iteration 19, loss = 0.22801208
Iteration 20, loss = 0.21908259
Iteration 21, loss = 0.21214904
Iteration 22, loss = 0.20417778
Iteration 23, loss = 0.19854043
Iteration 24, loss = 0.18830037
Iteration 25, loss = 0.18434766
Iteration 26, loss = 0.17468449
Iteration 27, loss = 0.17117454
Iteration 28, loss = 0.17054192
Iteration 29, loss = 0.16348558
Iteration 30, loss = 0.15467849
Iteration 31, loss = 0.14767959
Iteration 32, los

MLPClassifier(hidden_layer_sizes=(30, 30, 30), random_state=22, tol=1e-09,
              verbose=True)

In [11]:
clf_train_predict = clf.predict(X_train_counts)
print("For train data accuracy: ", accuracy_score(Y_train, clf_train_predict)) 

clf_test_predict = clf.predict(X_test_counts)
print("For test data:", accuracy_score(Y_test, clf_test_predict))
print(confusion_matrix(Y_test, clf_test_predict))
print(classification_report(Y_test, clf_test_predict))

For train data accuracy:  0.9832888888888889
For test data: 0.7726
[[1019  216    7   50   28   19]
 [ 224 1172   35   65   55   39]
 [  13   28   95   13    2    3]
 [  40   42    8  500    5   19]
 [  23   49    4    7  548    4]
 [  24   66    6   41    2  529]]
              precision    recall  f1-score   support

           0       0.76      0.76      0.76      1339
           1       0.75      0.74      0.74      1590
           2       0.61      0.62      0.61       154
           3       0.74      0.81      0.78       614
           4       0.86      0.86      0.86       635
           5       0.86      0.79      0.83       668

    accuracy                           0.77      5000
   macro avg       0.76      0.76      0.76      5000
weighted avg       0.77      0.77      0.77      5000



In [12]:
clf.fit(X_train_tfidf, Y_train)

Iteration 1, loss = 1.07966302
Iteration 2, loss = 0.57236999
Iteration 3, loss = 0.51245798
Iteration 4, loss = 0.48778357
Iteration 5, loss = 0.47178633
Iteration 6, loss = 0.45865035
Iteration 7, loss = 0.44640049
Iteration 8, loss = 0.43551139
Iteration 9, loss = 0.42688572
Iteration 10, loss = 0.41705751
Iteration 11, loss = 0.40804892
Iteration 12, loss = 0.40006795
Iteration 13, loss = 0.39221361
Iteration 14, loss = 0.38504290
Iteration 15, loss = 0.37743772
Iteration 16, loss = 0.37134917
Iteration 17, loss = 0.36363814
Iteration 18, loss = 0.35719614
Iteration 19, loss = 0.35050869
Iteration 20, loss = 0.34310440
Iteration 21, loss = 0.33745303
Iteration 22, loss = 0.33049847
Iteration 23, loss = 0.32493394
Iteration 24, loss = 0.31812837
Iteration 25, loss = 0.31295215
Iteration 26, loss = 0.30463468
Iteration 27, loss = 0.29928226
Iteration 28, loss = 0.29460489
Iteration 29, loss = 0.28687591
Iteration 30, loss = 0.28031155
Iteration 31, loss = 0.27544130
Iteration 32, los



MLPClassifier(hidden_layer_sizes=(30, 30, 30), random_state=22, tol=1e-09,
              verbose=True)

In [13]:
clf_train_predict = clf.predict(X_train_tfidf)
print("For train data accuracy: ", accuracy_score(Y_train, clf_train_predict)) 

clf_test_predict = clf.predict(X_test_tfidf)
print("For test data:", accuracy_score(Y_test, clf_test_predict))
print(confusion_matrix(Y_test, clf_test_predict))
print(classification_report(Y_test, clf_test_predict))

For train data accuracy:  0.9862
For test data: 0.7744
[[1003  230    5   47   31   23]
 [ 196 1194   27   56   66   51]
 [  11   32   97    7    1    6]
 [  42   50    9  482    5   26]
 [  19   51    2    7  553    3]
 [  21   67    5   29    3  543]]
              precision    recall  f1-score   support

           0       0.78      0.75      0.76      1339
           1       0.74      0.75      0.74      1590
           2       0.67      0.63      0.65       154
           3       0.77      0.79      0.78       614
           4       0.84      0.87      0.85       635
           5       0.83      0.81      0.82       668

    accuracy                           0.77      5000
   macro avg       0.77      0.77      0.77      5000
weighted avg       0.77      0.77      0.77      5000

