In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Read the dataset from the CSV file that sits next to the notebook.
df = pd.read_csv("20newsgroups.csv")

# Columns are picked by position: the first becomes the model input
# (it is later passed to the text vectorizers), the second the label.
X, y = df.iloc[:, 0], df.iloc[:, 1]

In [3]:
# Number of distinct label values.
print("Broj različitih klasa:", y.nunique())

# Relative frequency of every label (normalize=True yields proportions,
# not raw counts); the heading and the table are emitted on separate lines.
print(
    "\nDistribucija po klasama:",
    y.value_counts(normalize=True),
    sep="\n",
)

Broj različitih klasa: 20

Distribucija po klasama:
target
10    0.053032
15    0.052943
8     0.052855
9     0.052766
11    0.052590
13    0.052501
7     0.052501
14    0.052413
5     0.052413
2     0.052236
12    0.052236
3     0.052148
6     0.051706
1     0.051617
4     0.051087
17    0.049850
16    0.048259
0     0.042425
18    0.041100
19    0.033322
Name: proportion, dtype: float64


In [5]:
# Hold out 30% of the rows for testing. The split is stratified on the
# labels and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Report the shape of each of the four partitions, one per line.
print(
    *(
        " ".join((label, str(part.shape)))
        for label, part in (
            ("X_train:", X_train),
            ("y_train:", y_train),
            ("X_test:", X_test),
            ("y_test:", y_test),
        )
    ),
    sep="\n",
)

X_train: (7919,)
y_train: (7919,)
X_test: (3395,)
y_test: (3395,)


In [6]:
# Term-count features: the vocabulary is learned from the training
# documents only, and the test documents are encoded with that same
# vocabulary.
vectorizer = CountVectorizer()
tf_train, tf_test = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [7]:
# Multinomial Naive Bayes on the count features; fit() returns the fitted
# estimator, so construction and training are chained.
model = MultinomialNB().fit(tf_train, y_train)

# Predicted labels for the held-out documents.
y_pred_tf = model.predict(tf_test)

In [8]:
# Count-feature model on the test split: accuracy first, then the
# confusion matrix, each starting on its own line.
print(
    accuracy_score(y_test, y_pred_tf),
    confusion_matrix(y_test, y_pred_tf),
    sep="\n",
)

0.8203240058910162
[[127   0   0   0   0   0   0   0   0   0   0   1   0   0   0  12   0   2
    0   2]
 [  0 128   0  11   1  21   1   1   0   0   0   5   0   1   3   2   0   1
    0   0]
 [  0  12  11  57  12  60   0   1   0   0   0  10   6   2   2   1   0   0
    3   0]
 [  0   4   0 140   8   9   3   2   0   0   0   4   5   1   0   1   0   0
    0   0]
 [  0   4   0  11 137   3   0   1   0   0   0   4   7   1   1   2   0   0
    2   0]
 [  0   8   0   1   0 159   0   0   0   1   0   4   1   0   3   0   1   0
    0   0]
 [  0   4   1  16   5   0  98  11   2   1   3  11   4   4  11   2   2   1
    0   0]
 [  0   0   0   0   0   3   3 160   1   0   0   0   4   1   1   0   1   1
    3   0]
 [  1   0   0   0   0   0   2   5 167   0   0   0   1   0   0   1   2   1
    0   0]
 [  0   1   0   0   0   0   0   0   0 162  10   0   1   0   1   1   0   1
    2   0]
 [  0   0   0   0   0   0   0   0   0   2 176   0   0   0   0   0   0   1
    1   0]
 [  0   1   1   0   0   0   0   0   0   0   0 

In [9]:
# TF-IDF features: fitted on the training documents only, then applied
# unchanged to the test documents.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train, tfidf_test = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

# Separate Multinomial Naive Bayes instance trained on the TF-IDF
# features; fit() returns the fitted estimator.
modelidf = MultinomialNB().fit(tfidf_train, y_train)
y_pred_tfidf = modelidf.predict(tfidf_test)

In [10]:
# TF-IDF model on the test split: accuracy first, then the confusion
# matrix, each starting on its own line.
print(
    accuracy_score(y_test, y_pred_tfidf),
    confusion_matrix(y_test, y_pred_tfidf),
    sep="\n",
)

0.8256259204712812
[[104   0   0   0   0   0   0   0   0   0   0   0   0   0   0  39   0   0
    0   1]
 [  0 125   5  13   1  10   1   2   0   1   0   8   0   0   1   8   0   0
    0   0]
 [  0   4 132  21   1   5   0   1   0   0   1   6   1   0   0   5   0   0
    0   0]
 [  0   2  10 139   8   0   4   2   0   0   1   5   4   1   0   1   0   0
    0   0]
 [  0   1   0   6 144   0   1   1   2   1   2   8   3   1   1   2   0   0
    0   0]
 [  0   3   3   3   0 150   0   1   0   1   1  11   1   0   1   3   0   0
    0   0]
 [  0   1   5  13   3   0 103  12   1   3   6   9   2   3   2   9   3   1
    0   0]
 [  0   0   0   0   0   0   3 168   0   0   3   1   1   0   1   0   1   0
    0   0]
 [  0   0   0   0   0   0   2   7 167   0   0   0   1   0   0   3   0   0
    0   0]
 [  0   0   0   0   0   0   1   1   1 166   7   0   1   0   1   1   0   0
    0   0]
 [  0   0   0   0   0   0   0   0   0   4 175   0   0   0   0   0   1   0
    0   0]
 [  0   1   1   0   0   0   0   0   0   0   0 

In [11]:
# Closing remarks. Only the class count is computed (from y); the rest is
# fixed text. Both remarks are emitted by a single call, one per line.
print(
    " ".join(
        (
            "- Skup ima",
            str(y.nunique()),
            "klasa i relativno je balansiran → accuracy je adekvatna metrika.",
        )
    ),
    "- TF-IDF obično daje nešto bolje rezultate jer smanjuje značaj vrlo čestih reči.",
    sep="\n",
)

- Skup ima 20 klasa i relativno je balansiran → accuracy je adekvatna metrika.
- TF-IDF obično daje nešto bolje rezultate jer smanjuje značaj vrlo čestih reči.
