In [13]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


file_path = "news.xls"




data = pd.read_excel(file_path)


print("Veri setinin ilk 5 satırı:")
print(data.head())


print("\nSütun isimleri:")
print(data.columns)


print("\nVeri seti bilgisi:")
print(data.info())



Veri setinin ilk 5 satırı:
                                             content category  \
0  Dışişleri Bakanı Davutoğlu, Yunanistan ile Tür...    dünya   
1  İsrail Gazze Şeridi'nin kuzeyindeki bir tarlay...    dünya   
2  Lübnan'ın başkenti Beyrut'ta düzenlenen bombal...    dünya   
3  KKTC'de Sendikal Platform genel grev başlattı....    dünya   
4  Türkiye'den yola çıkan Başak Bulut, Seçil Öznu...    dünya   

                                            headline  
0                           'Ortak vizyonumuz var'\r  
1        İsrail'den Gazze Şeridi'ne hava saldırısı\r  
2      Cenaze için geniş güvenlik önlemleri alındı\r  
3                Gözaltındaki sendikacılar serbest\r  
4  Bisikletle Asya'da 3 bin kilometre yol katetti...  

Sütun isimleri:
Index(['content', 'category', 'headline'], dtype='object')

Veri seti bilgisi:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41991 entries, 0 to 41990
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  -----

In [14]:
pip install xlrd

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
texts = data['content'].astype(str)    # Haber metinleri
labels = data['category'].astype(str)  # Kategoriler



In [20]:
def clean_text(text):
    text = text.lower()                       # Küçük harfe çevir
    text = re.sub(r'\d+', '', text)           # Sayıları kaldır
    text = re.sub(r'\W', ' ', text)           # Noktalama işaretlerini kaldır
    text = re.sub(r'\s+', ' ', text).strip() # Fazla boşlukları temizle
    return text

texts = texts.apply(clean_text)


In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels)


In [22]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [23]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)


In [24]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)


In [30]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))

print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr, zero_division=0))

print("\nNaive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb, zero_division=0))

print("\nLogistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))

print("\nNaive Bayes Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))





Logistic Regression Accuracy: 0.5587569948803429
Naive Bayes Accuracy: 0.5420883438504583

Logistic Regression Classification Report:
              precision    recall  f1-score   support

       dünya       0.53      0.53      0.53       745
     ekonomi       0.58      0.59      0.58       653
       genel       0.31      0.32      0.31      1334
      güncel       0.50      0.62      0.55      1169
kültür-sanat       0.67      0.45      0.54       231
     magazin       0.66      0.55      0.60       558
      planet       0.48      0.25      0.33       391
      sağlık       0.67      0.48      0.56       277
     siyaset       0.43      0.33      0.37       370
        spor       0.72      0.96      0.82      2000
   teknoloji       0.65      0.43      0.52       154
     türkiye       0.26      0.05      0.08       388
       yaşam       0.67      0.02      0.03       129

    accuracy                           0.56      8399
   macro avg       0.55      0.43      0.45      8399
