In [51]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [52]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [53]:
df=pd.read_excel('news.xlsx')

In [54]:
df

Unnamed: 0,Category,Title,News_Article
0,Maraqlı,Naviforce Sport Saat 2016 ilə zövqlərin ahəngi,Naviforce Sport Saat 2016 Yapon Mexanizmi Yapo...
1,Maraqlı,"Sinir ,oynaq , sinir bel ağrılarına 3 gündə son !","ŞOK ! ŞOK ! ŞOK ! Xanımlar və bəylər , bel və ..."
2,Maraqlı,Dəyərindən qat-qat aşağı qiymətə Mənzil,Dəyərindən qat-qat Aşağı Qiymətə. Həzi Aslanov...
3,İdman,2024 və 2028-ci il olimpiadalarının keçiriləcə...,2028-ci il Yay Olimpiya və Paralimpiya Oyunla...
4,Dünya,Türkiyədə zəlzələ,Türkiyədə daha bir zəlzələ meydana gəlib. L...
...,...,...,...
49995,Siyasət,Əli Kərimli terrorçularla da əməkdaşlığa hazır...,"AXCP sədri ""Milli Şura""nın mitinqində qəsdən..."
49996,Siyasət,Elşən Musayev: “Bəlkə elə o terrorist Əli Kəri...,"""Ümumiyyətlə, Milli Şuraya xoş olan, hətta o..."
49997,Dünya,İstanbulda 52 mərtəbəli binada yanğın baş verib,Türkiyənin İstanbul şəhərinin Ataşehir rayon...
49998,Dünya,“Onlar cinayətkarlardır və bir-bir məhv edilir...,Rusiyanı tərk edərək İŞİD sıralarında döyüşə...


In [55]:
# Step 2: Preprocess the Text (Includes Title + News_Article)
def preprocess_text(text):
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = text.lower()  # Convert to lowercase
    words = word_tokenize(text)
    words = [word for word in words if word not in stopwords.words('azerbaijani')]  # Azerbaijani stopwords
    return ' '.join(words)

# Combine Title and News Article for a unified feature
df['Combined_Text'] = df['Title'] + ' ' + df['News_Article']
df['Adjusted_Text'] = df['Combined_Text'].apply(preprocess_text)

In [56]:
# Step 3: Split Data into Train and Test Sets
X = df['Adjusted_Text']
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [57]:
# Step 4: Feature Extraction (Bag-of-Words and TF-IDF)
vocab_size = 500  # Experiment with different sizes

# Bag-of-Words
bow_vectorizer = CountVectorizer(max_features=vocab_size)
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

In [58]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=vocab_size)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [59]:
X_train_tfidf

<40000x500 sparse matrix of type '<class 'numpy.float64'>'
	with 1113278 stored elements in Compressed Sparse Row format>

In [None]:
# Step 5: Model Training and Evaluation
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict on test set

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_test, y_pred)

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print("Confusion Matrix:\n", cm)

In [None]:
# a) Decision Tree Classifier
print("\n--- Decision Tree Classifier (Bag-of-Words) ---")
dt_params = {'max_depth': [10, 20, 30], 'min_samples_split': [2, 5, 10]}
dt_grid = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_params, cv=3)
evaluate_model(dt_grid, X_train_bow, X_test_bow, y_train, y_test)

print("\n--- Decision Tree Classifier (TF-IDF) ---")
evaluate_model(dt_grid, X_train_tfidf, X_test_tfidf, y_train, y_test)


--- Decision Tree Classifier (Bag-of-Words) ---
Accuracy: 0.71
Precision: 0.71
Recall: 0.71
F1 Score: 0.71
Confusion Matrix:
 [[2128  407   10   78   69   53]
 [ 520 2270   35  113   94  140]
 [  36   74  116   34   14   30]
 [ 113  148   21  866   21  100]
 [ 112  147    7   25  942   31]
 [  83  189   11  120   20  823]]

--- Decision Tree Classifier (TF-IDF) ---
Accuracy: 0.70
Precision: 0.70
Recall: 0.70
F1 Score: 0.70
Confusion Matrix:
 [[2003  535    8   71   68   60]
 [ 522 2317   28   90   90  125]
 [  19  116   92   43   10   24]
 [  98  183   23  857   25   83]
 [ 105  177   11   17  935   19]
 [  89  235   20   88   10  804]]


In [None]:


# b) Naïve Bayes Classifier
print("\n--- Naïve Bayes Classifier (Bag-of-Words) ---")
nb_model = MultinomialNB()
evaluate_model(nb_model, X_train_bow, X_test_bow, y_train, y_test)

print("\n--- Naïve Bayes Classifier (TF-IDF) ---")
evaluate_model(nb_model, X_train_tfidf, X_test_tfidf, y_train, y_test)




--- Naïve Bayes Classifier (Bag-of-Words) ---
Accuracy: 0.75
Precision: 0.75
Recall: 0.75
F1 Score: 0.75
Confusion Matrix:
 [[2077  406   22   93   65   82]
 [ 590 2093   43  120  114  212]
 [  12   72  192   14   10    4]
 [  74   58   35 1001   19   82]
 [  47   54   19   20 1117    7]
 [  48   50   28   95   10 1015]]

--- Naïve Bayes Classifier (TF-IDF) ---
Accuracy: 0.74
Precision: 0.75
Recall: 0.74
F1 Score: 0.74
Confusion Matrix:
 [[2071  473    3   96   37   65]
 [ 567 2233    7  109   82  174]
 [  15  126  101   40    9   13]
 [  76   97    8 1001    8   79]
 [  58  118    5   20 1056    7]
 [  61   79    6  116    1  983]]


In [None]:

# c) Support Vector Classifier
print("\n--- Support Vector Classifier (Bag-of-Words) ---")
svc_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svc_grid = GridSearchCV(SVC(random_state=42), svc_params, cv=3)
evaluate_model(svc_grid, X_train_bow, X_test_bow, y_train, y_test)

print("\n--- Support Vector Classifier (TF-IDF) ---")
evaluate_model(svc_grid, X_train_tfidf, X_test_tfidf, y_train, y_test)


--- Support Vector Classifier (Bag-of-Words) ---
Accuracy: 0.80
Precision: 0.80
Recall: 0.80
F1 Score: 0.80
Confusion Matrix:
 [[2219  367    5   85   42   27]
 [ 429 2382   19  133   83  126]
 [  24   76  170   30    2    2]
 [  64   81   13 1054    8   49]
 [  35   66    3   19 1133    8]
 [  43   70    7   84    5 1037]]

--- Support Vector Classifier (TF-IDF) ---


In [22]:
X


Unnamed: 0,Processed_Text
0,naviforce sport zövqlərin ahəngi naviforce spo...
1,sinir oynaq sinir bel ağrılarına gündə son şok...
2,dəyərindən qat qat aşağı qiymətə mənzil dəyəri...
3,olimpiadalarının keçiriləcəyi şəhərlər müəyyən...
4,türkiyədə zəlzələ türkiyədə zəlzələ meydana gə...
...,...
49995,əli kərimli terrorçularla əməkdaşlığa hazırdır...
49996,elşən musayev terrorist əli kərimlinin adamıdı...
49997,i̇stanbulda mərtəbəli binada yanğın baş verib ...
49998,cinayətkarlardır məhv edilirlər peskov rusiyan...


In [23]:
y

Unnamed: 0,Category
0,Maraqlı
1,Maraqlı
2,Maraqlı
3,İdman
4,Dünya
...,...
49995,Siyasət
49996,Siyasət
49997,Dünya
49998,Dünya


In [60]:
from sklearn.decomposition import TruncatedSVD
import plotly.express as px

# Apply TruncatedSVD with 3 components
svd = TruncatedSVD(n_components=3, random_state=42)
X_reduced = svd.fit_transform(X_train_tfidf)  # X_train_tfidf is your TF-IDF matrix

# Create a DataFrame with reduced dimensions and categories
reduced_df = pd.DataFrame(X_reduced, columns=["x", "y", "z"])
reduced_df['category'] = y_train.values  # Add labels for visualization


In [61]:
# Plot 3D scatter plot
fig = px.scatter_3d(
    reduced_df,
    x='x', y='y', z='z',
    color='category',  # Color by category (news category)
    title="Dimensionality Reduction with TruncatedSVD",
    labels={"x": "x", "y": "y", "z": "z"},
)

# Show the plot
fig.show()
