In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
from sklearn import preprocessing, decomposition
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import confusion_matrix
import itertools
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
%matplotlib inline
from yellowbrick.text import FreqDistVisualizer
import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification

In [None]:
# Load the CSV from the working directory into a DataFrame; later cells rename
# its 'Content' and 'BIRADS Score' columns.
data = pd.read_csv(filepath_or_buffer='main_dataframe.csv')

In [None]:
# Give the two columns used downstream short, code-friendly names:
# the text column becomes 'content' and the target column becomes 'label'.
column_aliases = {'Content': 'content', 'BIRADS Score': 'label'}
data = data.rename(columns=column_aliases)

In [None]:
# Basic preprocessing: lower-case the text, then drop every character that is
# neither a word character nor whitespace (i.e. punctuation).
# `regex=True` must be passed explicitly: since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, in which case nothing
# would be removed at all.
data['content'] = (
    data['content']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)


In [None]:

from sklearn.preprocessing import LabelEncoder

# Turkish function words handed to the TF-IDF vectorizer as its stop-word list.
turkish_stop_words = [
    'acaba', 'ama', 'aslında', 'az',
    'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu',
    'çok', 'çünkü',
    'da', 'daha', 'de', 'defa', 'diye',
    'eğer', 'en',
    'gibi',
    'hem', 'hep', 'hepsi', 'her', 'hiç',
    'için', 'ile', 'ise',
    'kez', 'ki', 'kim',
    'mı', 'mu', 'mü',
    'nasıl', 'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye',
    'o',
    'sanki', 'şey', 'siz', 'şu',
    'tüm', 've', 'veya', 'ya', 'yani'
]

# Hold out 12.5% of the rows for validation. Stratifying on the label keeps the
# class proportions the same in both parts; the fixed seed makes the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['content'],
    data['label'],
    test_size=0.125,
    random_state=42,
    stratify=data['label'],
)

# Map the raw labels to integer class ids. The mapping is learned from the
# training labels and then applied unchanged to the validation labels.
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)
train_labels_encoded = label_encoder.transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)



# Initialize the TF-IDF vectorizer: word tokens of length >= 1, 1- to 3-grams,
# accents stripped, smoothed IDF and sublinear (1 + log) term frequency.
tfidf = TfidfVectorizer(strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                        ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True)

# Stop words are matched against tokens *after* preprocessing (lower-casing and
# accent stripping). Run the list through the vectorizer's own preprocessor so
# that accented entries such as 'çok' or 'için' match the stripped tokens
# ('cok', 'icin'); left as-is they would never be filtered out.
# NOTE: because accents are stripped from the documents too, a stop word and an
# unaccented look-alike (e.g. 'şu' / 'su') become the same token.
normalize_text = tfidf.build_preprocessor()
tfidf.set_params(stop_words=sorted({normalize_text(word) for word in turkish_stop_words}))

# Learn the vocabulary and IDF weights from the training split only, so that no
# statistics from the validation texts leak into the features being evaluated.
X_train_tfidf = tfidf.fit_transform(train_texts)
X_val_tfidf = tfidf.transform(val_texts)

# Convert the TF-IDF matrices to DMatrix, the data structure that XGBoost uses
dtrain = xgb.DMatrix(X_train_tfidf, label=train_labels_encoded)
dval = xgb.DMatrix(X_val_tfidf, label=val_labels_encoded)


# Set the parameters for XGBoost
params = {
    'objective': 'multi:softmax',              # Multi-class classification, predicts class ids
    'num_class': len(label_encoder.classes_),  # Derived from the fitted encoder instead of hard-coding 5
    'max_depth': 6,                            # Maximum depth of the tree
    'eta': 0.3,                                # Learning rate
    'eval_metric': 'mlogloss',                 # Evaluation metric
    'seed': 42                                 # Random seed for reproducibility
}

# Train the model
num_rounds = 100  # Number of boosting rounds
bst = xgb.train(params, dtrain, num_rounds)

# Make predictions. `multi:softmax` returns the predicted class ids as floats;
# cast them to int so they line up with the encoded labels and can be passed to
# `label_encoder.inverse_transform` (which indexes with them).
val_predictions = bst.predict(dval).astype(int)

# Display names for the encoded classes 0..4.
# NOTE(review): assumes the label encoder's sorted class order corresponds to
# BIRADS 1..5 — confirm against `label_encoder.classes_`.
target_names = [
    "BIRADS-1",
    "BIRADS-2",
    "BIRADS-3",
    "BIRADS-4",
    "BIRADS-5",
]

# Per-class precision / recall / F1 on the validation split.
report_text = classification_report(
    y_true=val_labels_encoded,
    y_pred=val_predictions,
    target_names=target_names,
)
print(report_text)

# Confusion matrix: rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_true=val_labels_encoded, y_pred=val_predictions)
print("Confusion Matrix:", conf_matrix, sep="\n")




              precision    recall  f1-score   support

    BIRADS-1       0.67      0.33      0.44         6
    BIRADS-2       0.89      0.84      0.87        50
    BIRADS-3       0.92      1.00      0.96        56
    BIRADS-4       0.97      0.99      0.98        69
    BIRADS-5       1.00      1.00      1.00        69

    accuracy                           0.95       250
   macro avg       0.89      0.83      0.85       250
weighted avg       0.94      0.95      0.94       250

Confusion Matrix:
[[ 2  4  0  0  0]
 [ 1 42  5  2  0]
 [ 0  0 56  0  0]
 [ 0  1  0 68  0]
 [ 0  0  0  0 69]]


In [None]:
import pickle

# Persist every fitted artefact needed to score new text later:
#   1. the trained XGBoost booster `bst`,
#   2. the label encoder (maps predicted class ids back to the original labels),
#   3. the TF-IDF vectorizer (turns raw text into the feature matrix the booster
#      was trained on; without it the saved model cannot be applied to new text).

# Save the model
# NOTE(review): recent XGBoost releases prefer a '.json' / '.ubj' extension for
# saved models; the original file name is kept here so existing loaders keep working.
model_file_path = 'xgboost_model.bin'
bst.save_model(model_file_path)

# Save the label encoder
encoder_file_path = 'label_encoder.pkl'
with open(encoder_file_path, 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

# Save the fitted TF-IDF vectorizer (vocabulary, IDF weights and stop words)
vectorizer_file_path = 'tfidf_vectorizer.pkl'
with open(vectorizer_file_path, 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

# Security note: only unpickle these files from a trusted location, since
# loading a pickle can execute arbitrary code.
