In [None]:
import nltk
# Fetch the NLTK data packages used by this notebook: the "stopwords" corpus
# (source of the Turkish stop-word list) and the "punkt" tokenizer data.
nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords as STOPWORDS
from nltk import word_tokenize, sent_tokenize
# Turkish stop words from NLTK, held as a set. Read by preprocess() and passed
# as `stop_words` to the vectorizers in the experiment cells.
stopWords = set(STOPWORDS.words('turkish'))

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
def preprocess(docs, stopwords=None):
    """Normalise raw documents before they are vectorised.

    Each document goes through these steps, in order:

    1. A quote/apostrophe followed by word characters and a space is replaced
       by a single space (e.g. ``"Beşiktaş'ın eski"`` -> ``"Beşiktaş eski"``).
    2. Stop words are removed. A stop word is matched as a whole
       whitespace-delimited token, case-insensitively, so it is also removed at
       the start of a document or line, when capitalised, and when repeated.
       (The previous space-flanked, case-sensitive ``str.replace`` missed all
       of these cases.)
    3. Remaining quote characters and ellipses are deleted.
    4. Runs of digits are deleted.
    5. Runs of spaces are collapsed to one space and the text is lower-cased.

    Args:
        docs: Iterable of strings.
        stopwords: Optional collection of stop words. Defaults to the
            notebook-level ``stopWords`` set.

    Returns:
        list[str]: One processed string per input document, in input order.
    """
    if stopwords is None:
        stopwords = stopWords

    # One compiled alternation instead of one str.replace per stop word.
    # The lookarounds require whitespace (or the string edge) on both sides, so
    # only complete tokens are removed and adjacent stop words both match.
    # NOTE(review): case-insensitivity follows Python's regex case folding,
    # not Turkish locale rules.
    stopword_re = None
    if stopwords:
        alternatives = "|".join(re.escape(word) for word in sorted(stopwords))
        stopword_re = re.compile(r'(?<!\S)(?:' + alternatives + r')(?!\S)', re.IGNORECASE)

    result = []
    for d in docs:
        a = re.sub(r'[\'’\"”][\w]+ ', " ", d)
        if stopword_re is not None:
            a = stopword_re.sub(" ", a)
        a = re.sub(r'[“’‘\'\"”…]', "", a)
        a = re.sub(r'\d+', "", a)
        # Collapse every run of spaces; a single replace("  ", " ") pass left
        # double spaces behind when three or more were adjacent.
        a = re.sub(r' {2,}', " ", a)
        result.append(a.lower())
    return result

In [None]:
def read_data(path):
    """Load the CSV file at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

# Load the training split from the project's Google Drive folder.
# NOTE(review): the absolute Colab path assumes Drive is already mounted at
# /content/drive; nothing in this notebook mounts it.
path = "/content/drive/MyDrive/CS 445 Project 3/dataset/train.csv"
data = read_data(path)

# Load the test split; `path` is reused and now points at the test file.
path = "/content/drive/MyDrive/CS 445 Project 3/dataset/test.csv"
test_data = read_data(path)

In [None]:
# Split the training frame into its document column and its label column.
data_text = data["text"]
data_label = data["label"]

In [None]:
# Same split for the test frame.
test_data_text = test_data["text"]
test_data_label = test_data["label"]

In [None]:
# Show the first raw training document.
data_text[0]

"Beşiktaş'ın eski teknik direktörü Slaven Bilic, Türkiye-Hırvatistan maçında yorumculuk yapmak üzere Lig TV ile anlaştı.\nEURO 2016'nın yayıncı kuruluşlarından biri olan Lig TV, Türkiye'nin D Grubu'nda Hırvatistan ile oynayacağı ilk maç için Slaven Bilic ile anlaşıldığını duyurdu.\nBeşiktaş'ın eski teknik direktörü Slaven Bilic, 12 Haziran Pazar günü TSİ 16:00'da başlayacak mücadelede yorumcu olacak.\nLig TV, Slaven Bilic'in yanı sıra A Milli Takım'ın efsane kalecilerinden Rüştü Reçber'in de bu karşılaşmanın yorumcularından biri olacağını açıkladı.\nEURO 2008'de Hırvatistan'ın teknik direktörü olan Slaven Bilic, çeyrek finalde Türkiye'ye rakip olmuş ve 120 dakikası 1-1 biten maçta A Milli Takımımıza penaltılarda elenmişti."

In [None]:
# Run preprocess() over both splits; each result is a plain list of strings.
data_text_processed = preprocess(data_text)
test_data_text_processed = preprocess(test_data_text)

In [None]:
# Show the first training document after preprocess(), for comparison with the raw text.
data_text_processed[0]

'beşiktaş eski teknik direktörü slaven bilic, türkiye-hırvatistan maçında yorumculuk yapmak üzere lig tv anlaştı.\neuro yayıncı kuruluşlarından olan lig tv, türkiye d grubu hırvatistan oynayacağı ilk maç slaven bilic anlaşıldığını duyurdu.\nbeşiktaş eski teknik direktörü slaven bilic, haziran pazar günü tsi̇ : başlayacak mücadelede yorumcu olacak.\nlig tv, slaven bilic yanı sıra a milli takım efsane kalecilerinden rüştü reçber karşılaşmanın yorumcularından olacağını açıkladı.\neuro hırvatistan teknik direktörü olan slaven bilic, çeyrek finalde türkiye rakip olmuş dakikası - biten maçta a milli takımımıza penaltılarda elenmişti.'

In [None]:
# Integer id for each news category, assigned in listing order.
class_mapping = {
    category: index
    for index, category in enumerate(("turkiye", "dunya", "spor", "video", "yazarlar"))
}

In [None]:
# Zeroed per-category counter for the training labels.
counts = dict.fromkeys(("turkiye", "dunya", "spor", "video", "yazarlar"), 0)

In [None]:
# Tally the training labels into `counts`, then show the class distribution.
for label in data_label:
    counts[label] = counts[label] + 1

print(counts)

{'turkiye': 1630, 'dunya': 1606, 'spor': 1583, 'video': 1582, 'yazarlar': 1599}


In [None]:
# Reset the per-category counter before tallying the test labels.
counts = dict.fromkeys(("turkiye", "dunya", "spor", "video", "yazarlar"), 0)

In [None]:
# Tally the test labels into `counts`, then show the class distribution.
for label in test_data_label:
    counts[label] = counts[label] + 1

print(counts)

{'turkiye': 421, 'dunya': 395, 'spor': 384, 'video': 408, 'yazarlar': 392}


In [None]:
# Hold out 10% of the raw training texts as a validation set; random_state is fixed.
# X_train / X_valid / y_train / y_valid are read by the standalone model cells further down.
X_train, X_valid, y_train, y_valid = train_test_split(data_text, data_label, test_size=0.1, random_state=1)

In [None]:
# Experiment grid read by loopAllForNB: the feature type ("TF" -> CountVectorizer,
# "TF-IDF" -> TfidfVectorizer) and whether stop words are kept ("yes") or filtered out ("no").
vectorizerList = ["TF", "TF-IDF"]
includeStopWordsList = [ "yes", "no" ]

def loopAllForNB(data_text, data_label, test_data_text, test_data_label, data_type):
    """Evaluate Multinomial Naive Bayes for every vectorizer / stop-word setting.

    For each combination of ``includeStopWordsList`` x ``vectorizerList``:

    1. Fit on a 90% split of the training data and report on the held-out 10%.
    2. Refit on all training data and report on the test set.
    3. Tune ``alpha`` with ``GridSearchCV`` and report the tuned model on the
       test set.

    Every report is printed and also written to ``Naive Bayes_<data_type>.txt``
    in the project Drive folder. A summary of test accuracies is emitted last.

    Args:
        data_text: Training documents (strings).
        data_label: Labels aligned with ``data_text``.
        test_data_text: Test documents (strings).
        test_data_label: Labels aligned with ``test_data_text``.
        data_type: Tag used in report titles and in the output file name.

    Raises:
        ValueError: If ``vectorizerList`` or ``includeStopWordsList`` contains
            an unsupported value.
    """
    vectorizer_classes = {"TF": CountVectorizer, "TF-IDF": TfidfVectorizer}
    alpha_grid = [1.0, 0.1, 0.01, 0.001, 0.0001]

    # scikit-learn >= 1.2 validates `stop_words` and rejects a set, so hand the
    # vectorizers a list (older releases accept a list as well).
    stop_word_list = sorted(stopWords)

    # random_state is fixed, so the split is the same for every configuration:
    # compute it once instead of once per loop iteration.
    X_train, X_valid, y_train, y_valid = train_test_split(
        data_text, data_label, test_size=0.1, random_state=1
    )

    def write_report(out, stage, title, y_true, y_pred, trailer):
        """Print and write one classification report; return its accuracy."""
        heading = "Classification Report for Naive Bayesian " + stage + " -->"
        report = classification_report(y_true, y_pred)
        print(heading, title)
        print(report)
        out.write(heading + " " + title + "\n")
        out.write(report)
        out.write(trailer)
        return accuracy_score(y_true, y_pred)

    test_results = {}

    # `with` closes the report file even if a fit or predict call raises.
    with open("/content/drive/MyDrive/CS 445 Project 3/Naive Bayes_" + data_type + ".txt", "w") as outFile:
        for includeStopWords in includeStopWordsList:
            for vectorizerType in vectorizerList:
                if includeStopWords not in ("yes", "no") or vectorizerType not in vectorizer_classes:
                    raise ValueError(
                        "Unsupported configuration: vectorizer=%r, include stopwords=%r"
                        % (vectorizerType, includeStopWords)
                    )

                # "no" means stop words are NOT included, i.e. they are filtered out.
                vectorizer_kwargs = {"stop_words": stop_word_list} if includeStopWords == "no" else {}
                vectorizer = vectorizer_classes[vectorizerType](**vectorizer_kwargs)

                title = "Vectorizer Type: " + vectorizerType + "\tInclude Stopwords: " + includeStopWords + "\tData Type: " + data_type

                outFile.write("*** Start For " + title + " ***\n")

                model = make_pipeline(vectorizer, MultinomialNB())

                # Stage 1: default alpha, scored on the held-out validation split.
                print("Fitting data:", title)
                model.fit(X_train, y_train)
                accuracy_validation_nb = write_report(
                    outFile, "(Validation)", title, y_valid, model.predict(X_valid), "\n\n\n"
                )

                # Stage 2: default alpha, refit on all training data, scored on the test set.
                model.fit(data_text, data_label)
                accuracy_test_nb = write_report(
                    outFile, "(Test)", title, test_data_label, model.predict(test_data_text), "\n\n\n"
                )

                # Stage 3: tune alpha. The search runs over the whole pipeline so the
                # vectorizer is refit inside each CV fold; vectorizing the full training
                # set up front would leak held-out vocabulary/IDF statistics into the folds.
                print("\n*** GridSearchCV Starts ***\n")
                grid = GridSearchCV(
                    make_pipeline(vectorizer, MultinomialNB()),
                    param_grid={"multinomialnb__alpha": alpha_grid},
                    scoring="accuracy",
                )
                grid.fit(data_text, data_label)

                print("\t\tBest estimators:", grid.best_estimator_)
                print("\t\tBest params:", grid.best_params_)
                print("\t\tValidation Score (Fine-Tuning):", grid.best_score_)

                # With the default refit=True, best_estimator_ is already refit on
                # the full training data using the best alpha.
                print("\n*** Test Data Starts with Fine-Tune ***\n")
                model = grid.best_estimator_
                accuracy_test_nb_fine_tuned = write_report(
                    outFile, "Fine-Tuned (Test)", title, test_data_label, model.predict(test_data_text), "\n\n"
                )

                print("\n\n*** Accuracy Results ***")
                print("Accuracy Score for validation data:", accuracy_validation_nb)
                print("Accuracy Score for test data:", accuracy_test_nb)
                print("Accuracy Score for test data (fine-tuned):", accuracy_test_nb_fine_tuned, "\n\n\n")

                test_results[title] = accuracy_test_nb
                test_results[title+"(fineTuned)"] = accuracy_test_nb_fine_tuned

                outFile.write("\n\n*** Accuracy Results ***\n")
                outFile.write("Accuracy Score for validation data: " + str(accuracy_validation_nb) + "\n" )
                outFile.write("Accuracy Score for test data: " + str(accuracy_test_nb) + "\n" )
                outFile.write("Accuracy Score for test data (fine-tuned): " + str(accuracy_test_nb_fine_tuned) + "\n\n\n" )

                outFile.write("*** End For " + title + " ***\n\n\n")

        print("*** Final Results ***")
        outFile.write("*** Final Results ***\n")
        for key, value in test_results.items():
            print(key, " -->\t", value)
            outFile.write( key + " :\t" + str(value) + "\n" )


In [None]:
# Naive Bayes experiments on the raw texts; reports are tagged "normal".
loopAllForNB(data_text, data_label, test_data_text, test_data_label, "normal")

Fitting data: Vectorizer Type: TF	Include Stopwords: yes	Data Type: normal
Classification Report for Naive Bayesian (Validation) --> Vectorizer Type: TF	Include Stopwords: yes	Data Type: normal
              precision    recall  f1-score   support

       dunya       0.82      0.86      0.84       154
        spor       0.89      0.97      0.93       151
     turkiye       0.61      0.83      0.71       168
       video       0.93      0.24      0.38       156
    yazarlar       0.76      0.91      0.83       171

    accuracy                           0.76       800
   macro avg       0.80      0.76      0.74       800
weighted avg       0.80      0.76      0.73       800

Classification Report for Naive Bayesian (Test) --> Vectorizer Type: TF	Include Stopwords: yes	Data Type: normal
              precision    recall  f1-score   support

       dunya       0.80      0.85      0.82       395
        spor       0.89      0.95      0.92       384
     turkiye       0.58      0.77      0.

In [None]:
# Naive Bayes experiments on the preprocess() output; reports are tagged "preprocessed".
loopAllForNB(data_text_processed, data_label, test_data_text_processed, test_data_label, "preprocessed")

Fitting data: Vectorizer Type: TF	Include Stopwords: yes	Data Type: preprocessed
Classification Report for Naive Bayesian (Validation) --> Vectorizer Type: TF	Include Stopwords: yes	Data Type: preprocessed
              precision    recall  f1-score   support

       dunya       0.82      0.84      0.83       154
        spor       0.89      0.97      0.93       151
     turkiye       0.61      0.83      0.71       168
       video       0.88      0.24      0.38       156
    yazarlar       0.76      0.91      0.83       171

    accuracy                           0.76       800
   macro avg       0.79      0.76      0.74       800
weighted avg       0.79      0.76      0.74       800

Classification Report for Naive Bayesian (Test) --> Vectorizer Type: TF	Include Stopwords: yes	Data Type: preprocessed
              precision    recall  f1-score   support

       dunya       0.80      0.85      0.82       395
        spor       0.89      0.96      0.92       384
     turkiye       0.58

In [None]:
# *** Final Results ***
# Vectorizer Type: TF	Include Stopwords: yes	Data Type: normal  -->  0.742
# Vectorizer Type: TF	Include Stopwords: yes	Data Type: normal(fineTuned)  -->  0.781
# Vectorizer Type: TF-IDF	Include Stopwords: yes	Data Type: normal  -->  0.7055
# Vectorizer Type: TF-IDF	Include Stopwords: yes	Data Type: normal(fineTuned)  -->  0.7765
# Vectorizer Type: TF	Include Stopwords: no	Data Type: normal  -->  0.7465
# Vectorizer Type: TF	Include Stopwords: no	Data Type: normal(fineTuned)  -->  0.7795
# Vectorizer Type: TF-IDF	Include Stopwords: no	Data Type: normal  -->  0.717
# Vectorizer Type: TF-IDF	Include Stopwords: no	Data Type: normal(fineTuned)  -->  0.7755

In [None]:
# Experiment grid read by loopAllforLogres (same values as defined for the Naive Bayes run):
# the feature type and whether stop words are kept ("yes") or filtered out ("no").
vectorizerList = ["TF", "TF-IDF"]
includeStopWordsList = [ "yes", "no" ]

def loopAllforLogres(data_text, data_label, test_data_text, test_data_label, data_type):
    """Evaluate Logistic Regression for every vectorizer / stop-word setting.

    For each combination of ``includeStopWordsList`` x ``vectorizerList``:

    1. Fit on a 90% split of the training data and report on the held-out 10%.
    2. Refit on all training data and report on the test set.
    3. Tune ``C`` with ``GridSearchCV`` and report the tuned model on the test
       set.

    Every report is printed and also written to
    ``Logistic Regression_<data_type>.txt`` in the project Drive folder. A
    summary of test accuracies is emitted last.

    Args:
        data_text: Training documents (strings).
        data_label: Labels aligned with ``data_text``.
        test_data_text: Test documents (strings).
        test_data_label: Labels aligned with ``test_data_text``.
        data_type: Tag used in report titles and in the output file name.

    Raises:
        ValueError: If ``vectorizerList`` or ``includeStopWordsList`` contains
            an unsupported value.
    """
    vectorizer_classes = {"TF": CountVectorizer, "TF-IDF": TfidfVectorizer}
    c_grid = [0.001,0.01,0.1,1,10,100]
    iterLimit = 400

    # scikit-learn >= 1.2 validates `stop_words` and rejects a set, so hand the
    # vectorizers a list (older releases accept a list as well).
    stop_word_list = sorted(stopWords)

    # random_state is fixed, so the split is the same for every configuration:
    # compute it once instead of once per loop iteration.
    X_train, X_valid, y_train, y_valid = train_test_split(
        data_text, data_label, test_size=0.1, random_state=1
    )

    def write_report(out, stage, title, y_true, y_pred, trailer):
        """Print and write one classification report; return its accuracy."""
        heading = "Classification Report for Logistic Regression " + stage + " -->"
        report = classification_report(y_true, y_pred)
        print(heading, title)
        print(report)
        out.write(heading + " " + title + "\n")
        out.write(report)
        out.write(trailer)
        return accuracy_score(y_true, y_pred)

    test_results = {}

    # `with` closes the report file even if a fit or predict call raises.
    with open("/content/drive/MyDrive/CS 445 Project 3/Logistic Regression_" + data_type + ".txt", "w") as outFile:
        for includeStopWords in includeStopWordsList:
            for vectorizerType in vectorizerList:
                if includeStopWords not in ("yes", "no") or vectorizerType not in vectorizer_classes:
                    raise ValueError(
                        "Unsupported configuration: vectorizer=%r, include stopwords=%r"
                        % (vectorizerType, includeStopWords)
                    )

                # "no" means stop words are NOT included, i.e. they are filtered out.
                vectorizer_kwargs = {"stop_words": stop_word_list} if includeStopWords == "no" else {}
                vectorizer = vectorizer_classes[vectorizerType](**vectorizer_kwargs)

                title = "Vectorizer Type: " + vectorizerType + "\tInclude Stopwords: " + includeStopWords + "\tData Type: " + data_type

                outFile.write("*** Start For " + title + " ***\n")

                model = make_pipeline(vectorizer, LogisticRegression(max_iter=iterLimit, random_state=42))

                # Stage 1: default C, scored on the held-out validation split.
                print("Fitting data:", title)
                model.fit(X_train, y_train)
                accuracy_validation_logres = write_report(
                    outFile, "(Validation)", title, y_valid, model.predict(X_valid), "\n\n\n"
                )

                # Stage 2: default C, refit on all training data, scored on the test set.
                model.fit(data_text, data_label)
                accuracy_test_logres = write_report(
                    outFile, "(Test)", title, test_data_label, model.predict(test_data_text), "\n\n\n"
                )

                # Stage 3: tune C. The search runs over the whole pipeline so the
                # vectorizer is refit inside each CV fold; vectorizing the full training
                # set up front would leak held-out vocabulary/IDF statistics into the folds.
                print("\n*** GridSearchCV Starts ***\n")
                grid = GridSearchCV(
                    make_pipeline(vectorizer, LogisticRegression(max_iter=iterLimit, random_state=42)),
                    param_grid={"logisticregression__C": c_grid},
                    scoring="accuracy",
                )
                grid.fit(data_text, data_label)

                print("\t\tBest estimators:", grid.best_estimator_)
                print("\t\tBest params:", grid.best_params_)
                print("\t\tValidation Score (Fine-Tuning):", grid.best_score_)

                # With the default refit=True, best_estimator_ is already refit on
                # the full training data using the best C.
                print("\n*** Test Data Starts with Fine-Tune ***\n")
                model = grid.best_estimator_
                accuracy_test_logres_fine_tuned = write_report(
                    outFile, "Fine-Tuned (Test)", title, test_data_label, model.predict(test_data_text), "\n\n"
                )

                print("\n\n*** Accuracy Results ***")
                print("Accuracy Score for validation data:", accuracy_validation_logres)
                print("Accuracy Score for test data:", accuracy_test_logres)
                print("Accuracy Score for test data (fine-tuned):", accuracy_test_logres_fine_tuned, "\n\n\n")

                test_results[title] = accuracy_test_logres
                test_results[title+"(fineTuned)"] = accuracy_test_logres_fine_tuned

                outFile.write("\n\n*** Accuracy Results ***\n")
                outFile.write("Accuracy Score for validation data: " + str(accuracy_validation_logres) + "\n" )
                outFile.write("Accuracy Score for test data: " + str(accuracy_test_logres) + "\n" )
                outFile.write("Accuracy Score for test data (fine-tuned): " + str(accuracy_test_logres_fine_tuned) + "\n\n\n" )

                outFile.write("*** End For " + title + " ***\n\n\n")

        print("*** Final Results ***")
        outFile.write("*** Final Results ***\n")
        for key, value in test_results.items():
            print(key, " -->\t", value)
            outFile.write( key + " :\t" + str(value) + "\n" )


In [None]:
# Logistic Regression experiments on the raw texts; reports are tagged "normal".
loopAllforLogres(data_text, data_label, test_data_text, test_data_label, "normal")

Fitting data: Vectorizer Type: TF	Include Stopwords: yes	Data Type: normal
Classification Report for Logistic Regression (Validation) --> Vectorizer Type: TF	Include Stopwords: yes	Data Type: normal
              precision    recall  f1-score   support

       dunya       0.86      0.81      0.83       154
        spor       0.93      0.93      0.93       151
     turkiye       0.79      0.76      0.77       168
       video       0.77      0.86      0.81       156
    yazarlar       0.93      0.92      0.93       171

    accuracy                           0.86       800
   macro avg       0.86      0.86      0.86       800
weighted avg       0.86      0.86      0.86       800

Classification Report for Logistic Regression (Test) --> Vectorizer Type: TF	Include Stopwords: yes	Data Type: normal
              precision    recall  f1-score   support

       dunya       0.83      0.84      0.84       395
        spor       0.94      0.90      0.92       384
     turkiye       0.80      0.

In [None]:
# Logistic Regression experiments on the preprocess() output. The run tagged
# "preprocessed" must receive the processed texts; passing the raw texts merely
# repeats the "normal" run under the wrong label and file name.
loopAllforLogres(data_text_processed, data_label, test_data_text_processed, test_data_label, "preprocessed")

Fitting data: Vectorizer Type: TF	Include Stopwords: yes	Data Type: preprocessed
Classification Report for Logistic Regression (Validation) --> Vectorizer Type: TF	Include Stopwords: yes	Data Type: preprocessed
              precision    recall  f1-score   support

       dunya       0.86      0.81      0.83       154
        spor       0.93      0.93      0.93       151
     turkiye       0.79      0.76      0.77       168
       video       0.77      0.86      0.81       156
    yazarlar       0.93      0.92      0.93       171

    accuracy                           0.86       800
   macro avg       0.86      0.86      0.86       800
weighted avg       0.86      0.86      0.86       800

Classification Report for Logistic Regression (Test) --> Vectorizer Type: TF	Include Stopwords: yes	Data Type: preprocessed
              precision    recall  f1-score   support

       dunya       0.83      0.84      0.84       395
        spor       0.94      0.90      0.92       384
     turkiye 

# Naive Bayes Classifier with TF-IDF Vectorizer (No Stopword Removal)


In [None]:
# Naive Bayes on TF-IDF features with the vectorizer's default settings
# (no stop-word list is passed). Uses the X_train / X_valid split created earlier.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Fit on the 90% training split
model.fit(X_train, y_train)

print("\n*** Validation Data Starts ***\n")

# Predict the held-out validation split
predictions = model.predict(X_valid)

# Validation accuracy of the default-alpha model
accuracy_validation_nb = accuracy_score(y_valid, predictions)

print("Classification Report for Naive Bayesian (Validation):")
print(classification_report(y_valid, predictions))

# Refit the same pipeline on all training data, then score it on the test set
print("\n*** Test Data Starts ***\n")
model.fit(data_text, data_label)

predictions = model.predict(test_data_text)

accuracy_test_nb = accuracy_score(test_data_label, predictions)

print("Classification Report for Naive Bayesian (Test):")
print(classification_report(test_data_label, predictions))

# Accuracy with fine-tuning: cross-validated grid search over the smoothing parameter alpha.
# NOTE(review): the vectorizer is fit on all of data_text before the search, so every
# CV fold is built from vocabulary/IDF statistics that include its held-out documents.
print("\n*** GridSearchCV Starts ***\n")
vectorizer = TfidfVectorizer()
vec_x = vectorizer.fit_transform(data_text)

grid = GridSearchCV(MultinomialNB(), param_grid={'alpha': [1.0, 0.1, 0.01, 0.001, 0.0001]}, scoring="accuracy")
grid.fit(vec_x, data_label)

print("Best estimators:", grid.best_estimator_)
print("Best params:", grid.best_params_)
print("Validation Score (Fine-Tuning):", grid.best_score_)

# Rebuild the pipeline with the selected alpha, fit on all training data, score on the test set
best_alpha = grid.best_params_["alpha"]
print("\n*** Test Data Starts with Fine-Tune ***\n")
model = make_pipeline(TfidfVectorizer(), MultinomialNB(alpha=best_alpha))

model.fit(data_text, data_label)
predictions = model.predict(test_data_text)

accuracy_test_nb_fine_tuned = accuracy_score(test_data_label, predictions)

print("Classification Report for Naive Bayesian Fine-Tuned (Test):")
print(classification_report(test_data_label, predictions))


# Summary of the three accuracies computed above
print("\n\n*** Accuracy Results ***")
print("Accuracy Score for validation data:", accuracy_validation_nb)
print("Accuracy Score for test data:", accuracy_test_nb)
print("Accuracy Score for test data (fine-tuned):", accuracy_test_nb_fine_tuned)



*** Validation Data Starts ***

Classification Report for Naive Bayesian (Validation):
              precision    recall  f1-score   support

       dunya       0.82      0.74      0.78       154
        spor       0.89      0.97      0.93       151
     turkiye       0.65      0.74      0.69       168
       video       0.94      0.29      0.45       156
    yazarlar       0.62      0.94      0.75       171

    accuracy                           0.74       800
   macro avg       0.79      0.74      0.72       800
weighted avg       0.78      0.74      0.72       800


*** Test Data Starts ***

Classification Report for Naive Bayesian (Test):
              precision    recall  f1-score   support

       dunya       0.81      0.77      0.79       395
        spor       0.90      0.94      0.92       384
     turkiye       0.59      0.68      0.64       421
       video       0.94      0.23      0.37       408
    yazarlar       0.61      0.98      0.75       392

    accuracy         

# Naive Bayes Classifier with TF-IDF Vectorizer (With Stopword Removal)

In [None]:
# Naive Bayes on TF-IDF features with Turkish stop words filtered out.
# Every stage below (baseline, grid search, tuned model) uses the same
# stop-word setting; the grid search and tuned model previously dropped it and
# so evaluated a different configuration from the one this cell is about.

# scikit-learn >= 1.2 validates `stop_words` and rejects a set, so pass a list.
stop_word_list = sorted(stopWords)

# Build the model
model = make_pipeline(TfidfVectorizer(stop_words=stop_word_list), MultinomialNB())

# Fit on the 90% training split
model.fit(X_train, y_train)

print("\n*** Validation Data Starts ***\n")

# Predict the held-out validation split
predictions = model.predict(X_valid)

# Validation accuracy of the default-alpha model
accuracy_validation_nb = accuracy_score(y_valid, predictions)

print("Classification Report for Naive Bayesian (Validation):")
print(classification_report(y_valid, predictions))

# Refit the same pipeline on all training data, then score it on the test set
print("\n*** Test Data Starts ***\n")
model.fit(data_text, data_label)

predictions = model.predict(test_data_text)

accuracy_test_nb = accuracy_score(test_data_label, predictions)

print("Classification Report for Naive Bayesian (Test):")
print(classification_report(test_data_label, predictions))

# Accuracy with fine-tuning: cross-validated grid search over alpha.
# NOTE(review): the vectorizer is fit on all of data_text before the search, so every
# CV fold is built from vocabulary/IDF statistics that include its held-out documents.
print("\n*** GridSearchCV Starts ***\n")
vectorizer = TfidfVectorizer(stop_words=stop_word_list)
vec_x = vectorizer.fit_transform(data_text)

grid = GridSearchCV(MultinomialNB(), param_grid={'alpha': [1.0, 0.1, 0.01, 0.001, 0.0001]}, scoring="accuracy")
grid.fit(vec_x, data_label)

print("Best estimators:", grid.best_estimator_)
print("Best params:", grid.best_params_)
print("Validation Score (Fine-Tuning):", grid.best_score_)

# Rebuild the pipeline with the selected alpha, fit on all training data, score on the test set
best_alpha = grid.best_params_["alpha"]
print("\n*** Test Data Starts with Fine-Tune ***\n")
model = make_pipeline(TfidfVectorizer(stop_words=stop_word_list), MultinomialNB(alpha=best_alpha))

model.fit(data_text, data_label)
predictions = model.predict(test_data_text)

accuracy_test_nb_fine_tuned = accuracy_score(test_data_label, predictions)

print("Classification Report for Naive Bayesian Fine-Tuned (Test):")
print(classification_report(test_data_label, predictions))


# Summary of the three accuracies computed above
print("\n\n*** Accuracy Results ***")
print("Accuracy Score for validation data:", accuracy_validation_nb)
print("Accuracy Score for test data:", accuracy_test_nb)
print("Accuracy Score for test data (fine-tuned):", accuracy_test_nb_fine_tuned)



*** Validation Data Starts ***

Classification Report for Naive Bayesian (Validation):
              precision    recall  f1-score   support

       dunya       0.81      0.77      0.79       154
        spor       0.89      0.97      0.93       151
     turkiye       0.65      0.74      0.69       168
       video       0.96      0.29      0.44       156
    yazarlar       0.63      0.93      0.75       171

    accuracy                           0.74       800
   macro avg       0.79      0.74      0.72       800
weighted avg       0.78      0.74      0.72       800


*** Test Data Starts ***

Classification Report for Naive Bayesian (Test):
              precision    recall  f1-score   support

       dunya       0.80      0.78      0.79       395
        spor       0.90      0.95      0.92       384
     turkiye       0.60      0.69      0.64       421
       video       0.94      0.22      0.36       408
    yazarlar       0.61      0.97      0.75       392

    accuracy         

# Naive Bayes Classifier with TF Vectorizer (No Stopword Removal)

In [None]:
# Naive Bayes on raw term-frequency (CountVectorizer) features, no stop-word list.
# Every stage below uses CountVectorizer; the grid search and tuned model
# previously used TfidfVectorizer, so the "fine-tuned" score belonged to a
# different feature representation from the baseline it was compared against.

# Build the model
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Fit on the 90% training split
model.fit(X_train, y_train)

print("\n*** Validation Data Starts ***\n")

# Predict the held-out validation split
predictions = model.predict(X_valid)

# Validation accuracy of the default-alpha model
accuracy_validation_nb = accuracy_score(y_valid, predictions)

print("Classification Report for Naive Bayesian (Validation):")
print(classification_report(y_valid, predictions))

# Refit the same pipeline on all training data, then score it on the test set
print("\n*** Test Data Starts ***\n")
model.fit(data_text, data_label)

predictions = model.predict(test_data_text)

accuracy_test_nb = accuracy_score(test_data_label, predictions)

print("Classification Report for Naive Bayesian (Test):")
print(classification_report(test_data_label, predictions))

# Accuracy with fine-tuning: cross-validated grid search over alpha.
# NOTE(review): the vectorizer is fit on all of data_text before the search, so every
# CV fold is built from a vocabulary that includes its held-out documents.
print("\n*** GridSearchCV Starts ***\n")
vectorizer = CountVectorizer()
vec_x = vectorizer.fit_transform(data_text)

grid = GridSearchCV(MultinomialNB(), param_grid={'alpha': [1.0, 0.1, 0.01, 0.001, 0.0001]}, scoring="accuracy")
grid.fit(vec_x, data_label)

print("Best estimators:", grid.best_estimator_)
print("Best params:", grid.best_params_)
print("Validation Score (Fine-Tuning):", grid.best_score_)

# Rebuild the pipeline with the selected alpha, fit on all training data, score on the test set
best_alpha = grid.best_params_["alpha"]
print("\n*** Test Data Starts with Fine-Tune ***\n")
model = make_pipeline(CountVectorizer(), MultinomialNB(alpha=best_alpha))

model.fit(data_text, data_label)
predictions = model.predict(test_data_text)

accuracy_test_nb_fine_tuned = accuracy_score(test_data_label, predictions)

print("Classification Report for Naive Bayesian Fine-Tuned (Test):")
print(classification_report(test_data_label, predictions))


# Summary of the three accuracies computed above
print("\n\n*** Accuracy Results ***")
print("Accuracy Score for validation data:", accuracy_validation_nb)
print("Accuracy Score for test data:", accuracy_test_nb)
print("Accuracy Score for test data (fine-tuned):", accuracy_test_nb_fine_tuned)



*** Validation Data Starts ***

Classification Report for Naive Bayesian (Validation):
              precision    recall  f1-score   support

       dunya       0.82      0.86      0.84       154
        spor       0.89      0.97      0.93       151
     turkiye       0.61      0.83      0.71       168
       video       0.93      0.24      0.38       156
    yazarlar       0.76      0.91      0.83       171

    accuracy                           0.76       800
   macro avg       0.80      0.76      0.74       800
weighted avg       0.80      0.76      0.73       800


*** Test Data Starts ***

Classification Report for Naive Bayesian (Test):
              precision    recall  f1-score   support

       dunya       0.80      0.85      0.82       395
        spor       0.89      0.95      0.92       384
     turkiye       0.58      0.77      0.66       421
       video       0.92      0.20      0.32       408
    yazarlar       0.72      0.96      0.83       392

    accuracy         

# Naive Bayes Classifier with TF Vectorizer (With Stopword Removal)


In [None]:
# Naive Bayes on raw term-frequency (CountVectorizer) features with Turkish
# stop words filtered out. Every stage below uses that same vectorizer setup;
# the grid search and tuned model previously used TfidfVectorizer() without
# stop words, i.e. a different configuration from the baseline.

# scikit-learn >= 1.2 validates `stop_words` and rejects a set, so pass a list.
stop_word_list = sorted(stopWords)

# Build the model
model = make_pipeline(CountVectorizer(stop_words=stop_word_list), MultinomialNB())

# Fit on the 90% training split
model.fit(X_train, y_train)

print("\n*** Validation Data Starts ***\n")

# Predict the held-out validation split
predictions = model.predict(X_valid)

# Validation accuracy of the default-alpha model
accuracy_validation_nb = accuracy_score(y_valid, predictions)

print("Classification Report for Naive Bayesian (Validation):")
print(classification_report(y_valid, predictions))

# Refit the same pipeline on all training data, then score it on the test set
print("\n*** Test Data Starts ***\n")
model.fit(data_text, data_label)

predictions = model.predict(test_data_text)

accuracy_test_nb = accuracy_score(test_data_label, predictions)

print("Classification Report for Naive Bayesian (Test):")
print(classification_report(test_data_label, predictions))

# Accuracy with fine-tuning: cross-validated grid search over alpha.
# NOTE(review): the vectorizer is fit on all of data_text before the search, so every
# CV fold is built from a vocabulary that includes its held-out documents.
print("\n*** GridSearchCV Starts ***\n")
vectorizer = CountVectorizer(stop_words=stop_word_list)
vec_x = vectorizer.fit_transform(data_text)

grid = GridSearchCV(MultinomialNB(), param_grid={'alpha': [1.0, 0.1, 0.01, 0.001, 0.0001]}, scoring="accuracy")
grid.fit(vec_x, data_label)

print("Best estimators:", grid.best_estimator_)
print("Best params:", grid.best_params_)
print("Validation Score (Fine-Tuning):", grid.best_score_)

# Rebuild the pipeline with the selected alpha, fit on all training data, score on the test set
best_alpha = grid.best_params_["alpha"]
print("\n*** Test Data Starts with Fine-Tune ***\n")
model = make_pipeline(CountVectorizer(stop_words=stop_word_list), MultinomialNB(alpha=best_alpha))

model.fit(data_text, data_label)
predictions = model.predict(test_data_text)

accuracy_test_nb_fine_tuned = accuracy_score(test_data_label, predictions)

print("Classification Report for Naive Bayesian Fine-Tuned (Test):")
print(classification_report(test_data_label, predictions))


# Summary of the three accuracies computed above
print("\n\n*** Accuracy Results ***")
print("Accuracy Score for validation data:", accuracy_validation_nb)
print("Accuracy Score for test data:", accuracy_test_nb)
print("Accuracy Score for test data (fine-tuned):", accuracy_test_nb_fine_tuned)



*** Validation Data Starts ***

Classification Report for Naive Bayesian (Validation):
              precision    recall  f1-score   support

       dunya       0.83      0.84      0.84       154
        spor       0.90      0.97      0.93       151
     turkiye       0.61      0.84      0.71       168
       video       0.88      0.24      0.37       156
    yazarlar       0.76      0.91      0.83       171

    accuracy                           0.76       800
   macro avg       0.79      0.76      0.74       800
weighted avg       0.79      0.76      0.74       800


*** Test Data Starts ***

Classification Report for Naive Bayesian (Test):
              precision    recall  f1-score   support

       dunya       0.79      0.86      0.82       395
        spor       0.89      0.96      0.92       384
     turkiye       0.58      0.77      0.66       421
       video       0.91      0.21      0.34       408
    yazarlar       0.75      0.95      0.84       392

    accuracy         

# Logistic Regression Classifier: TF-IDF, without stop-word removal



In [None]:
# Pipeline: TF-IDF features (default settings, no stop-word list) -> logistic regression
model = make_pipeline(TfidfVectorizer(), LogisticRegression())

# ---- Validation: fit on X_train / y_train, evaluate on X_valid / y_valid ----
model.fit(X_train, y_train)
print("\n*** Validation Data Starts ***\n")
predictions = model.predict(X_valid)

print("Classification Report for Logistic Regression (Validation):")
print(classification_report(y_valid, predictions))
accuracy_validation_logres = accuracy_score(y_valid, predictions)

# ---- Test: refit the same pipeline on all of train.csv, evaluate on test.csv ----
print("\n*** Test Data Starts ***\n")
predictions = model.fit(data_text, data_label).predict(test_data_text)

print("Classification Report for Logistic Regression (Test):")
print(classification_report(test_data_label, predictions))
accuracy_test_logres = accuracy_score(test_data_label, predictions)

# ---- Disabled: accuracy with fine-tuning (GridSearchCV), kept for reference ----
# print("\n*** GridSearchCV Starts ***\n")
# vectorizer = TfidfVectorizer()
# vec_x = vectorizer.fit_transform(data_text)
#
# Candidate solvers: newton-cg, lbfgs, liblinear, sag, saga
# Candidate C values: 100, 10, 1.0, 0.1, 0.01
#
# parameters = {
#     "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
#     "C": [100, 10, 1.0, 0.1, 0.01],
# }
#
# grid = GridSearchCV(LogisticRegression(max_iter=200), param_grid=parameters, scoring="accuracy")
# grid.fit(vec_x, data_label)
#
# print("Best estimators:", grid.best_estimator_)
# print("Best params:", grid.best_params_)
# print("Validation Score (Fine-Tuning):", grid.best_score_)


*** Validation Data Starts ***

Classification Report for Logistic Regression (Validation):
              precision    recall  f1-score   support

       dunya       0.84      0.84      0.84       154
        spor       0.91      0.94      0.93       151
     turkiye       0.78      0.76      0.77       168
       video       0.80      0.78      0.79       156
    yazarlar       0.88      0.92      0.90       171

    accuracy                           0.84       800
   macro avg       0.84      0.85      0.84       800
weighted avg       0.84      0.84      0.84       800


*** Test Data Starts ***

Classification Report for Logistic Regression (Test):
              precision    recall  f1-score   support

       dunya       0.82      0.86      0.84       395
        spor       0.94      0.94      0.94       384
     turkiye       0.78      0.68      0.73       421
       video       0.77      0.78      0.78       408
    yazarlar       0.87      0.94      0.91       392

    accurac

# Logistic Regression Classifier: TF-IDF, with stop-word removal

In [None]:
# Build the model: TF-IDF features with the NLTK Turkish stop words filtered out,
# followed by logistic regression.
# `stop_words` is documented as 'english', a list, or None. `stopWords` is a set,
# which recent scikit-learn releases reject during parameter validation, so it is
# converted to a (sorted, hence reproducible) list here.
model = make_pipeline(TfidfVectorizer(stop_words=sorted(stopWords)), LogisticRegression(max_iter=200))

# Fit the Train Data
model.fit(X_train, y_train)

print("\n*** Validation Data Starts ***\n")

# Predict the Validation Data
predictions = model.predict(X_valid)

# Accuracy of the Logistic Regression for validation
accuracy_validation_logres = accuracy_score(y_valid, predictions)

print("Classification Report for Logistic Regression (Validation):")
print(classification_report(y_valid, predictions))

# Test evaluation: refit the same pipeline on all of train.csv, then score test.csv
print("\n*** Test Data Starts ***\n")
model.fit(data_text, data_label)

predictions = model.predict(test_data_text)

# Accuracy of the Logistic Regression for test
accuracy_test_logres = accuracy_score(test_data_label, predictions)

print("Classification Report for Logistic Regression (Test):")
print(classification_report(test_data_label, predictions))

# Disabled: accuracy with fine-tuning (GridSearchCV), kept for reference
# print("\n*** GridSearchCV Starts ***\n")
# vectorizer = TfidfVectorizer()
# vec_x = vectorizer.fit_transform(data_text)

# Candidate solvers: newton-cg, lbfgs, liblinear, sag, saga
# Candidate C values: 100, 10, 1.0, 0.1, 0.01

# parameters = {
#     "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
#     "C": [100, 10, 1.0, 0.1, 0.01],
# }

# grid = GridSearchCV(LogisticRegression(max_iter=200), param_grid=parameters, scoring="accuracy")
# grid.fit(vec_x, data_label)

# print("Best estimators:", grid.best_estimator_)
# print("Best params:", grid.best_params_)
# print("Validation Score (Fine-Tuning):", grid.best_score_)


*** Validation Data Starts ***

Classification Report for Logistic Regression (Validation):
              precision    recall  f1-score   support

       dunya       0.83      0.83      0.83       154
        spor       0.91      0.95      0.93       151
     turkiye       0.75      0.72      0.73       168
       video       0.76      0.74      0.75       156
    yazarlar       0.89      0.89      0.89       171

    accuracy                           0.83       800
   macro avg       0.82      0.83      0.83       800
weighted avg       0.82      0.83      0.83       800


*** Test Data Starts ***

Classification Report for Logistic Regression (Test):
              precision    recall  f1-score   support

       dunya       0.82      0.87      0.84       395
        spor       0.93      0.93      0.93       384
     turkiye       0.77      0.69      0.73       421
       video       0.78      0.76      0.77       408
    yazarlar       0.86      0.92      0.89       392

    accurac

# Logistic Regression Classifier: term counts (TF), without stop-word removal

In [None]:
# Pipeline: raw term counts (no stop-word list) -> logistic regression, up to 400 iterations
model = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=400))

# ---- Validation: fit on X_train / y_train, evaluate on X_valid / y_valid ----
model.fit(X_train, y_train)
print("\n*** Validation Data Starts ***\n")
predictions = model.predict(X_valid)

print("Classification Report for Logistic Regression (Validation):")
print(classification_report(y_valid, predictions))
accuracy_validation_logres = accuracy_score(y_valid, predictions)

# ---- Test: refit the same pipeline on all of train.csv, evaluate on test.csv ----
print("\n*** Test Data Starts ***\n")
predictions = model.fit(data_text, data_label).predict(test_data_text)

print("Classification Report for Logistic Regression (Test):")
print(classification_report(test_data_label, predictions))
accuracy_test_logres = accuracy_score(test_data_label, predictions)

# ---- Disabled: accuracy with fine-tuning (GridSearchCV), kept for reference ----
# print("\n*** GridSearchCV Starts ***\n")
# vectorizer = TfidfVectorizer()
# vec_x = vectorizer.fit_transform(data_text)
#
# Candidate solvers: newton-cg, lbfgs, liblinear, sag, saga
# Candidate C values: 100, 10, 1.0, 0.1, 0.01
#
# parameters = {
#     "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
#     "C": [100, 10, 1.0, 0.1, 0.01],
# }
#
# grid = GridSearchCV(LogisticRegression(max_iter=200), param_grid=parameters, scoring="accuracy")
# grid.fit(vec_x, data_label)
#
# print("Best estimators:", grid.best_estimator_)
# print("Best params:", grid.best_params_)
# print("Validation Score (Fine-Tuning):", grid.best_score_)


*** Validation Data Starts ***

Classification Report for Logistic Regression (Validation):
              precision    recall  f1-score   support

       dunya       0.86      0.81      0.83       154
        spor       0.93      0.93      0.93       151
     turkiye       0.79      0.76      0.77       168
       video       0.77      0.86      0.81       156
    yazarlar       0.93      0.92      0.93       171

    accuracy                           0.86       800
   macro avg       0.86      0.86      0.86       800
weighted avg       0.86      0.86      0.86       800


*** Test Data Starts ***

Classification Report for Logistic Regression (Test):
              precision    recall  f1-score   support

       dunya       0.83      0.84      0.84       395
        spor       0.94      0.90      0.92       384
     turkiye       0.80      0.69      0.74       421
       video       0.75      0.88      0.81       408
    yazarlar       0.91      0.92      0.91       392

    accurac

# Logistic Regression Classifier: term counts (TF), with stop-word removal

In [None]:
# Stop words as a sorted list: `stop_words` is documented as 'english', a list,
# or None, and recent scikit-learn releases reject a set such as `stopWords`.
stop_word_list = sorted(stopWords)

# Build the model: term counts with Turkish stop words removed -> logistic regression
model = make_pipeline(CountVectorizer(stop_words=stop_word_list), LogisticRegression(max_iter=1000))

# Fit the Train Data
model.fit(X_train, y_train)

print("\n*** Validation Data Starts ***\n")

# Predict the Validation Data
predictions = model.predict(X_valid)

# Accuracy of the Logistic Regression for validation
accuracy_validation_logres = accuracy_score(y_valid, predictions)

print("Classification Report for Logistic Regression (Validation):")
print(classification_report(y_valid, predictions))

# Test evaluation: refit the same pipeline on all of train.csv, then score test.csv
print("\n*** Test Data Starts ***\n")
model.fit(data_text, data_label)

predictions = model.predict(test_data_text)

# Accuracy of the Logistic Regression for test
accuracy_test_logres = accuracy_score(test_data_label, predictions)

print("Classification Report for Logistic Regression (Test):")
print(classification_report(test_data_label, predictions))

# Accuracy with fine-tuning (GridSearchCV)
print("\n*** GridSearchCV Starts ***\n")
# Use the same stop-word filtering as the model above so that the tuned C applies
# to the feature space this section actually evaluates.
vectorizer = CountVectorizer(stop_words=stop_word_list)
vec_x = vectorizer.fit_transform(data_text)

# Only the regularisation strength C is searched; the solver stays at its default.
parameters = {
    "C": [100, 10, 1.0, 0.1, 0.01],
}

grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid=parameters, scoring="accuracy")
grid.fit(vec_x, data_label)

print("Best estimators:", grid.best_estimator_)
print("Best params:", grid.best_params_)
print("Validation Score (Fine-Tuning):", grid.best_score_)


*** Validation Data Starts ***

Classification Report for Logistic Regression (Validation):
              precision    recall  f1-score   support

       dunya       0.85      0.79      0.82       154
        spor       0.94      0.95      0.95       151
     turkiye       0.78      0.74      0.76       168
       video       0.77      0.85      0.80       156
    yazarlar       0.91      0.91      0.91       171

    accuracy                           0.85       800
   macro avg       0.85      0.85      0.85       800
weighted avg       0.85      0.85      0.85       800


*** Test Data Starts ***

Classification Report for Logistic Regression (Test):
              precision    recall  f1-score   support

       dunya       0.83      0.84      0.84       395
        spor       0.94      0.92      0.93       384
     turkiye       0.82      0.68      0.75       421
       video       0.75      0.88      0.81       408
    yazarlar       0.92      0.92      0.92       392

    accurac

In [None]:
# Build the model: term counts with Turkish stop words removed -> logistic
# regression with C=0.1.
# NOTE(review): C=0.1 is presumably the value selected by the grid search in the
# previous cell; its printed result is not visible here — confirm.
# `stop_words` is documented as 'english', a list, or None; recent scikit-learn
# releases reject a set, so `stopWords` is converted to a sorted list.
model = make_pipeline(CountVectorizer(stop_words=sorted(stopWords)), LogisticRegression(max_iter=1000, C=0.1))

# Fit the Train Data
model.fit(X_train, y_train)

print("\n*** Validation Data Starts ***\n")

# Predict the Validation Data
predictions = model.predict(X_valid)

# Accuracy of the Logistic Regression for validation
accuracy_validation_logres = accuracy_score(y_valid, predictions)

print("Classification Report for Logistic Regression (Validation):")
print(classification_report(y_valid, predictions))

# Test evaluation: refit the same pipeline on all of train.csv, then score test.csv
print("\n*** Test Data Starts ***\n")
model.fit(data_text, data_label)

predictions = model.predict(test_data_text)

# Accuracy of the Logistic Regression for test
accuracy_test_logres = accuracy_score(test_data_label, predictions)

print("Classification Report for Logistic Regression (Test):")
print(classification_report(test_data_label, predictions))



*** Validation Data Starts ***

Classification Report for Logistic Regression (Validation):
              precision    recall  f1-score   support

       dunya       0.85      0.81      0.83       154
        spor       0.94      0.95      0.94       151
     turkiye       0.79      0.76      0.77       168
       video       0.76      0.84      0.80       156
    yazarlar       0.93      0.91      0.92       171

    accuracy                           0.85       800
   macro avg       0.85      0.85      0.85       800
weighted avg       0.85      0.85      0.85       800


*** Test Data Starts ***



# Logistic Regression

In [None]:
# Build the model: TF-IDF features feeding a logistic regression classifier,
# both with default settings. Nothing is fitted in this cell.
model = make_pipeline(TfidfVectorizer(), LogisticRegression())

In [None]:
# Fit the pipeline on X_train / y_train (defined outside this section).
# fit() is the cell's last expression, so the fitted pipeline's repr is displayed.
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercep

In [None]:
# Predict labels for X_test.
# NOTE(review): X_test / y_test are not defined in the visible part of the notebook.
# The report for this split shows support 800, the same size as the validation
# split used in earlier sections — confirm this is a held-out slice of train.csv
# and not test.csv.
predictions = model.predict(X_test)

In [None]:
# Accuracy on the X_test / y_test split; shown as the cell output, not stored.
accuracy_score(y_test, predictions)

0.845

In [None]:
# Per-class precision / recall / F1 for the X_test / y_test split.
print("Classification Report for Logistic Regression:")
print(classification_report(y_test, predictions))

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

       dunya       0.84      0.84      0.84       154
        spor       0.91      0.94      0.93       151
     turkiye       0.78      0.76      0.77       168
       video       0.80      0.78      0.79       156
    yazarlar       0.88      0.92      0.90       171

    accuracy                           0.84       800
   macro avg       0.84      0.85      0.84       800
weighted avg       0.84      0.84      0.84       800



In [None]:
# Predict labels for the test.csv texts. The model here is still the one fitted
# on X_train only; it is not refit on the full training data in this section.
predictions = model.predict(test_data_text)

In [None]:
# Accuracy on the test.csv data; shown as the cell output, not stored.
accuracy_score(test_data_label, predictions)

0.8325

In [None]:
# Per-class precision / recall / F1 on the test.csv data.
print("Classification Report for Logistic Regression:")
print(classification_report(test_data_label, predictions))

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

       dunya       0.82      0.86      0.84       395
        spor       0.94      0.93      0.94       384
     turkiye       0.77      0.67      0.72       421
       video       0.77      0.78      0.77       408
    yazarlar       0.86      0.94      0.90       392

    accuracy                           0.83      2000
   macro avg       0.83      0.84      0.83      2000
weighted avg       0.83      0.83      0.83      2000



# GridSearch for Logistic Regression

In [None]:
# Vectorise the training split once; the grid search runs on these TF-IDF features.
vectorizer = TfidfVectorizer()
vec_x = vectorizer.fit_transform(X_train)

# Search space: every listed solver crossed with C = 1e-3, 1e-2, ..., 1e3.
# (The library default solver is lbfgs.)
params = {
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
    "C": np.logspace(-3,3,7),
}

# The default max_iter=100 stopped several fits before convergence (the recorded
# output of this cell shows "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings), so
# the comparison included unconverged models. Allow more iterations so each
# solver / C combination is scored after it has converged.
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid=params, scoring="accuracy")
grid.fit(vec_x, y_train)
print(grid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)


In [None]:
# Estimator configured with the best parameter combination found by the search.
grid.best_estimator_

LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Parameter combination (solver, C) with the best cross-validated accuracy.
grid.best_params_

{'C': 10.0, 'solver': 'liblinear'}

In [None]:
# Cross-validated accuracy achieved by the best parameter combination.
grid.best_score_

0.8368055555555556

# Naive Bayes

In [None]:
# Build the model: TF-IDF features feeding a multinomial Naive Bayes classifier,
# both with default settings. Nothing is fitted in this cell.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [None]:
# Fit the pipeline on X_train / y_train (defined outside this section).
# fit() is the cell's last expression, so the fitted pipeline's repr is displayed.
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('multinomialnb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [None]:
# Predict labels for X_test.
# NOTE(review): X_test / y_test are not defined in the visible part of the notebook.
# The report for this split shows support 800 — confirm it is a held-out slice of
# train.csv and not test.csv.
predictions = model.predict(X_test)

In [None]:
# Accuracy on the X_test / y_test split; shown as the cell output, not stored.
accuracy_score(y_test, predictions)

0.7275

In [None]:
# Per-class precision / recall / F1 for the X_test / y_test split.
print("Classification Report for Naive Bayesian:")
print(classification_report(y_test, predictions))

Classification Report for Naive Bayesian:
              precision    recall  f1-score   support

       dunya       0.84      0.73      0.78       154
        spor       0.88      0.95      0.91       151
     turkiye       0.65      0.73      0.69       168
       video       0.95      0.27      0.42       156
    yazarlar       0.60      0.95      0.73       171

    accuracy                           0.73       800
   macro avg       0.78      0.72      0.71       800
weighted avg       0.78      0.73      0.71       800



In [None]:
# Predict labels for the test.csv texts. The model here is still the one fitted
# on X_train only; it is not refit on the full training data in this section.
predictions = model.predict(test_data_text)

In [None]:
# Accuracy on the test.csv data; shown as the cell output, not stored.
accuracy_score(test_data_label, predictions)

0.7035

In [None]:
# Per-class precision / recall / F1 on the test.csv data.
print("Classification Report for Naive Bayesian:")
print(classification_report(test_data_label, predictions))

Classification Report for Naive Bayesian:
              precision    recall  f1-score   support

       dunya       0.80      0.77      0.78       395
        spor       0.89      0.94      0.92       384
     turkiye       0.59      0.67      0.62       421
       video       0.95      0.19      0.32       408
    yazarlar       0.58      0.98      0.73       392

    accuracy                           0.70      2000
   macro avg       0.76      0.71      0.68      2000
weighted avg       0.76      0.70      0.67      2000



# GridSearch for Naive Bayes

In [None]:
# TF-IDF matrix of the training split, built once and reused by every fit of the search
vectorizer = TfidfVectorizer()
vec_x = vectorizer.fit_transform(X_train)

# Tune MultinomialNB's smoothing parameter alpha, selecting by accuracy
grid = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid={"alpha": [1.0, 0.1, 0.01, 0.001, 0.0001]},
    scoring="accuracy",
)
grid.fit(vec_x, y_train)
print(grid)

GridSearchCV(cv=None, error_score=nan,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1.0, 0.1, 0.01, 0.001, 0.0001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)


In [None]:
# Estimator configured with the best alpha found by the search.
grid.best_estimator_

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [None]:
# The alpha value with the best cross-validated accuracy.
grid.best_params_

{'alpha': 0.01}

In [None]:
# Cross-validated accuracy achieved by the best alpha.
grid.best_score_

0.7848611111111111

In [None]:
# Build the tuned model. Take alpha from the Naive Bayes grid search fitted in the
# preceding cells instead of hard-coding its result, so this cell stays in sync
# if the search space or the data split changes.
# Requires `grid` to be the MultinomialNB search (it must contain an "alpha" entry).
best_alpha = grid.best_params_["alpha"]
model = make_pipeline(TfidfVectorizer(), MultinomialNB(alpha=best_alpha))

# Fit on all of train.csv, then predict the test.csv texts
model.fit(data_text, data_label)
predictions = model.predict(test_data_text)

In [None]:
# Per-class precision / recall / F1 on test.csv for the tuned Naive Bayes
# pipeline fitted in the previous cell.
print("Classification Report for Naive Bayesian:")
print(classification_report(test_data_label, predictions))

Classification Report for Naive Bayesian:
              precision    recall  f1-score   support

       dunya       0.82      0.84      0.83       395
        spor       0.91      0.95      0.93       384
     turkiye       0.64      0.67      0.66       421
       video       0.76      0.47      0.58       408
    yazarlar       0.76      0.96      0.85       392

    accuracy                           0.78      2000
   macro avg       0.78      0.78      0.77      2000
weighted avg       0.78      0.78      0.77      2000

