In [8]:
# Core Data Manipulation and ML
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
from nltk.corpus import stopwords
import nltk

In [None]:
# Load the preprocessed review corpus from disk and echo it as the cell output
# (pandas abbreviates the middle rows of a long frame in its repr).
df = pd.read_csv(filepath_or_buffer="data/preprocessed_reviews.csv")
df

Unnamed: 0,Comment,Sentiment,comment_english,copy_comment,STREM+FPS3,STREM+FPS5,STREM+FPS7,STREM+ZEM
0,evet anlatıldığı gibi,1,evet anlatildigi,evet anlatıldığı,eve anl,evet anlat,evet anlatil,evet anla
1,Daha öncede almıştım bu cihazdan ense ve sakal...,1,oncede almistim cihazdan ense sakal tuketmek i...,öncede almıştım cihazdan ense sakal tüketmek n...,onc alm cih ens sak tuk ici num sif yak ali,onced almis cihaz ense sakal tuket icin numar ...,oncede almisti cihazda ense sakal tuketme icin...,önce al cihaz ense sakal tüket numara sıfır ya...
2,Ürün gayet başarılı sakal kesmede başlık sayıs...,1,urun basarili sakal kesmede baslik sayisi fazl...,ürün başarılı sakal kesmede başlık sayısı fazl...,uru bas sak kes bas say faz ola 0 yak ali kir ...,urun basar sakal kesme basli sayis fazla olabi...,urun basaril sakal kesmede baslik sayisi fazla...,ürün başarı sakal kes başlık sayı fazla olabil...
3,Daha öncede aynısını almıştım çok güzel ve kal...,1,oncede aynisini almistim cok guzel kaliteli ur...,öncede aynısını almıştım güzel kaliteli ürün .,onc ayn alm cok guz kal uru,onced aynis almis cok guzel kalit urun,oncede aynisin almisti cok guzel kalitel urun,önce aynı al güzel kalite ürün
4,Erkek kuaförüyüm ense ve sıfır sakal traşı içi...,1,erkek kuaforuyum ense sifir sakal trasi icin u...,erkek kuaförüyüm ense sıfır sakal traşı uygun ...,erk kua ens sif sak tra ici uyg uru,erkek kuafo ense sifir sakal trasi icin uygun ...,erkek kuaforu ense sifir sakal trasi icin uygu...,erkek kuaför ense sıfır sakal traş uygun ürün
...,...,...,...,...,...,...,...,...
15165,ışık seviyesi rezalet,0,isik seviyesi rezalet,ışık seviyesi rezalet,isi sev rez,isik seviy rezal,isik seviyes rezalet,ışık seviye rezalet
15166,Hic begenmedim. Aydinlatma hic yok ve her kapa...,0,hic begenmedim . aydinlatma hic kapattigimda a...,hic begenmedim . aydinlatma hic kapattigimda a...,hic beg ayd hic kap aya sif,hic begen aydin hic kapat ayari sifir,hic begenme aydinla hic kapatti ayari sifirla,hic begenmedim aydinlatma hic kapattigimda aya...
15167,2 gün sonra hoparlörü bozuldu kullanışsız,0,2 gun hoparloru bozuldu kullanissiz,2 gün hoparlörü bozuldu kullanışsız,2 gun hop boz kul,2 gun hopar bozul kulla,2 gun hoparlo bozuldu kullani,2 gün hoparlör boz kullan
15168,aşırı boğuk bir sesi ve rengi var kumanda heme...,0,asiri boguk sesi rengi kumanda hemen bozuldu .,aşırı boğuk sesi rengi kumanda hemen bozuldu .,asi bog ses ren kum hem boz,asiri boguk sesi rengi kuman hemen bozul,asiri boguk sesi rengi kumanda hemen bozuldu,aşırı boğuk ses renk kumanda hemen boz


# Decision Tree with our Preprocessing

In [5]:
# Convert the cleaned review text into a sparse TF-IDF matrix built from
# unigrams and bigrams:
#   min_df=3           -> ignore terms that occur in fewer than 3 reviews
#   max_df=0.9         -> ignore terms that occur in more than 90% of reviews
#   max_features=20000 -> upper bound on the vocabulary size
# NOTE(review): the vectorizer is fitted on every row of df before the
# train/validation/test split, so the vocabulary and IDF weights are also
# learned from the held-out reviews.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=20000, min_df=3, max_df=0.9)

# Target labels and feature matrix used by the following cells.
y = df["Sentiment"]
X = tfidf.fit_transform(df["comment_english"])

In [6]:
# Two-stage stratified split: 60% train / 20% validation / 20% test.
# Stage 1: hold out 20% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Stage 2: 25% of the remaining 80% (i.e. 20% of all rows) becomes validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42
)
# Echo the training matrix so its shape and sparsity show up as the cell output.
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 108595 stored elements and shape (9102, 10948)>

In [9]:
# Compare the 'gini' and 'entropy' split criteria for a depth-limited decision
# tree trained on the TF-IDF features of df["comment_english"].
#
# Relies on X_train, y_train, X_test and y_test from the split cell above.
# NOTE(review): the comparison is scored on the test split while X_val / y_val
# are never used; consider selecting the criterion on the validation split.
criteria = ['gini', 'entropy']

# Class id -> display name. Assumes df["Sentiment"] is encoded as 0/1/2 (as the
# report names below imply); the check that follows enforces that assumption.
class_names = {0: 'Negative', 1: 'Positive', 2: 'Neutral'}
class_ids = list(class_names)
tick_labels = list(class_names.values())
target_names = ['Negative (0)', 'Positive (1)', 'Neutral (2)']

# Fail loudly instead of letting sklearn silently drop or mislabel a class.
unexpected_labels = set(y_test.unique()) - set(class_ids)
if unexpected_labels:
    raise ValueError(
        f"y_test contains labels outside {class_ids}: {sorted(unexpected_labels)}"
    )

print("\n--- Comparing Decision Tree Criteria: Gini vs. Entropy ---")

for criterion in criteria:
    print("\n=======================================================")
    print(f"       MODEL TRAINING WITH CRITERION: '{criterion}'")
    print("=======================================================")

    # Same depth cap and seed for both criteria so only the criterion differs.
    dt_classifier = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=10,
        random_state=42
    )
    dt_classifier.fit(X_train, y_train)
    y_pred = dt_classifier.predict(X_test)

    # --- Model Evaluation ---
    # `labels=class_ids` pins the row order of the report to `target_names`;
    # zero_division=0 avoids warnings when a class is never predicted.
    print("\nClassification Report:\n")
    print(classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=target_names,
        zero_division=0
    ))

    # `labels=class_ids` guarantees a 3x3 matrix whose rows/columns line up
    # with `tick_labels`, even if a class is absent from y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    print("\nConfusion Matrix:\n")
    print(cm)

    # Save the confusion matrix as a heatmap; the figure is closed right away
    # so figures do not accumulate across loop iterations.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        ax=ax
    )
    ax.set_title(f'Decision Tree Confusion Matrix (Criterion: {criterion})')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    fig.savefig(f'dt_confusion_matrix_{criterion}.png')
    plt.close(fig)


--- Comparing Decision Tree Criteria: Gini vs. Entropy ---

       MODEL TRAINING WITH CRITERION: 'gini'

Classification Report:

              precision    recall  f1-score   support

Negative (0)       0.63      0.90      0.74      1395
Positive (1)       0.79      0.56      0.66      1360
 Neutral (2)       0.63      0.20      0.31       279

    accuracy                           0.68      3034
   macro avg       0.68      0.55      0.57      3034
weighted avg       0.70      0.68      0.66      3034


Confusion Matrix:

[[1249  123   23]
 [ 583  766   11]
 [ 140   82   57]]

       MODEL TRAINING WITH CRITERION: 'entropy'

Classification Report:

              precision    recall  f1-score   support

Negative (0)       0.92      0.45      0.60      1395
Positive (1)       0.58      0.98      0.73      1360
 Neutral (2)       0.68      0.16      0.27       279

    accuracy                           0.66      3034
   macro avg       0.72      0.53      0.53      3034
weighted avg 

In [11]:
# Compare the 'gini' and 'entropy' split criteria again, this time with
# class_weight='balanced', on the TF-IDF features of df["comment_english"].
#
# Relies on X_train, y_train, X_test and y_test from the split cell above and
# on the names imported in the notebook's first cell (DecisionTreeClassifier,
# classification_report, confusion_matrix, plt, sns).
# NOTE(review): the comparison is scored on the test split while X_val / y_val
# are never used; consider selecting the criterion on the validation split.
criteria = ['gini', 'entropy']

# Class id -> display name. Assumes df["Sentiment"] is encoded as 0/1/2 (as the
# report names below imply); the check that follows enforces that assumption.
class_names = {0: 'Negative', 1: 'Positive', 2: 'Neutral'}
class_ids = list(class_names)
tick_labels = list(class_names.values())
target_names = ['Negative (0)', 'Positive (1)', 'Neutral (2)']

# Fail loudly instead of letting sklearn silently drop or mislabel a class.
unexpected_labels = set(y_test.unique()) - set(class_ids)
if unexpected_labels:
    raise ValueError(
        f"y_test contains labels outside {class_ids}: {sorted(unexpected_labels)}"
    )

print("--- Comparing Decision Tree Criteria with CLASS WEIGHTING ---")

for criterion in criteria:
    print("\n=======================================================")
    print(f" MODEL TRAINING: Criterion='{criterion}', Weight='balanced'")
    print("=======================================================")

    # 1. Initialize the model; class_weight='balanced' is applied to both
    #    criteria, with the same depth cap and seed.
    dt_weighted = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=10,
        random_state=42,
        class_weight='balanced'
    )

    # 2. Train the model
    dt_weighted.fit(X_train, y_train)

    # 3. Make predictions
    y_pred_weighted = dt_weighted.predict(X_test)

    # 4. Model evaluation
    # `labels=class_ids` pins the row order of the report to `target_names`.
    print("\nClassification Report:\n")
    print(classification_report(
        y_test,
        y_pred_weighted,
        labels=class_ids,
        target_names=target_names,
        zero_division=0
    ))

    # `labels=class_ids` guarantees a 3x3 matrix whose rows/columns line up
    # with `tick_labels`, even if a class is absent from y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred_weighted, labels=class_ids)
    print("\nConfusion Matrix:\n")
    print(cm)

    # Save the confusion matrix as a heatmap; the figure is closed right away
    # so figures do not accumulate across loop iterations.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        ax=ax
    )
    ax.set_title(f'Decision Tree CM (Criterion: {criterion}, Weighted)')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')

    plot_filename = f'dt_weighted_confusion_matrix_{criterion}.png'
    fig.savefig(plot_filename)
    plt.close(fig)

print("\nWeighted comparison complete. Review the reports to see which criterion best balanced overall accuracy and positive class recall.")

--- Comparing Decision Tree Criteria with CLASS WEIGHTING ---

 MODEL TRAINING: Criterion='gini', Weight='balanced'

Classification Report:

              precision    recall  f1-score   support

Negative (0)       0.63      0.87      0.73      1395
Positive (1)       0.87      0.41      0.56      1360
 Neutral (2)       0.28      0.47      0.35       279

    accuracy                           0.63      3034
   macro avg       0.60      0.58      0.55      3034
weighted avg       0.71      0.63      0.62      3034


Confusion Matrix:

[[1213   52  130]
 [ 591  563  206]
 [ 119   29  131]]

 MODEL TRAINING: Criterion='entropy', Weight='balanced'

Classification Report:

              precision    recall  f1-score   support

Negative (0)       0.91      0.46      0.61      1395
Positive (1)       0.60      0.83      0.70      1360
 Neutral (2)       0.27      0.42      0.33       279

    accuracy                           0.62      3034
   macro avg       0.59      0.57      0.55      

# Decision Tree with our Zemberek NLP algorithm

In [13]:
# Keep only the reviews that have a value in the 'STREM+ZEM' column.
# NOTE: this rebinds `df`, so any earlier cell re-run after this point
# operates on the filtered frame.
df = df[df['STREM+ZEM'].notna()]

In [14]:
# Sanity check: number of missing 'STREM+ZEM' values left after the filter.
df["STREM+ZEM"].isnull().sum()

np.int64(0)

In [15]:
# We are doing TF-IDF Vectorization with unigrams and bigrams to convert text data into numerical format
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
    min_df=3,
    max_df=0.9
)

X = tfidf.fit_transform(df["STREM+ZEM"])
y = df["Sentiment"]

In [16]:
# Two-stage stratified split: 60% train / 20% validation / 20% test.
# Stage 1: hold out 20% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Stage 2: 25% of the remaining 80% (i.e. 20% of all rows) becomes validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42
)
# Echo the training matrix so its shape and sparsity show up as the cell output.
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 104739 stored elements and shape (9090, 9390)>

In [17]:
# Compare the 'gini' and 'entropy' split criteria for a depth-limited decision
# tree trained on the TF-IDF features of df["STREM+ZEM"].
#
# Relies on X_train, y_train, X_test and y_test from the split cell above.
# NOTE(review): the comparison is scored on the test split while X_val / y_val
# are never used; consider selecting the criterion on the validation split.
criteria = ['gini', 'entropy']

# Class id -> display name. Assumes df["Sentiment"] is encoded as 0/1/2 (as the
# report names below imply); the check that follows enforces that assumption.
class_names = {0: 'Negative', 1: 'Positive', 2: 'Neutral'}
class_ids = list(class_names)
tick_labels = list(class_names.values())
target_names = ['Negative (0)', 'Positive (1)', 'Neutral (2)']

# Fail loudly instead of letting sklearn silently drop or mislabel a class.
unexpected_labels = set(y_test.unique()) - set(class_ids)
if unexpected_labels:
    raise ValueError(
        f"y_test contains labels outside {class_ids}: {sorted(unexpected_labels)}"
    )

print("\n--- Comparing Decision Tree Criteria: Gini vs. Entropy ---")

for criterion in criteria:
    print("\n=======================================================")
    print(f"       MODEL TRAINING WITH CRITERION: '{criterion}'")
    print("=======================================================")

    # Same depth cap and seed for both criteria so only the criterion differs.
    dt_classifier = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=10,
        random_state=42
    )
    dt_classifier.fit(X_train, y_train)
    y_pred = dt_classifier.predict(X_test)

    # --- Model Evaluation ---
    # `labels=class_ids` pins the row order of the report to `target_names`;
    # zero_division=0 avoids warnings when a class is never predicted.
    print("\nClassification Report:\n")
    print(classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=target_names,
        zero_division=0
    ))

    # `labels=class_ids` guarantees a 3x3 matrix whose rows/columns line up
    # with `tick_labels`, even if a class is absent from y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    print("\nConfusion Matrix:\n")
    print(cm)

    # Save the confusion matrix as a heatmap; the figure is closed right away
    # so figures do not accumulate across loop iterations.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        ax=ax
    )
    ax.set_title(f'Decision Tree Confusion Matrix (Criterion: {criterion})')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    # Distinct file name: the comment_english experiment already writes
    # dt_confusion_matrix_{criterion}.png, which this cell used to overwrite.
    fig.savefig(f'dt_zemberek_confusion_matrix_{criterion}.png')
    plt.close(fig)


--- Comparing Decision Tree Criteria: Gini vs. Entropy ---

       MODEL TRAINING WITH CRITERION: 'gini'

Classification Report:

              precision    recall  f1-score   support

Negative (0)       0.62      0.93      0.74      1395
Positive (1)       0.79      0.52      0.63      1357
 Neutral (2)       0.31      0.05      0.09       279

    accuracy                           0.66      3031
   macro avg       0.57      0.50      0.48      3031
weighted avg       0.67      0.66      0.63      3031


Confusion Matrix:

[[1298   86   11]
 [ 636  701   20]
 [ 169   96   14]]

       MODEL TRAINING WITH CRITERION: 'entropy'

Classification Report:

              precision    recall  f1-score   support

Negative (0)       0.60      0.93      0.73      1395
Positive (1)       0.80      0.47      0.59      1357
 Neutral (2)       0.56      0.14      0.23       279

    accuracy                           0.65      3031
   macro avg       0.65      0.51      0.52      3031
weighted avg 

In [18]:
# Compare the 'gini' and 'entropy' split criteria again, this time with
# class_weight='balanced', on the TF-IDF features of df["STREM+ZEM"].
#
# Relies on X_train, y_train, X_test and y_test from the split cell above and
# on the names imported in the notebook's first cell (DecisionTreeClassifier,
# classification_report, confusion_matrix, plt, sns).
# NOTE(review): the comparison is scored on the test split while X_val / y_val
# are never used; consider selecting the criterion on the validation split.
criteria = ['gini', 'entropy']

# Class id -> display name. Assumes df["Sentiment"] is encoded as 0/1/2 (as the
# report names below imply); the check that follows enforces that assumption.
class_names = {0: 'Negative', 1: 'Positive', 2: 'Neutral'}
class_ids = list(class_names)
tick_labels = list(class_names.values())
target_names = ['Negative (0)', 'Positive (1)', 'Neutral (2)']

# Fail loudly instead of letting sklearn silently drop or mislabel a class.
unexpected_labels = set(y_test.unique()) - set(class_ids)
if unexpected_labels:
    raise ValueError(
        f"y_test contains labels outside {class_ids}: {sorted(unexpected_labels)}"
    )

print("--- Comparing Decision Tree Criteria with CLASS WEIGHTING ---")

for criterion in criteria:
    print("\n=======================================================")
    print(f" MODEL TRAINING: Criterion='{criterion}', Weight='balanced'")
    print("=======================================================")

    # 1. Initialize the model; class_weight='balanced' is applied to both
    #    criteria, with the same depth cap and seed.
    dt_weighted = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=10,
        random_state=42,
        class_weight='balanced'
    )

    # 2. Train the model
    dt_weighted.fit(X_train, y_train)

    # 3. Make predictions
    y_pred_weighted = dt_weighted.predict(X_test)

    # 4. Model evaluation
    # `labels=class_ids` pins the row order of the report to `target_names`.
    print("\nClassification Report:\n")
    print(classification_report(
        y_test,
        y_pred_weighted,
        labels=class_ids,
        target_names=target_names,
        zero_division=0
    ))

    # `labels=class_ids` guarantees a 3x3 matrix whose rows/columns line up
    # with `tick_labels`, even if a class is absent from y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred_weighted, labels=class_ids)
    print("\nConfusion Matrix:\n")
    print(cm)

    # Save the confusion matrix as a heatmap; the figure is closed right away
    # so figures do not accumulate across loop iterations.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        ax=ax
    )
    ax.set_title(f'Decision Tree CM (Criterion: {criterion}, Weighted)')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')

    # Distinct file name: the comment_english experiment already writes
    # dt_weighted_confusion_matrix_{criterion}.png, which this cell used to
    # overwrite.
    plot_filename = f'dt_zemberek_weighted_confusion_matrix_{criterion}.png'
    fig.savefig(plot_filename)
    plt.close(fig)

print("\nWeighted comparison complete. Review the reports to see which criterion best balanced overall accuracy and positive class recall.")

--- Comparing Decision Tree Criteria with CLASS WEIGHTING ---

 MODEL TRAINING: Criterion='gini', Weight='balanced'

Classification Report:

              precision    recall  f1-score   support

Negative (0)       0.61      0.89      0.73      1395
Positive (1)       0.89      0.32      0.47      1357
 Neutral (2)       0.25      0.47      0.33       279

    accuracy                           0.60      3031
   macro avg       0.59      0.56      0.51      3031
weighted avg       0.70      0.60      0.57      3031


Confusion Matrix:

[[1244   24  127]
 [ 666  432  259]
 [ 120   28  131]]

 MODEL TRAINING: Criterion='entropy', Weight='balanced'

Classification Report:

              precision    recall  f1-score   support

Negative (0)       0.58      0.94      0.72      1395
Positive (1)       0.90      0.23      0.37      1357
 Neutral (2)       0.29      0.43      0.35       279

    accuracy                           0.58      3031
   macro avg       0.59      0.54      0.48      