In [None]:
import time
import logging

from zemberek import (
    TurkishSpellChecker,
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor,
    TurkishMorphology,
    TurkishTokenizer
)

import pandas as pd
from zemberek import TurkishMorphology, TurkishSentenceNormalizer

import nltk
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import re
import os
import pandas as pd
from warnings import filterwarnings
filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import csv
from openpyxl import Workbook
from datetime import datetime
import seaborn as sns
import pandas as pd

In [None]:
# Load the review dataset from the Excel workbook and preview its first rows.
df = pd.read_excel(io="veriseti.xlsx")
df.head(n=5)

In [None]:
# Rows without review text cannot be cleaned or vectorised: show them, then drop them.
nan_indices = df['normalized_text'].isnull()
print(df[nan_indices])

# reset_index keeps the row labels contiguous (0..n-1) after the drop, so frames
# that are built positionally further down (e.g. the TF-IDF matrix) line up with
# `df` when they are concatenated on axis=1.
df = df.dropna(subset=['normalized_text']).reset_index(drop=True)

# Verify on the filtered frame itself (a mask computed before the drop no longer
# matches its index); this prints an empty frame when the drop worked.
print(df[df['normalized_text'].isnull()])

In [None]:
# Reduce the review text to lower-case letters separated by spaces.
#
# * Cast to str first, so the .str accessor never meets a non-string cell
#   (.str.lower() would turn such a cell into NaN).
# * Map the Turkish capitals explicitly before lower-casing: Python's
#   locale-agnostic lower() turns 'İ' into 'i' + U+0307 (combining dot) and 'I'
#   into 'i' instead of 'ı'. The combining dot is not in the allowed character
#   class below, so it would be replaced by a space and split "İyi" into "i yi".
# * Replacing every character outside the letter class with a space already
#   removes digits and punctuation, so a single regex pass is sufficient.
df['normalized_text'] = (
    df['normalized_text']
    .astype(str)
    .str.replace('İ', 'i', regex=False)
    .str.replace('I', 'ı', regex=False)
    .str.lower()
    .str.replace('[^a-zA-ZğüşıöçĞÜŞİÖÇ]', ' ', regex=True)
)

In [None]:
# Sanity check after cleaning: print (rows, columns), then preview the first 20 rows.
print(df.shape)
df.head(n=20)

In [None]:
# Build the Zemberek sentence normaliser on top of the default morphology,
# then run every cleaned review through it.
morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)
df['normalized_text'] = [normalizer.normalize(sentence) for sentence in df['normalized_text']]

In [None]:
# The corpus reader is imported under an alias so that binding the name
# `stopwords` to a plain set further down does not clobber it; that keeps this
# cell re-runnable.
from nltk.corpus import stopwords as nltk_stopwords

# The stop-word corpus must be available locally before it is read.
nltk.download('stopwords', quiet=True)

turkce_stopwords = set(nltk_stopwords.words('turkish'))


def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated words found in ``turkce_stopwords``."""
    return ' '.join(word for word in text.split() if word not in turkce_stopwords)


df['normalized_text'] = df['normalized_text'].apply(remove_stopwords)

# Combined English + Turkish stop-word set.
stopwords = set(nltk_stopwords.words('english')).union(turkce_stopwords)

# All reviews as one lower-cased, space-separated string
# (presumably the input for a WordCloud -- TODO confirm).
comment_words = ''.join(
    ' '.join(token.lower() for token in str(val).split()) + ' '
    for val in df.normalized_text
)

stopwords

In [None]:

from nltk import ngrams

def get_ngrams(text, n):
    """Return the word n-grams of ``text`` as a list of space-joined strings.

    Args:
        text: Sentence to tokenise with ``nltk.word_tokenize``.
        n: Number of consecutive tokens per n-gram.

    Returns:
        One string per n-gram, tokens separated by a single space.
    """
    tokens = nltk.word_tokenize(text)
    joined = []
    for gram in ngrams(tokens, n):
        joined.append(' '.join(gram))
    return joined

# Add one list-valued column per n-gram size (1 = unigrams, 2 = bigrams, 3 = trigrams).
for size, column in enumerate(('unigrams', 'bigrams', 'trigrams'), start=1):
    df[column] = df['normalized_text'].apply(get_ngrams, args=(size,))


def plot_ngrams(df, column, title):
    """Draw a bar chart of the 20 most frequent n-grams found in ``df[column]``.

    Args:
        df: Frame whose ``column`` holds, per row, an iterable of n-gram strings.
        column: Name of the column to count.
        title: Title placed on the chart.
    """
    # Tally every n-gram across all rows.
    counts = {}
    for row_ngrams in df[column]:
        for ngram in row_ngrams:
            counts[ngram] = counts.get(ngram, 0) + 1

    # Most frequent first.
    ranked = (
        pd.DataFrame(list(counts.items()), columns=['ngram', 'frequency'])
        .sort_values(by='frequency', ascending=False)
        .reset_index(drop=True)
    )

    plt.figure(figsize=(10, 6))
    axis = sns.barplot(x='ngram', y='frequency', data=ranked.head(20))
    axis.set(title=title)
    plt.xticks(rotation=45, ha='right')
    plt.show()

# One chart per n-gram column, in increasing n-gram size.
for column, chart_title in (
    ('unigrams', 'Top 20 unigrams'),
    ('bigrams', 'Top 20 bigrams'),
    ('trigrams', 'Top 20 trigrams'),
):
    plot_ngrams(df, column, chart_title)

In [None]:

# Fetch the NLTK stop-word corpus, then (re-)import its reader.
nltk.download('stopwords')

from nltk.corpus import stopwords

# Extra tokens to drop on top of NLTK's Turkish list.
additional_stopwords = ["bir", "i"]

turkce_stopwords = {*stopwords.words('turkish')}

def remove_stopwords(text, stopwords):
    """Drop stop words from ``text`` and expand the "f" / "p" shorthands.

    Args:
        text: Whitespace-separated sentence.
        stopwords: Container of words to remove (tested with ``in``).

    Returns:
        The surviving words joined by single spaces, where a surviving "f"
        becomes "fiyat" and a surviving "p" becomes "performans".
    """
    kept = []
    for word in text.split():
        if word in stopwords:
            continue
        if word == "f":
            kept.append("fiyat")
        elif word == "p":
            kept.append("performans")
        else:
            kept.append(word)
    return ' '.join(kept)

# Remove stop words again, this time including the additional ones.
# The combined set is built once and handed to every call through `args`,
# instead of being rebuilt for each row.
df['normalized_text'] = df['normalized_text'].apply(
    remove_stopwords, args=(turkce_stopwords.union(additional_stopwords),)
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise the cleaned reviews, keeping the 5000 highest-ranked terms.
tfidf_vect = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vect.fit_transform(df['normalized_text'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on releases that predate it.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()

# Reuse df's index: rows were dropped from df earlier, so its labels need not be
# 0..n-1, and concat(axis=1) aligns on labels rather than on position. A fresh
# RangeIndex here would pair ratings with the wrong rows and introduce NaNs.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names, index=df.index)

result_df = pd.concat([df['Rating'], tfidf_df], axis=1)

result_df

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, log_loss, matthews_corrcoef
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier

# Fixed seeds make the reported metrics reproducible from run to run.
models = [
    RandomForestClassifier(random_state=42),
    LGBMClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42)
]
model_names = [candidate.__class__.__name__ for candidate in models]

# The end of this cell appends one prediction column per model to result_df.
# Excluding those columns here keeps the cell re-runnable and stops earlier
# predictions from leaking into the feature matrix.
X = result_df.drop(columns=['Rating'] + model_names, errors='ignore')
y = result_df['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

predictions_df = pd.DataFrame({'Rating': y_test})
for model in models:
    model_name = model.__class__.__name__

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    predictions_df[model_name] = y_pred

    cm = confusion_matrix(y_test, y_pred)
    print(model_name, "Confusion Matrix:")
    print(cm)

    accuracy = accuracy_score(y_test, y_pred)
    print(model_name, "Total Accuracy:", accuracy)

    # Accuracy restricted to the test rows whose true label is the given class.
    for unique_class in model.classes_:
        class_mask = (y_test == unique_class).to_numpy()
        class_accuracy = accuracy_score(y_test[class_mask], y_pred[class_mask])
        print(model_name, "Accuracy_", unique_class, ":", class_accuracy)

    # Passing labels ties the probability columns to the classes the model was
    # fitted on, even if some class is absent from the test split.
    y_pred_proba = model.predict_proba(X_test)
    logloss = log_loss(y_test, y_pred_proba, labels=model.classes_)
    print(model_name, "Log Loss:", logloss)

    mcc = matthews_corrcoef(y_test, y_pred)
    print(model_name, "MCC:", mcc)

    print("------------------------------")

# Attach the predictions (NaN for training rows) without adding a second
# 'Rating' column, replacing any prediction columns left from a previous run.
result_df = pd.concat(
    [
        result_df.drop(columns=model_names, errors='ignore'),
        predictions_df.drop(columns='Rating'),
    ],
    axis=1,
)



In [None]:

import gradio as gr

def predict_rating(yorum):
    """Predict the rating of a single review and render it as stars.

    The comment is cleaned, normalised with Zemberek and stripped of stop
    words before it is vectorised, mirroring the steps applied to the
    ``normalized_text`` column earlier in the notebook. A raw comment would
    otherwise produce TF-IDF features that differ from what the classifier
    was fitted on.

    Relies on notebook globals: ``normalizer``, ``remove_stopwords`` (the
    two-argument version), ``turkce_stopwords``, ``additional_stopwords``,
    ``tfidf_vect`` and ``model``. ``model`` is the loop variable left behind
    by the training cell, i.e. the last classifier in ``models``.

    Args:
        yorum: Review text entered by the user.

    Returns:
        One star per rating point for predictions 1-5; for any other
        predicted label, the label converted to a string.
    """
    # Lower-case with the Turkish capitals mapped explicitly (plain lower()
    # leaves a combining dot behind for 'İ'), then turn every non-letter
    # into a space.
    text = str(yorum).replace('İ', 'i').replace('I', 'ı').lower()
    text = re.sub('[^a-zA-ZğüşıöçĞÜŞİÖÇ]', ' ', text)

    # Nothing to normalise when no letters are left.
    if text.strip():
        text = normalizer.normalize(text)
        text = remove_stopwords(text, turkce_stopwords.union(additional_stopwords))

    # Vectorise the comment with the fitted TF-IDF vocabulary.
    yorum_tfidf = tfidf_vect.transform([text])

    # Predict with the fitted classifier.
    y_pred = model.predict(yorum_tfidf)[0]

    # Show the predicted class as a row of stars.
    ratings = {
        1: "⭐️",
        2: "⭐️⭐️",
        3: "⭐️⭐️⭐️",
        4: "⭐️⭐️⭐️⭐️",
        5: "⭐️⭐️⭐️⭐️⭐️"
    }

    # Fall back to the raw label instead of raising KeyError on an unexpected class.
    return ratings.get(y_pred, str(y_pred))

# Build the Gradio interface: one text box in, one text box out.
# The gr.inputs / gr.outputs namespaces were deprecated in Gradio 3 and removed
# in Gradio 4; the top-level component classes replace them.
inputs = gr.Textbox(label="Yorum")
outputs = gr.Textbox(label="Tahmin Edilen Puan")
interface = gr.Interface(fn=predict_rating, inputs=inputs, outputs=outputs)

# Start the interface.
interface.launch()