In [None]:
!pip install zemberek-python          #gerekli yüklemeler
!pip install python-Levenshtein

In [None]:
import pandas as pd
import numpy as np
from snowballstemmer import TurkishStemmer
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('turkish'))
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from zemberek import TurkishSpellChecker, TurkishMorphology, TurkishTokenizer, TurkishSentenceNormalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import re

In [44]:
morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)
extractor = TurkishSentenceExtractor()

def preprocess_with_zemberek(text):
    stop_words = set(stopwords.words('turkish'))

    # Temizleme
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)        # Noktalama işaretlerini kaldır
    text = re.sub(r'\d+', '', text)            # Sayıları kaldır
    text = re.sub(r'\s+', ' ', text).strip()   # Birden fazla boşlukları tek boşlukla değiştir

    # Cümlelere ayırma
    sentences = extractor.from_paragraph(text)
    processed_words = []

    for sentence in sentences:
        # Normalizasyon
        normalized = normalizer.normalize(sentence)
        tokens = normalized.split()

        for word in tokens:
            # Stop words kontrolü
            if word not in stop_words:
                # Stemming
                analysis = morphology.analyze(word)
                if analysis.analysis_results:
                    word = str(analysis.analysis_results[0].item.lemma)
                processed_words.append(word)

    return ' '.join(processed_words)

INFO:zemberek.morphology.turkish_morphology:TurkishMorphology instance initialized in 30.078245639801025


2024-08-14 14:17:19,631 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 30.078245639801025



In [45]:
def train_and_predict(input_file_link, output_file_link):

    data = pd.read_csv(input_file_link, sep=';', encoding='cp1254', on_bad_lines='skip')          #türkçe encoding
    df = pd.DataFrame(data)

    pred_data = pd.read_csv(output_file_link, sep=';', encoding='utf-8', on_bad_lines='skip')          #türkçe encoding
    pred_df = pd.DataFrame(pred_data)

    df['Etiket'] = pd.factorize(df['Kategori'], sort=True)[0]             #kategorilere sayı ataması yapıyoruz
    etiket_to_kategori = dict(enumerate(df['Kategori'].unique()))
    df['islenmis_Yorum'] = df['Yorum'].apply(preprocess_with_zemberek)

    X = df['islenmis_Yorum']
    y = df['Etiket']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

    pipeSVC = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)) , ('clf', LinearSVC())])

    pipeSVC.fit(X_train, y_train)
    predictSVC = pipeSVC.predict(X_test)
    print("SVC : {:.2f}".format(accuracy_score(y_test, predictSVC)))

    pred_df['islenmis_Yorum'] = pred_df['Yorum'].apply(preprocess_with_zemberek)
    tahmin_etiketler = pipeSVC.predict(pred_df['islenmis_Yorum'])
    pred_df['Tahmin_Kategori'] = [etiket_to_kategori[etiket] for etiket in tahmin_etiketler]

    pred_df.to_csv('tahmin_sonucu.csv', index=False)

In [46]:
input_file = input("Lütfen eğitim CSV dosyasının yolunu girin: ")
output_file = input("Lütfen tahmin edilecek CSV dosyasının yolunu girin: ")

train_and_predict(input_file,output_file)



Lütfen eğitim CSV dosyasının yolunu girin: https://raw.githubusercontent.com/AykutErenSahin/Customer-complaints-classification/main/atm.csv
Lütfen tahmin edilecek CSV dosyasının yolunu girin: https://raw.githubusercontent.com/AykutErenSahin/Customer-complaints-classification/main/ATM_test.csv




SVC : 0.80
SVC : 0.45


In [58]:

https://raw.githubusercontent.com/AykutErenSahin/Customer-complaints-classification/main/atm.csv
https://raw.githubusercontent.com/AykutErenSahin/Customer-complaints-classification/main/ATM_test.csv