In [None]:
!pip install zemberek-python          #gerekli yüklemeler
!pip install python-Levenshtein

Collecting zemberek-python
  Downloading zemberek_python-0.2.3-py3-none-any.whl.metadata (2.7 kB)
Collecting antlr4-python3-runtime==4.8 (from zemberek-python)
  Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.4/112.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading zemberek_python-0.2.3-py3-none-any.whl (95.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-py3-none-any.whl size=141213 sha256=b59c0f93dc7df3f0b960c274391dcd11bbe14b2a7dd1514ad76f8b2e2b4627ae
  Stored in directory: /root/.cache/pip/wheels/a7/20/bd/e1477d664f22d99989fd28ee1a43d6633dddb5cb9

In [None]:
import pandas as pd
import numpy as np
from snowballstemmer import TurkishStemmer
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from zemberek import TurkishSpellChecker, TurkishMorphology, TurkishTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
url = "https://raw.githubusercontent.com/AykutErenSahin/Customer-complaints-classification/main/atm.csv"
data = pd.read_csv(url, sep=';', encoding='cp1254', on_bad_lines='skip')          #türkçe encoding
df = pd.DataFrame(data)

df['Etiket'] = pd.factorize(df['Kategori'], sort=True)[0]      #kategorilere sayı ataması yapıyoruz

In [None]:
from zemberek import (
    TurkishMorphology,
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor
)
import re

morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)
extractor = TurkishSentenceExtractor()

def preprocess_with_zemberek(text):
    # cleaning
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # tokenize and normalize
    sentences = extractor.from_paragraph(text)
    processed_words = []

    for sentence in sentences:
        normalized = normalizer.normalize(sentence)
        tokens = normalized.split()

        for word in tokens:
            # stemming
            analysis = morphology.analyze(word)
            if analysis.analysis_results:
                word = str(analysis.analysis_results[0].item.lemma)

            processed_words.append(word)

    return ' '.join(processed_words)

INFO:zemberek.morphology.turkish_morphology:TurkishMorphology instance initialized in 26.161006689071655


2024-08-14 06:36:51,548 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 26.161006689071655



In [None]:
df['islenmis_Yorum'] = df['Yorum'].apply(preprocess_with_zemberek)

Unnamed: 0,islenmis_Yorum
0,Sakarya 1 tane Atm var o da banka yanında biz ...
1,0 puan vermek amaç Atm kart yuva sokmak ileri ...
2,Rize Albaraka şube Doru
3,diğer banka bakmak ne Atm ne mobil uygulamak ç...
4,ortak Atm yok mağaza geçerli ayrıcalık yok iht...
5,Atm biz yok çok yer
6,Atm men düzce merkez Birtane başka yok sürekli...
7,Albaraka memnun değil
8,qr kod ok hız çok yavaş ve okumak
9,mobil İnternet Par çok zor gitmek bazen bir ye...


In [None]:
complaints = df['islenmis_Yorum']
categories = df['Etiket']

complaints_train, complaints_test, categories_train, categories_test = train_test_split(complaints, categories, test_size= 0.2, random_state=23)

In [17]:
pipeSVC = Pipeline([ ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)), ('clf', LinearSVC()) ])

In [18]:
pipeSVC.fit(complaints_train,categories_train)
predictSVC = pipeSVC.predict(complaints_test)
print("SVC : {:.2f}".format(accuracy_score(categories_test, predictSVC)))

SVC : 0.82




In [20]:
input_file = input("Lütfen csv dosyasının yolunu girin: ")
df_in = pd.read_csv(input_file)

Lütfen csv dosyasının yolunu girin: https://raw.githubusercontent.com/AykutErenSahin/Customer-complaints-classification/main/ATM_test.csv


In [21]:
predict_in = pipeSVC.predict(df_in['Yorum'])
df_in['Etiket'] = predict_in
output_file = input("Lütfen çıktı dosyasının adını girin (örn: sonuclar.csv): ")
df_in.to_csv(output_file, index=False)

print(f"Sınıflandırma tamamlandı. Sonuçlar {output_file} dosyasına kaydedildi.")

Lütfen çıktı dosyasının adını girin (örn: sonuclar.csv): son.csv
Sınıflandırma tamamlandı. Sonuçlar son.csv dosyasına kaydedildi.


In [22]:
print("SVC : {:.2f}".format(accuracy_score(df['Etiket'], predict_in))) # accuracy'i kontrol etme

SVC : 0.80
