## <center> BİTİRME PROJESİ A </center>

In [52]:
import pandas as pd
import csv
from typing import List
import string
from jpype import JClass, getDefaultJVMPath, startJVM
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import gensim 
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

In [None]:
# Load the raw comment data set and preview its first 100 rows.
raw_comments_csv = "assets/comments/comment.csv"
df = pd.read_csv(raw_comments_csv)
df.head(n=100)

In [None]:
# Turkish stop words: read the whole list into memory as one string.
with open('assets/turkish-stop-words.txt', mode='r', encoding='utf-8') as stop_words_file:
    stopWords = stop_words_file.read()

# Echo the raw file contents as a quick sanity check.
print(stopWords)

In [4]:
# Start the JVM that hosts Zemberek and create the Turkish morphology analyzer.
from jpype import isJVMStarted

ZEMBEREK_PATH = r'zemberek/zemberek-full.jar'

# startJVM raises if a JVM is already running in this process, which happens
# whenever this cell is executed a second time in the same kernel, so only
# start it when none is running yet.
if not isJVMStarted():
    startJVM(getDefaultJVMPath(), '-ea', f'-Djava.class.path={ZEMBEREK_PATH}')

TurkishMorphology = JClass('zemberek.morphology.TurkishMorphology')
morphology = TurkishMorphology.createWithDefaults()

In [None]:
# Text preprocessing: lemmatize every comment with Zemberek, drop unknown
# words, stop words and purely numeric tokens, then store the cleaned comments
# first as a TXT file (one comment per line) and then as a one-column CSV.
inputFilePath = 'assets/comments/comment.csv'
outputFilePath = 'assets/processed_comments/processed_comment.txt'

# Read the raw comments from the 'text' column. newline='' is what the csv
# module expects so that line breaks inside quoted fields are parsed correctly.
with open(inputFilePath, 'r', newline='', encoding='utf-8') as file:
    csv_reader = csv.reader(file)
    headers = next(csv_reader)
    try:
        text_column_index = headers.index('text')
    except ValueError:
        raise ValueError("text sütunu bulunamadı.")

    lines = [line[text_column_index] for line in csv_reader]

# Stop words must be matched as whole words. `word in some_string` is a
# substring test, so testing against the raw file contents would also discard
# every lemma that merely occurs inside a longer stop word.
# Assumes the stop-word list is whitespace/newline separated.
if isinstance(stopWords, str):
    stopWordSet = set(stopWords.split())
else:
    stopWordSet = set(stopWords)

# The punctuation table does not depend on the comment, so build it once.
translator = str.maketrans('', '', string.punctuation)

# Opening with 'w' truncates any previous result; the file is opened a single
# time instead of being reopened in append mode for every comment.
with open(outputFilePath, 'w', encoding='utf-8') as file:
    for line in lines:
        wordsWithoutPunctuation = line.translate(translator)
        analysisWords = morphology.analyzeAndDisambiguate(wordsWithoutPunctuation).bestAnalysis()

        # Collect the first lemma of every analyzed word.
        pos: List[str] = []
        for i, analysis in enumerate(analysisWords, start=1):
            print(f'Analiz {i}: {analysis}')
            pos.append(str(analysis.getLemmas()[0]))

        # Drop unknown words ('UNK'), stop words and purely numeric tokens.
        keptWords = [
            word for word in pos
            if word != 'UNK'
            and word.lower() not in stopWordSet
            and not word.isdigit()
        ]

        file.write(" ".join(keptWords) + '\n')

print(f'Analiz edilen kelimeler (UNK olmayanlar) dosyaya yazıldı: {outputFilePath}')

# Convert the TXT result into a CSV file with a single 'text' column.
txtFilePath = 'assets/processed_comments/processed_comment.txt'
csvFilePath = 'assets/processed_comments/processed_comment.csv'
with open(txtFilePath, 'r', encoding='utf-8') as txtfile, \
        open(csvFilePath, 'w', newline='', encoding='utf-8') as csvfile:
    csvWriter = csv.writer(csvfile)
    csvWriter.writerow(['text'])

    for line in txtfile:
        csvWriter.writerow([line.strip()])

In [None]:
# Preview of the comments after preprocessing (first 100 rows).
processed_csv_path = 'assets/processed_comments/processed_comment.csv'
processed_df = pd.read_csv(processed_csv_path)
processed_df.head(n=100)

In [None]:
# Word cloud of the preprocessed comments.
df = pd.read_csv('assets/processed_comments/processed_comment.csv')

# Join all non-missing comments into one long string for the word cloud.
non_missing_comments = df['text'].dropna()
text_data = ' '.join(non_missing_comments)

cloud_options = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(text_data)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# <center>LDA MODELİ</center>

In [None]:
# Train one LDA model per candidate topic count and record its c_v coherence,
# so the best-scoring topic count can be picked afterwards.
topicCountRange = range(8, 30, 2)

coherenceCountList = []
topicCountList = []

# LDA model inputs
df = pd.read_csv('assets/processed_comments/processed_comment.csv')
# Empty cells are read back by pandas as NaN, and astype(str) alone would turn
# them into the literal token "nan". Replace them with an empty string first;
# row order and row count stay unchanged.
tokenized = [comment.split() for comment in df["text"].fillna("").astype(str)]
dictionary = corpora.Dictionary(tokenized)
dictionary.filter_extremes(no_below=1, no_above=0.7)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]

# NOTE(review): this sweep uses the default alpha/eta, while the final model
# further down in this notebook is trained with alpha='auto', eta='auto'.
# Confirm that the difference is intended.
for topicCount in topicCountRange:
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, topicCount, id2word=dictionary, passes=30)
    coherenceModelLda = CoherenceModel(model=ldamodel, texts=tokenized, dictionary=dictionary, coherence='c_v')
    mockCoherenceModelLda = coherenceModelLda.get_coherence()
    coherenceCountList.append(mockCoherenceModelLda)
    topicCountList.append(topicCount)

print(coherenceCountList)
print(topicCountList)

In [None]:
# Coherence score as a function of the topic count; used to pick the topic
# count that gives the best coherence.
plt.plot(topicCountList, coherenceCountList, '-')
plt.ylabel('Tutarlılık Skoru')
plt.xlabel('Topic Sayısı')
plt.show()

In [63]:
# LDA model inputs
df = pd.read_csv('assets/processed_comments/processed_comment.csv')
# Empty cells are read back by pandas as NaN, and astype(str) alone would turn
# them into the literal token "nan". Replace them with an empty string first;
# row order and row count stay unchanged, so corpus[i] still belongs to df row i.
tokenized = [comment.split() for comment in df["text"].fillna("").astype(str)]
dictionary = corpora.Dictionary(tokenized)
dictionary.filter_extremes(no_below=1, no_above=0.7)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]

# Train the LDA model
topicCount = 8  # number of topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, topicCount, id2word=dictionary, passes=30, alpha='auto', eta='auto')
# Each entry is a (topic_id, top_words) pair listing the 15 strongest words.
topics = ldamodel.print_topics(num_words=15)

In [None]:
# c_v coherence score of the trained LDA model.
coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=tokenized,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('LDA Tutarlılık Skoru: ', coherence_lda)

In [None]:
# Print every LDA topic as a (topic_id, top_words) tuple, one per line.
for topic_id, top_words in topics:
    print((topic_id, top_words))

In [None]:
# Store the LDA topics as CSV, then load them back into a DataFrame for display.
topics_csv_path = 'results/lda/topicler.csv'
pd.DataFrame(topics, columns=['topic_id', 'top_words']).to_csv(topics_csv_path, index=False)
topicsDf = pd.read_csv(topics_csv_path)
topicsDf.head(n=8)

In [69]:
# Infer the topic distribution of every document once and derive both the
# dominant topic and the per-document score table from that single result,
# so the two output files always agree with each other.
topicDistsList = []
documentTopicsList = []
for i, docTopics in enumerate(ldamodel[corpus]):
    # Cast to built-in int/float so the (topic_id, score) tuples are written
    # to CSV as plain numbers regardless of how NumPy scalars are repr()'d.
    documentTopics = [(int(topicId), float(score)) for topicId, score in docTopics]
    dominantTopic = max(documentTopics, key=lambda pair: pair[1])[0]
    topicDistsList.append((df.iloc[i]["text"], dominantTopic))
    # Pad to one slot per topic so the DataFrame below always gets exactly
    # ldamodel.num_topics columns, even if no document kept every topic.
    documentTopics.extend([None] * (ldamodel.num_topics - len(documentTopics)))
    documentTopicsList.append(documentTopics)

# Match every comment with its dominant topic
topicDistsDf = pd.DataFrame(topicDistsList, columns=['document_text', 'topic_id'])
topicDistsDf.to_csv('results/lda/yorumlarda_topic_dagilimlari.csv', index=False)

# Topic scores per document. Column topic_k holds the k-th (topic_id, score)
# pair returned for the document; because the model may omit low-probability
# topics, that pair is not necessarily the one for topic k. Read the topic id
# from the tuple itself.
documentTopicsDf = pd.DataFrame(documentTopicsList, columns=[f'topic_{k}' for k in range(ldamodel.num_topics)])
documentTopicsDf.insert(0, 'document_id', range(1, len(documentTopicsDf) + 1))
documentTopicsDf.to_csv('results/lda/topic_dagilimlari.csv', index=False)

In [None]:
# Preview of the comment / dominant-topic table (first 100 rows).
comment_topics_csv = 'results/lda/yorumlarda_topic_dagilimlari.csv'
topicDistsDf = pd.read_csv(comment_topics_csv)
topicDistsDf.head(n=100)

In [None]:
# Preview of the per-document topic score table (first 100 rows).
topic_scores_csv = 'results/lda/topic_dagilimlari.csv'
documentTopicsDf = pd.read_csv(topic_scores_csv)
documentTopicsDf.head(n=100)

In [None]:
# Bar chart of the topic scores of one document.
import re

df = pd.read_csv('results/lda/topic_dagilimlari.csv')
documentId = 28

# Use every topic_* column that exists in the file. Rebuilding the names with
# range(topicCount - 1) skipped the last column and dropped its score.
topicColumns = [col for col in df.columns if col.startswith('topic_')]
topicCount = len(topicColumns)

documentRows = df[df['document_id'] == documentId]
if documentRows.empty:
    # Fail with a clear message instead of an IndexError from `.values[0]`.
    raise ValueError(f"{documentId} idli döküman bulunamadı.")
documentRow = documentRows.iloc[0]

# Cells look like "(topic_id, score)". The optional group also accepts scores
# serialized with a NumPy scalar wrapper, e.g. "(3, np.float32(0.12))".
topicCellPattern = re.compile(
    r'\(\s*(\d+)\s*,\s*(?:np\.\w+\()?\s*([-+]?[0-9.]+(?:[eE][-+]?\d+)?)'
)

topicIds = []
topicScores = []
for col in topicColumns:
    topicInfo = documentRow[col]

    # Empty cells are read as NaN (a float); they carry no topic, so skip them.
    if not isinstance(topicInfo, str):
        continue

    cellMatch = topicCellPattern.match(topicInfo.strip())
    if cellMatch:
        topicIds.append(int(cellMatch.group(1)))
        topicScores.append(float(cellMatch.group(2)))

fig, ax = plt.subplots()
bars = plt.bar(topicIds, topicScores, color='black', alpha=0.3)
plt.title(f"{documentId} idli Dökümanın Topic Skor Grafiği - LDA")
plt.xlabel("Topic ID")
plt.ylabel("Skor")

plt.show()

In [None]:
# Report the topic matched to the largest number of comments and its top words.
data = pd.read_csv("results/lda/topicler.csv")

topic_match_counts = topicDistsDf['topic_id'].value_counts()
mostDominantTopicId = topic_match_counts.idxmax()
print("En fazla eşleşen topic: ", mostDominantTopicId)

# Look up the word list stored for that topic id.
goal = data.loc[data['topic_id'] == mostDominantTopicId, ['top_words']]
if goal.empty:
    print(f"{str(mostDominantTopicId)} idli topic bulunamadı.")
else:
    print("Kelimeler: ", goal.iloc[0]['top_words'])

In [None]:
# Distribution of the dominant topics over all documents, drawn as a bar
# chart, a line chart and a pie chart with a text legend.
resultsDf = pd.read_csv('results/lda/yorumlarda_topic_dagilimlari.csv')
topicCount = resultsDf['topic_id'].nunique()
topicDists = resultsDf['topic_id'].value_counts().sort_index()

# Topic ids that actually occur. A topic that is dominant in no document is
# missing from the counts, so the ids are not guaranteed to be 0..n-1 and the
# tick labels must come from the data rather than from range(topicCount).
presentTopicIds = list(topicDists.index)

# Bar chart: bars sit at the real topic ids, so the ticks do too.
plt.figure(figsize=(10, 6))
plt.bar(presentTopicIds, topicDists.values, color='black', alpha=0.3)
plt.title('Dökümanlardaki Topic Dağılımı - LDA')
plt.xlabel('Topic ID')
plt.xticks(ticks=presentTopicIds, labels=presentTopicIds)
plt.ylabel('Döküman Sayısı')

# Line chart: points sit at positions 0..n-1, labelled with the real topic ids.
linePositions = range(len(topicDists))
plt.figure(figsize=(10, 6))
plt.plot(linePositions, topicDists, marker='o', linestyle='-', color='black', label='Topic Dağılımı')
plt.title('Dökümanlardaki Topic Dağılımı - LDA')
plt.xlabel('Topic ID')
plt.xticks(ticks=linePositions, labels=presentTopicIds)
plt.ylabel('Döküman Sayısı')
plt.legend()

# Pie chart on the left, plain-text listing of the counts on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
ax1.pie(topicDists.values, labels=topicDists.index, autopct='%1.1f%%', startangle=90, colors=plt.cm.tab10.colors)
ax1.set_title('Dökümanlardaki Topic Dağılımı - LDA')

ax2.text(0.1, 0.9, 'Topicler:', fontsize=8, weight='bold')
ax2.set_xticks([])
ax2.set_yticks([])

# One text line per topic, stacked downwards below the heading.
for i, (topic_id, documentCount) in enumerate(topicDists.items()):
    ax2.text(0.1, 0.8 - i * 0.03, f'Topic {topic_id}, Eşleştiği Döküman Sayısı: {documentCount}', fontsize=8)

plt.show()