In [3]:
import pandas as pd

# Nama file CSV
csv_filename = "scholar_titles_links.csv"

# Membaca file CSV
try:
    df = pd.read_csv(csv_filename, encoding="utf-8")
    print(df.head())  # Menampilkan 5 baris pertama
except FileNotFoundError:
    print(f"❌ File '{csv_filename}' tidak ditemukan.")
except Exception as e:
    print(f"⚠️ Terjadi kesalahan: {e}")


                                               Title  \
0  I enjoy writing and playing, do you?: a person...   
1  Teaching students about conversational ai usin...   
2  Towards an online empathetic chatbot with emot...   
3  Compeer: A generative conversational agent for...   
4  Conversational AI-Chatbot Architectures and Ev...   

                                                Link  Year  \
0  https://ieeexplore.ieee.org/abstract/document/...  2022   
1  https://ieeexplore.ieee.org/abstract/document/...  2021   
2  https://dl.acm.org/doi/abs/10.1145/3404835.346...  2021   
3  https://dl.acm.org/doi/abs/10.1145/3654777.367...  2024   
4  https://sydneyacademics.com/index.php/ajmlra/a...  2021   

                                            Abstract  
0  conversational agent to communicate with its u...  
1  CONVO to train ML models and create conversati...  
2  To answer these sub-questions, we develop an E...  
3  generative agents to improve people’s mental h...  
4  studies, to 

Lowercasing: Mengubah teks menjadi huruf kecil.
Removing Punctuation & Special Characters: Menghapus tanda baca.
Removing Numbers: Menghapus angka dari teks.
Tokenization: Memisahkan teks menjadi kata-kata.
Stopword Removal: Menghapus kata-kata umum yang tidak memiliki makna penting.
Lemmatization: Mengubah kata menjadi bentuk dasarnya.

In [4]:
import nltk

# Download the required resources again
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

print("✅ All necessary NLTK resources have been downloaded.")

✅ All necessary NLTK resources have been downloaded.


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Unity_Comp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Unity_Comp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Unity_Comp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Unity_Comp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Inisialisasi Lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """
    Fungsi untuk membersihkan teks (hanya untuk kolom Title dan Abstract).
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)

    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Remove numbers
    text = re.sub(r"\d+", "", text)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join back into a string
    cleaned_text = " ".join(tokens)

    return cleaned_text





In [6]:
try:

    # Bersihkan hanya kolom "Title" dan "Abstract"
    df["Title"] = df["Title"].apply(clean_text)
    df["Abstract"] = df["Abstract"].apply(clean_text)

    # Simpan kembali ke CSV
    cleaned_csv_filename = "cleaned_scholar_titles_links.csv"
    df.to_csv(cleaned_csv_filename, index=False, encoding="utf-8")

    print(f"\n✅ Preprocessing selesai! Data disimpan ke '{cleaned_csv_filename}'.")
    print(df.head())  # Tampilkan beberapa hasil pertama

except FileNotFoundError:
    print(f"❌ File '{csv_filename}' tidak ditemukan.")
except Exception as e:
    print(f"⚠️ Terjadi kesalahan: {e}")



✅ Preprocessing selesai! Data disimpan ke 'cleaned_scholar_titles_links.csv'.
                                               Title  \
0  enjoy writing playing personalized emotion gro...   
1  teaching student conversational ai using convo...   
2    towards online empathetic chatbot emotion cause   
3  compeer generative conversational agent proact...   
4  conversational aichatbot architecture evaluati...   

                                                Link  Year  \
0  https://ieeexplore.ieee.org/abstract/document/...  2022   
1  https://ieeexplore.ieee.org/abstract/document/...  2021   
2  https://dl.acm.org/doi/abs/10.1145/3404835.346...  2021   
3  https://dl.acm.org/doi/abs/10.1145/3654777.367...  2024   
4  https://sydneyacademics.com/index.php/ajmlra/a...  2021   

                                            Abstract  
0  conversational agent communicate user assist u...  
1  convo train ml model create conversational app...  
2  answer subquestions develop empathetic chat

In [7]:
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud

# 📂 Baca file CSV
csv_filename = "cleaned_scholar_titles_links.csv"

try:
    df = pd.read_csv(csv_filename, encoding="utf-8")

    # Bersihkan hanya kolom "Title" dan "Abstract"
    df["Title"] = df["Title"].astype(str).apply(clean_text)
    df["Abstract"] = df["Abstract"].astype(str).apply(clean_text)

    # Gabungkan teks dari kolom Title & Abstract
    combined_text = " ".join(df["Title"]) + " " + " ".join(df["Abstract"])

    # Hitung frekuensi kata
    word_counts = Counter(combined_text.split())

    # Ambil 10 kata paling sering muncul
    most_common_words = word_counts.most_common(10)

    print("\n📊 10 Kata Paling Sering Muncul:")
    for word, count in most_common_words:
        print(f"{word}: {count}")

    # 📌 Buat WordCloud
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(combined_text)

    # Tampilkan WordCloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Word Cloud - Frekuensi Kata", fontsize=14)
    plt.show()

except FileNotFoundError:
    print(f"❌ File '{csv_filename}' tidak ditemukan.")
except Exception as e:
    print(f"⚠️ Terjadi kesalahan: {e}")








ModuleNotFoundError: No module named 'matplotlib'