# Download Scrapped Data

In [1]:
# # Run this if you want to download the dataset
# !pip install gdown
# import gdown
# gdown.download('https://drive.google.com/uc?id=1-9IKXHKjXVStQiJiq1j4ZruDfRnLs0FP', 'combined.csv', quiet=False)

# Preprocess the Data

In [2]:
import pandas as pd

df = pd.read_csv('combined.csv')
shuffled_light_df = df.sample(90000)

In [3]:
shuffled_light_df.head()

Unnamed: 0,Question,Answer
122265,"dok,mau tanya jika penyakit DMD itu apa yah ba...","Hallo Bambang, Terimakasih sudah bertanya ke A..."
225473,"Siang dok , saya mau tanya . apakah susu hamil...",Hai\nMengkonsumsi saat hamil tentunya memiliki...
107998,"Hai dok, saya punya bayi usia 1bulan ini anak ...","Salam Alodokter,\nTerima kasih sudah bertanya ..."
37723,Selamat sore dok. hasil eeg anak saya di diagn...,"Alo, selamat sore yoga\ndarai hasil EEG si adi..."
94604,"Pagi dok, ayah saya kemarin malem habis pemasa...","Alo, terimakasih atas pertanyaannya.\nUrin ber..."


In [4]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(df):
    # Drop rows with missing data in column: 'Question'
    df = df.dropna(subset=['Question'])
    # Drop rows with missing data in column: ' Answer'
    df = df.dropna(subset=[' Answer'])
    # Drop duplicate rows in column: 'Question'
    df = df.drop_duplicates(subset=['Question'])
    # Drop duplicate rows in column: ' Answer'
    df = df.drop_duplicates(subset=[' Answer'])
    # Convert text to lowercase in columns: 'Question', ' Answer'
    df['Question'] = df['Question'].str.lower()
    df[' Answer'] = df[' Answer'].str.lower()
    return df

df_cleaned_missing_values = clean_data(shuffled_light_df.copy())
df_cleaned_missing_values.head()

Unnamed: 0,Question,Answer
122265,"dok,mau tanya jika penyakit dmd itu apa yah ba...","hallo bambang, terimakasih sudah bertanya ke a..."
225473,"siang dok , saya mau tanya . apakah susu hamil...",hai\nmengkonsumsi saat hamil tentunya memiliki...
107998,"hai dok, saya punya bayi usia 1bulan ini anak ...","salam alodokter,\nterima kasih sudah bertanya ..."
37723,selamat sore dok. hasil eeg anak saya di diagn...,"alo, selamat sore yoga\ndarai hasil eeg si adi..."
94604,"pagi dok, ayah saya kemarin malem habis pemasa...","alo, terimakasih atas pertanyaannya.\nurin ber..."


In [5]:
def shorten_text(text, max_length=4000):
    if len(text) <= max_length:
        return text
    shortened_text = text[:max_length]
    last_period_index = shortened_text.rfind('.')
    if last_period_index != -1:
        return shortened_text[:last_period_index + 1]
    else:
        return shortened_text.rstrip() + '.'

def clean_identity_info(df):
    # remove unfaedah
    df[' Answer'] = df[' Answer'].str.replace("\S*\.com\S*", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("halo[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("hai[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("alo[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("hallo[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("hei[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("hay[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("terimakasih[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("terima kasih[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("halo[,.\n\s]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("hai[,.\n\s]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("alo[,.\n\s]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("hallo[,.\n\s]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("hei[,.\n\s]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("hay[,.\n\s]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("terima kasih[,.\n\s]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("dear[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("salam[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("asalamualatium[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("waalaikumsalam[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("assalamualaikum[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("sore[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("selamat pagi[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("terima kasih[,.\n\s]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("pagi[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("ass[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("dok[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("hi[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("hy[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("selamat siang[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("selamat malam[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("malam[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
    df[' Answer'] = df[' Answer'].str.replace("-->", "-")
    df['Question'] = df['Question'].str.replace("(Selamat )?(pagi|siang|sore|malam)( dokter| dok)?[,.\n\s]", "", case=False, regex=True)
    df['Question'] = df['Question'].str.replace("(halo|hi|hai|hallo|permisi|assalamualaikum|salam|misi|perkenalkan|maaf|assalamu'alaikum|assalamualaiku)( wr.wb.)?( dokter| dok)?[,.\n\s]", "", case=False, regex=True)
    df['Question'] = df['Question'].str.replace("(saya |dok )?(mau|izin|ijin|numpang|ingin) (tanya|brtanya|bertanya|konsul|nanya)( ya| nih| dong)?( dokter| dok)?[,.\n\s]", "", case=False, regex=True)
    df['Question'] = df['Question'].str.replace("alodokter|alodok[.,\n\s]", "", case=False, regex=True)

    # cleaning dr name
    df[' Answer'] = df[' Answer'].str.replace("dr\\.[^.,]*", "", case=False, regex=True)
    
    #shorten dataset
    df[' Answer'] = df[' Answer'].apply(lambda x: x[:4500] if len(x) > 4500 else x)
    
    return df

cleaned_data = clean_identity_info(df_cleaned_missing_values.copy())
cleaned_data = cleaned_data[cleaned_data[" Answer"].apply(len) >= 500]
cleaned_data = cleaned_data[cleaned_data[" Answer"].apply(len) <= 1700]
cleaned_data = cleaned_data[cleaned_data["Question"].apply(len) <= 400]
cleaned_data.head()

  df[' Answer'] = df[' Answer'].str.replace("\S*\.com\S*", "", case=False, regex=True)
  df[' Answer'] = df[' Answer'].str.replace("halo[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
  df[' Answer'] = df[' Answer'].str.replace("hai[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
  df[' Answer'] = df[' Answer'].str.replace("alo[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
  df[' Answer'] = df[' Answer'].str.replace("hallo[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
  df[' Answer'] = df[' Answer'].str.replace("hei[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
  df[' Answer'] = df[' Answer'].str.replace("hay[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
  df[' Answer'] = df[' Answer'].str.replace("terimakasih[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
  df[' Answer'] = df[' Answer'].str.replace("terima kasih[\s,][a-z \s]+[.,\n]", "", case=False, regex=True)
  df[' Answer'] = df[' Answer'].str.replace("halo[,.\n\s]", "", case=False, regex=True)
  df[' Answer

Unnamed: 0,Question,Answer
225473,", . apakah susu hamil yang sudah diseduh lalu ...",mengkonsumsi saat hamil tentunya memiliki manf...
37723,hasil eeg anak saya di diagnosa abnormal : pe...,namun ada tampaknya disfungsi kortikal yang m...
94604,ayah saya kemarin malem habis pemasangan kate...,\nurin berdarah (hematuria) pada pria bisa ban...
71162,saya ada keluhan tenggorokan sudah hampir 2 ta...,dimana keluhan sakit tenggorokan ini banyak d...
78588,dok. saya sudah periksa untuk gejalanya magh d...,\ngangguan penciuman dan pengecapan disertai s...


# Export Cleaned Data to CSV

In [6]:
## Preprocess dataframe
rename_mapping = {'Question': 'instruction', ' Answer': 'output'}
cleaned_data = cleaned_data.rename(columns=rename_mapping)
cleaned_data['input'] = [''] * len(cleaned_data)

In [7]:
import json

# Assuming you have a DataFrame named df
data_list = cleaned_data.to_dict(orient='records')

# Save the list of dictionaries as a JSON file
with open('output2.json', 'w') as file:
    json.dump(data_list, file, indent=4)

In [8]:
print(cleaned_data[50000:50010])

Empty DataFrame
Columns: [instruction, output, input]
Index: []
