## RAG

### 1. Load the data from HuggingFace

In [2]:
from datasets import load_dataset
import pandas as pd

# Load the MedRAG/pubmed dataset from the Hugging Face Hub
ds = load_dataset("MedRAG/pubmed")

# Get the train split; only its first 20k samples are kept below
train_data = ds['train']

df = pd.DataFrame(train_data[:20000])

# Save the 20k-sample subset (not the whole train split) to a CSV file
df.to_csv('train.csv', index=False)

### 2. Data cleaning

In [None]:
import spacy
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
# Load the `en_core_web_sm` spaCy pipeline once; the text helpers below call `nlp`
nlp = spacy.load('en_core_web_sm')

In [None]:
# Read back the sample that the loading step wrote to train.csv
data_raw = pd.read_csv('train.csv')

In [None]:
# Preview the first rows of the raw data
data_raw.head()

In [None]:
# Show the 'contents' value of the first record
data_raw.iloc[0]['contents']

In [None]:
# Parse the first record with spaCy; `doc` is reused by later cells
# (token table and lemma comparison)
doc = nlp(data_raw.iloc[0]['contents'])

In [None]:
# Histogram (50 bins) of per-record word counts, where a "word" is a chunk
# obtained by splitting on single spaces.
# NOTE(review): assumes every 'contents' value is a string; a missing value
# read back from the CSV would have no .split() — confirm there are none.
data_raw['contents'].apply(lambda x: len(x.split(' '))).plot(kind='hist', bins=50)

In [None]:
def token_to_df(doc=doc):
    """Tabulate per-token attributes of a spaCy document.

    Args:
        doc: Iterable of spaCy tokens (normally a ``Doc``). NOTE: the default
            is the module-level ``doc`` as it was when this ``def`` was
            executed; rebinding ``doc`` afterwards does not change the
            default, so prefer passing the document explicitly.

    Returns:
        A ``pandas.DataFrame`` with one row per token and one column per
        entry of ``fields`` below. An empty document yields an empty frame
        that still carries those columns.
    """
    # Column label -> token attribute the column is filled from.
    # NOTE(review): "Is Alphanumeric?" is read from ``is_alpha``; the label
    # is kept unchanged so existing column lookups keep working.
    fields = {
        "Text": "text",
        "Index": "idx",
        "Whitespace": "is_space",
        "Is Alphanumeric?": "is_alpha",
        "Is Punctuation?": "is_punct",
        "Is Stop Word?": "is_stop",
    }
    rows = [
        {label: getattr(token, attr) for label, attr in fields.items()}
        for token in doc
    ]
    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=list(fields))

# Display the token table for the parsed first record
token_to_df(doc)

In [None]:
def clean_text(text):
    """Reduce ``text`` to its alphabetic, non-stop-word tokens.

    The text is run through the spaCy pipeline ``nlp``; a token is kept only
    if it is alphabetic and is neither a stop word nor punctuation. The
    original text of the kept tokens is returned joined by single spaces.
    """
    kept = []
    for token in nlp(text):
        if not token.is_alpha:
            continue
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.text)
    return ' '.join(kept)

def lemmalize_text(text):
    """Return ``text`` with every spaCy token replaced by its lemma.

    The lemmas are joined with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

In [None]:
# Run the first record through clean_text and then lemmalize_text,
# and parse the resulting string again so its tokens can be inspected
text = data_raw.iloc[0]['contents']
text = clean_text(text)
text = lemmalize_text(text)
clean_doc = nlp(text)

In [None]:
# Display the token table for the cleaned and lemmatized first record
token_to_df(clean_doc)

In [None]:
# List every token of the first record whose lemma differs from its text
for token in doc:
    if str(token) == str(token.lemma_):
        continue  # lemma equals the surface form, nothing to report
    print(f"{str(token):>20} : {str(token.lemma_)}")

In [None]:
def tokenized_text(text):
    """Tokenize ``text`` with spaCy and return the lower-cased token texts as a list."""
    return [token.text.lower() for token in nlp(text)]

In [None]:
# Full cleaning pipeline over every record: filter tokens, lemmatize, then
# tokenize into lower-cased token lists.
# NOTE(review): each step calls the spaCy pipeline again, so every record is
# parsed three times.
clean = (
    data_raw['contents']
    .apply(clean_text)
    .apply(lemmalize_text)
    .apply(tokenized_text)
)
clean

In [None]:
# Histogram (50 bins) of the number of tokens per cleaned record
clean.apply(len).plot(kind='hist', bins=50)