In [None]:
!pip install datasets

In [None]:
!pip install nltk spacy
!python -m spacy download en_core_web_sm

In [None]:
import multiprocessing as mp
import os
import string

import nltk
import numpy as np
import pandas as pd
import spacy
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
from datasets import load_dataset

In [None]:
# Load the `yelp_review_full` dataset; the displayed result below shows a
# DatasetDict with `train` and `test` splits, each holding `label` and `text`.
ds = load_dataset('yelp_review_full')

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [None]:
ds['train'][4]

{'label': 0,
 'text': "I don't know what Dr. Goldberg was like before  moving to Arizona, but let me tell you, STAY AWAY from this doctor and this office. I was going to Dr. Johnson before he left and Goldberg took over when Johnson left. He is not a caring doctor. He is only interested in the co-pay and having you come in for medication refills every month. He will not give refills and could less about patients's financial situations. Trying to get your 90 days mail away pharmacy prescriptions through this guy is a joke. And to make matters even worse, his office staff is incompetent. 90% of the time when you call the office, they'll put you through to a voice mail, that NO ONE ever answers or returns your call. Both my adult children and husband have decided to leave this practice after experiencing such frustration. The entire office has an attitude like they are doing you a favor. Give me a break! Stay away from this doc and the practice. You deserve better and they will not be the

In [None]:
ds['train'].features

{'label': ClassLabel(names=['1 star', '2 star', '3 stars', '4 stars', '5 stars'], id=None),
 'text': Value(dtype='string', id=None)}

In [None]:
# Convert each split to a pandas DataFrame.
# Dataset.to_pandas() converts the underlying table in one step, instead of
# pd.DataFrame(dataset), which iterates the dataset row by row as Python dicts.
ds_train = ds['train'].to_pandas()
ds_test = ds['test'].to_pandas()

In [None]:
ds_train.head()

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under this path.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

# List contents of root directory
!ls /content/drive/My\ Drive

In [None]:
# List contents of the project folder on Google Drive
!ls /content/drive/My\ Drive/'Colab Notebooks'/'LLM Project GoogleColab'

'LLM Project GC.ipynb'


# Explore the data frames and preprocess the data

In [None]:
# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize looks up in recent
# NLTK releases (3.9+); 'punkt' is kept so older releases keep working.
# NOTE(review): on an NLTK version that does not know 'punkt_tab', the extra
# download is expected to just report an error and return False — confirm.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Set up stopwords and spaCy.
# `stop_words` is a set so the membership test in remove_stopwords is O(1).
stop_words = set(stopwords.words('english'))
# `nlp` is the shared spaCy pipeline used by lemmatize().
nlp = spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Define all preprocessing functions
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Characters are removed outright (not replaced by a space), so
    ``"don't"`` becomes ``"dont"`` and ``"top-notch"`` becomes ``"topnotch"``.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def tokenize(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    The comparison is done on the lower-cased token; surviving tokens keep
    their original casing and order.
    """
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

def lemmatize(tokens):
    """Return the lemmas of the tokens that spaCy tags as nouns.

    The tokens are re-joined with single spaces and run through the
    module-level ``nlp`` pipeline. Despite the name, this is also a filter:
    only tokens whose ``pos_`` is ``'NOUN'`` contribute a lemma; everything
    else is dropped.
    """
    parsed = nlp(" ".join(tokens))
    noun_lemmas = []
    for tok in parsed:
        if tok.pos_ == 'NOUN':
            noun_lemmas.append(tok.lemma_)
    return noun_lemmas

def preprocess(text):
    """Clean one review and return its noun lemmas as a space-separated string.

    Steps, in order: turn escaped newlines into spaces, strip punctuation,
    tokenize, drop English stopwords, then keep the lemmas of nouns only
    (see ``lemmatize``).

    Args:
        text: The raw review text.

    Returns:
        A single string of lemmas joined by spaces; empty if nothing survives.
    """
    # Reviews can contain the two-character sequence backslash + "n" instead of
    # a real newline. Stripping punctuation would delete only the backslash and
    # glue the leftover "n" onto the next word (e.g. "\n\nI think" -> "nnI think"),
    # so replace the escape with whitespace first.
    text = text.replace("\\n", " ")
    text = remove_punctuation(text)
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize(tokens)
    return " ".join(tokens)


In [None]:
# Function to apply preprocessing in parallel
def parallel_preprocess(df, num_partitions, num_cores):
    """Run ``apply_preprocessing`` over ``df`` in parallel and recombine the result.

    Args:
        df: DataFrame to process; ``apply_preprocessing`` reads its ``text`` column.
        num_partitions: Number of row-wise chunks ``df`` is split into.
        num_cores: Number of worker processes in the pool.

    Returns:
        A DataFrame made by concatenating the processed chunks in their
        original order.
    """
    # Split by row position instead of passing the frame to np.array_split,
    # which goes through the deprecated DataFrame.swapaxes. The chunk sizes are
    # the same as np.array_split would produce on the frame itself.
    row_groups = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[rows] for rows in row_groups]

    # The context manager shuts the pool down even if a worker raises, so
    # worker processes are not leaked on failure. pool.map blocks until every
    # chunk is done, so all results are available before the pool is torn down.
    with mp.Pool(num_cores) as pool:
        processed = pool.map(apply_preprocessing, df_split)

    return pd.concat(processed)

def apply_preprocessing(df):
    """Return a copy of ``df`` with a ``cleaned_text`` column added.

    ``cleaned_text`` holds ``preprocess(text)`` for every row of the ``text``
    column. A new frame is returned via ``assign`` rather than writing into
    ``df``: the chunks passed in by ``parallel_preprocess`` are selections of a
    larger frame, and assigning a column onto them in place mutates the
    caller's object and can trigger pandas' SettingWithCopyWarning.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        A new DataFrame with the same rows and columns plus ``cleaned_text``.
    """
    return df.assign(cleaned_text=df['text'].apply(preprocess))

# Number of partitions to split each dataframe into, and number of worker processes
num_partitions = 10
num_cores = mp.cpu_count()

# Apply preprocessing in parallel; each frame gains a `cleaned_text` column
ds_train = parallel_preprocess(ds_train, num_partitions, num_cores)
ds_test = parallel_preprocess(ds_test, num_partitions, num_cores)

# Save the processed dataframes to Google Drive.
# to_csv does not create missing parent directories, so make sure the target
# folder exists first (exist_ok=True makes re-running this cell safe).
DATA_DIR = r"/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Data"
os.makedirs(DATA_DIR, exist_ok=True)
ds_train.to_csv(os.path.join(DATA_DIR, "cleaned_ds_train.csv"), index=False)
ds_test.to_csv(os.path.join(DATA_DIR, "cleaned_ds_test.csv"), index=False)

In [None]:
ds_train.head()

Unnamed: 0,label,text,cleaned_text
0,4,dr. goldberg offers everything i look for in a...,practitioner talk time patient topnotch hospit...
1,1,"Unfortunately, the frustration of being Dr. Go...",repeat experience doctor doctor staff staff ph...
2,3,Been going to Dr. Goldberg for over 10 years. ...,year patient year picture fibroid option under...
3,3,Got a letter in the mail last week that said D...,letter mail week position nni think doctor try...
4,0,I don't know what Dr. Goldberg was like before...,doctor office doctor medication refill month r...


In [None]:
ds_train.head()

Unnamed: 0,label,text,cleaned_text
0,4,dr. goldberg offers everything i look for in a...,practitioner talk time patient topnotch hospit...
1,1,"Unfortunately, the frustration of being Dr. Go...",repeat experience doctor doctor staff staff ph...
2,3,Been going to Dr. Goldberg for over 10 years. ...,year patient year picture fibroid option under...
3,3,Got a letter in the mail last week that said D...,letter mail week position nni think doctor try...
4,0,I don't know what Dr. Goldberg was like before...,doctor office doctor medication refill month r...


# Convert the data frames back to datasets for easier use with the Hugging Face API

In [None]:
from datasets import Dataset, DatasetDict

# Reload the cleaned frames from the CSV files written by the preprocessing step.
# keep_default_na=False stops pandas from turning empty strings (a review with
# no surviving tokens has cleaned_text == "") and strings such as "NA" or
# "null" into NaN, so both text columns come back as plain strings.
ds_train = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Data/cleaned_ds_train.csv',
    keep_default_na=False,
)
ds_test = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Data/cleaned_ds_test.csv',
    keep_default_na=False,
)

# assign the splits
train = Dataset.from_pandas(ds_train)
test = Dataset.from_pandas(ds_test)
# reconstruct both datasets into a Dataset Dict object
new_ds = DatasetDict(
    {
        'train': train,
        'test': test
    }
)
# view the resulting dataset dict object
new_ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'cleaned_text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text', 'cleaned_text'],
        num_rows: 50000
    })
})

In [None]:
# Save the rebuilt DatasetDict (`new_ds`, with the train and test splits) to Google Drive.
# `save_path` is the target directory passed to save_to_disk.
save_path = '/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Data/new_ds'
new_ds.save_to_disk(save_path)