## Loading and Pre-processing our data

In [2]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd 
import pprint as pp
import homemade_functions as hf



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\finch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\finch\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Load in our Yelp Reviews dataset

In [3]:
# Fetch the Yelp full-review corpus through Hugging Face `datasets`;
# every later cell reads its splits from `ds`.
ds = load_dataset(path='yelp_review_full')

In [4]:
# Pretty-print the loaded object to see which splits and features it holds.
print(pp.pformat(ds))

{'test': Dataset({
    features: ['label', 'text'],
    num_rows: 50000
}),
 'train': Dataset({
    features: ['label', 'text'],
    num_rows: 650000
})}


In [5]:
# Pretty-print the 'test' split on its own.
print(pp.pformat(ds['test']))

Dataset({
    features: ['label', 'text'],
    num_rows: 50000
})


In [5]:
# Peek at the labels of the first five test rows (slice the rows, then pick the column).
pp.pprint(ds['test'][:5]['label'])

[0, 0, 0, 0, 0]


In [13]:
# Peek at the raw text of the first two test reviews (slice the rows, then pick the column).
pp.pprint(ds['test'][:2]['text'])

["I got 'new' tires from them and within two weeks got a flat. I took my car "
 'to a local mechanic to see if i could get the hole patched, but they said '
 'the reason I had a flat was because the previous patch had blown - WAIT, '
 'WHAT? I just got the tire and never needed to have it patched? This was '
 "supposed to be a new tire. \\nI took the tire over to Flynn's and they told "
 'me that someone punctured my tire, then tried to patch it. So there are '
 'resentful tire slashers? I find that very unlikely. After arguing with the '
 "guy and telling him that his logic was far fetched he said he'd give me a "
 'new tire \\"this time\\". \\nI will never go back to Flynn\'s b/c of the way '
 'this guy treated me and the simple fact that they gave me a used tire!',
 "Don't waste your time.  We had two different people come to our house to "
 'give us estimates for a deck (one of them the OWNER).  Both times, we never '
 'heard from them.  Not a call, not the estimate, nothing.']


In [11]:
# Turn each split into a pandas DataFrame so the pre-processing cells can
# work column-wise with `.apply`.
train_df, test_df = (pd.DataFrame(ds[split_name]) for split_name in ('train', 'test'))

# Preview the first training rows.
train_df.head()

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


### Let's pre-process our data
 - Clean 
    - Remove all punctuation and make everything the same case
 - Lemmatize
    - Use WordNetLemmatizer to reduce words to their base form
 - Stem
    - Use PorterStemmer to strip word endings so that related word forms share the same stem


We will keep all three as separate datasets.

#### First thing we need to do is adjust our label column from 0-4 into a 0-2 rating system (negative, neutral, positive)

In [12]:
# Collapse the five raw labels (0-4) into three sentiment classes:
# 0 = negative, 1 = neutral, 2 = positive.
label_map = {
    0: 0,
    1: 0,
    2: 1,
    3: 2,
    4: 2
}

# Derive the collapsed label from the untouched labels in `ds` rather than from
# the frame's current 'label' column. Remapping the column in place made this
# cell non-idempotent: a second run folded the already-collapsed 1 -> 0 and
# 2 -> 1, silently leaving only classes {0, 1}.
# Assumes the frames still have the same row order as their `ds` split
# (they are built directly from it); a length mismatch raises here.
test_df['label'] = pd.Series(list(ds['test']['label']), index=test_df.index).map(label_map)
train_df['label'] = pd.Series(list(ds['train']['label']), index=train_df.index).map(label_map)

# `.map` turns any raw label missing from `label_map` into NaN; fail loudly
# instead of carrying a partially mapped column forward.
assert test_df['label'].notna().all() and train_df['label'].notna().all(), \
    "found a raw label that is missing from label_map"

print(f"Minimum = {min(test_df['label'])} \nMaximum = {max(test_df['label'])}")

Minimum = 0 
Maximum = 2


In [13]:
# Cleaning runs first: both derived columns below are computed from 'cleaned'.
test_df['cleaned'] = test_df['text'].apply(hf.clean_tokenize_drop_stops)
print('cleaned')

# (new column, transform, progress message) for each derived token column.
for _column, _transform, _message in (
    ('stemmed_cleaned', hf.stem, 'stemmed'),
    ('lemma_cleaned', hf.lemma, 'lemmad'),
):
    test_df[_column] = test_df['cleaned'].apply(_transform)
    print(_message)

# Preview the raw text next to the three processed columns.
test_df.head()

cleaned
stemmed
lemmad


Unnamed: 0,label,text,cleaned,stemmed_cleaned,lemma_cleaned
0,0,I got 'new' tires from them and within two wee...,"[got, new, tires, within, two, weeks, got, fla...","[got, new, tire, within, two, week, got, flat,...","[got, new, tire, within, two, week, got, flat,..."
1,0,Don't waste your time. We had two different p...,"[dont, waste, time, two, different, people, co...","[dont, wast, time, two, differ, peopl, come, h...","[dont, waste, time, two, different, people, co..."
2,0,All I can say is the worst! We were the only 2...,"[say, worst, 2, people, place, lunch, place, f...","[say, worst, 2, peopl, place, lunch, place, fr...","[say, worst, 2, people, place, lunch, place, f..."
3,0,I have been to this restaurant twice and was d...,"[restaurant, twice, disappointed, times, wont,...","[restaur, twice, disappoint, time, wont, go, b...","[restaurant, twice, disappointed, time, wont, ..."
4,0,Food was NOT GOOD at all! My husband & I ate h...,"[food, good, husband, ate, couple, weeks, ago,...","[food, good, husband, ate, coupl, week, ago, f...","[food, good, husband, ate, couple, week, ago, ..."


In [14]:
# Cleaning runs first: both derived columns below are computed from 'cleaned'.
train_df['cleaned'] = train_df['text'].apply(hf.clean_tokenize_drop_stops)
print('cleaned')

# (new column, transform, progress message) for each derived token column.
for _column, _transform, _message in (
    ('stemmed_cleaned', hf.stem, 'stemmed'),
    ('lemma_cleaned', hf.lemma, 'lemmad'),
):
    train_df[_column] = train_df['cleaned'].apply(_transform)
    print(_message)

# Preview the raw text next to the three processed columns.
train_df.head()

cleaned
stemmed
lemmad


Unnamed: 0,label,text,cleaned,stemmed_cleaned,lemma_cleaned
0,2,dr. goldberg offers everything i look for in a...,"[dr, goldberg, offers, everything, look, gener...","[dr, goldberg, offer, everyth, look, gener, pr...","[dr, goldberg, offer, everything, look, genera..."
1,0,"Unfortunately, the frustration of being Dr. Go...","[unfortunately, frustration, dr, goldbergs, pa...","[unfortun, frustrat, dr, goldberg, patient, re...","[unfortunately, frustration, dr, goldberg, pat..."
2,2,Been going to Dr. Goldberg for over 10 years. ...,"[going, dr, goldberg, 10, years, think, one, 1...","[go, dr, goldberg, 10, year, think, one, 1st, ...","[going, dr, goldberg, 10, year, think, one, 1s..."
3,2,Got a letter in the mail last week that said D...,"[got, letter, mail, last, week, said, dr, gold...","[got, letter, mail, last, week, said, dr, gold...","[got, letter, mail, last, week, said, dr, gold..."
4,0,I don't know what Dr. Goldberg was like before...,"[dont, know, dr, goldberg, like, moving, arizo...","[dont, know, dr, goldberg, like, move, arizona...","[dont, know, dr, goldberg, like, moving, arizo..."


In [15]:
# Build one (train, test) pair of frames per text variant. Each frame keeps only
# the label plus a single token column, renamed to 'text' so that all three
# variants expose the same two columns.
stem_train_df, stem_test_df = (
    split_df[['label', 'stemmed_cleaned']].rename(columns={'stemmed_cleaned': 'text'})
    for split_df in (train_df, test_df)
)

lemm_train_df, lemm_test_df = (
    split_df[['label', 'lemma_cleaned']].rename(columns={'lemma_cleaned': 'text'})
    for split_df in (train_df, test_df)
)

clean_train_df, clean_test_df = (
    split_df[['label', 'cleaned']].rename(columns={'cleaned': 'text'})
    for split_df in (train_df, test_df)
)

In [16]:
# Preview the stemmed and lemmatized frames; display() renders its arguments
# one after another, in the order given.
display(
    stem_train_df.head(),
    stem_test_df.head(),
    lemm_train_df.head(),
    lemm_test_df.head(),
)

Unnamed: 0,label,text
0,2,"[dr, goldberg, offer, everyth, look, gener, pr..."
1,0,"[unfortun, frustrat, dr, goldberg, patient, re..."
2,2,"[go, dr, goldberg, 10, year, think, one, 1st, ..."
3,2,"[got, letter, mail, last, week, said, dr, gold..."
4,0,"[dont, know, dr, goldberg, like, move, arizona..."


Unnamed: 0,label,text
0,0,"[got, new, tire, within, two, week, got, flat,..."
1,0,"[dont, wast, time, two, differ, peopl, come, h..."
2,0,"[say, worst, 2, peopl, place, lunch, place, fr..."
3,0,"[restaur, twice, disappoint, time, wont, go, b..."
4,0,"[food, good, husband, ate, coupl, week, ago, f..."


Unnamed: 0,label,text
0,2,"[dr, goldberg, offer, everything, look, genera..."
1,0,"[unfortunately, frustration, dr, goldberg, pat..."
2,2,"[going, dr, goldberg, 10, year, think, one, 1s..."
3,2,"[got, letter, mail, last, week, said, dr, gold..."
4,0,"[dont, know, dr, goldberg, like, moving, arizo..."


Unnamed: 0,label,text
0,0,"[got, new, tire, within, two, week, got, flat,..."
1,0,"[dont, waste, time, two, different, people, co..."
2,0,"[say, worst, 2, people, place, lunch, place, f..."
3,0,"[restaurant, twice, disappointed, time, wont, ..."
4,0,"[food, good, husband, ate, couple, week, ago, ..."


### Combine the Datasets back into dictionaries and save them

In [17]:
# Convert every pandas frame into a Hugging Face Dataset and pair the train/test
# splits of each text variant in a DatasetDict.
clean_train, clean_test = (
    Dataset.from_pandas(frame) for frame in (clean_train_df, clean_test_df)
)
cleaned_ds = DatasetDict({'train': clean_train, 'test': clean_test})

stem_train, stem_test = (
    Dataset.from_pandas(frame) for frame in (stem_train_df, stem_test_df)
)
stem_ds = DatasetDict({'train': stem_train, 'test': stem_test})

lemm_train, lemm_test = (
    Dataset.from_pandas(frame) for frame in (lemm_train_df, lemm_test_df)
)
lemm_ds = DatasetDict({'train': lemm_train, 'test': lemm_test})

# Write each variant to its own directory under ../data
# (paths are relative to the notebook's working directory).
cleaned_ds.save_to_disk("../data/clean_dataset")
stem_ds.save_to_disk("../data/stem_dataset")
lemm_ds.save_to_disk("../data/lemma_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/650000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/650000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/650000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]