In [None]:
import os
os.getcwd()

In [1]:
import re
import string
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

plt.style.use("fivethirtyeight")

## 1: Load data

In [3]:
df1 = pd.read_csv('../data/Reviews.csv')

df = pd.DataFrame({'text': df1.Text, 'score': df1.Score})
del df1
df

Unnamed: 0,text,score
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5
...,...,...
568449,Great for sesame chicken..this is a good if no...,5
568450,I'm disappointed with the flavor. The chocolat...,2
568451,"These stars are small, so you can give 10-15 o...",5
568452,These are the BEST treats for training and rew...,5


In [2]:
df2 = pd.read_csv('../data/own_dataset.csv')
df = pd.DataFrame({'text': df2.text, 'score': df2.stars})

del df2
df

Unnamed: 0,text,score
0,The description says &#34;Maybe it&#39;s the c...,1
1,This was terrible. I followed the recipe exac...,1
2,I&#39;m always amazed how people rate a recipe...,1
3,"didn&#39;t like it - too creamy, too rich - i ...",1
4,"Even using only 1 can of green chilies, it was...",1
...,...,...
1407,SO GOOD! I omitted the green chilies because w...,5
1408,I cut the calories and fat way down by using n...,5
1409,Wonderful recipe!!!! I used 8oz light cream ch...,5
1410,This was so good. The only thing I would do i...,5


## 2: Data Preprocessing

### 2.1: Text Cleaning

In [3]:
# 2.1.1 Lower Case
df.text = [t.lower() for t in tqdm(df.text)]

# 2.1.2 Remove Punctuation
df.text = [t.translate(str.maketrans('','',string.punctuation)) for t in tqdm(df.text)]

# 3.1.3: Remove Special Characters
df.text = [" ".join(e for e in t.split() if e.isalnum()) for t in tqdm(df.text)]

# 3.1.4: Remove Digits
df.text = [re.sub(r'\d+', '', t) for t in tqdm(df.text)]

df.text

100%|██████████| 1412/1412 [00:00<00:00, 1409078.57it/s]
100%|██████████| 1412/1412 [00:00<00:00, 282447.41it/s]
100%|██████████| 1412/1412 [00:00<00:00, 156904.42it/s]
100%|██████████| 1412/1412 [00:00<00:00, 235350.39it/s]


0       the description says maybe its the cream chees...
1       this was terrible i followed the recipe exactl...
2       im always amazed how people rate a recipe  and...
3       didnt like it too creamy too rich i didnt feel...
4       even using only  can of green chilies it was m...
                              ...                        
1407    so good i omitted the green chilies because we...
1408    i cut the calories and fat way down by using n...
1409    wonderful recipe i used oz light cream cheese ...
1410    this was so good the only thing i would do is ...
1411    quick and easy to make my family asks for it a...
Name: text, Length: 1412, dtype: object

### 3.2 Preprocessing Operations

#### 3.2.1 Tokenizing

In [4]:
from nltk.tokenize import word_tokenize
# nltk.download('punkt')

df.text = [word_tokenize(t) for t in tqdm(df.text)]

100%|██████████| 1412/1412 [00:00<00:00, 8161.88it/s]


#### 3.2.2 Remove StopWords

In [5]:
from nltk.corpus import stopwords
# nltk.download('stopwords')

sw = set(stopwords.words('english'))

def remove_stop(text, sw):
    return [i for i in text if not i in sw]

df.text = [remove_stop(t, sw) for t in tqdm(df.text)]

100%|██████████| 1412/1412 [00:00<00:00, 282353.15it/s]


#### 3.2.3 Lemmatization

In [6]:
from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
# nltk.download('omw-1.4')

word_lem = WordNetLemmatizer()

def lemmatize(text, lem):
    return [lem.lemmatize(token) for token in text]

df.text = [lemmatize(t, word_lem) for t in tqdm(df.text)]

100%|██████████| 1412/1412 [00:01<00:00, 1182.49it/s]


In [7]:
df.text = [' '.join(map(str, t)) for t in tqdm(df.text)]
df.text

100%|██████████| 1412/1412 [00:00<00:00, 470925.35it/s]


0       description say maybe cream cheese cream chees...
1       terrible followed recipe exactly tasted fine a...
2       im always amazed people rate recipe state chan...
3       didnt like creamy rich didnt feel good sour cr...
4                even using green chilies much hot u like
                              ...                        
1407    good omitted green chilies dont like lot heat ...
1408    cut calorie fat way using nonfat sour cream su...
1409    wonderful recipe used oz light cream cheese in...
1410    good thing would leave salt use lower sodium b...
1411    quick easy make family asks time dont add gree...
Name: text, Length: 1412, dtype: object

In [8]:
df.to_csv('../data/own_mod.csv')
df.to_parquet('../data/own_mod.parquet')