In [None]:
%matplotlib inline

In [None]:
# !pip uninstall transformers
!pip install transformers==3.5



In [None]:
import logging
import time
from platform import python_version
import random
from tqdm import tqdm
import re

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from sklearn.metrics import roc_auc_score
from torch.autograd import Variable

In [None]:
# One seed shared by every random number generator this notebook touches.
random_seed = 42

# Python's built-in generator and NumPy's global generator.
random.seed(random_seed)
np.random.seed(random_seed)

# PyTorch generators (default generator and the current CUDA device).
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

# cuDNN auto-tuning is switched off; the two stricter options below were left
# disabled by the author.
torch.backends.cudnn.benchmark = False
# torch.cuda.manual_seed_all(random_seed) # if use multi-GPU
# torch.backends.cudnn.deterministic = True

In [None]:
# Report the interpreter and library versions, one "name==version" line each,
# in the same order as before.
print(
    "\n".join(
        template % version
        for template, version in (
            ("python version==%s", python_version()),
            ("pandas==%s", pd.__version__),
            ("numpy==%s", np.__version__),
            ("torch==%s", torch.__version__),
            ("sklearn==%s", sklearn.__version__),
            ("transformers==%s", transformers.__version__),
            ("matplotlib==%s", matplotlib.__version__),
        )
    )
)

python version==3.6.9
pandas==1.1.5
numpy==1.19.5
torch==1.7.0+cu101
sklearn==0.22.2.post1
transformers==3.5.0
matplotlib==3.2.2


In [None]:
# Raise the "transformers.tokenization_utils" logger to ERROR so that its
# lower-severity records (warnings, info) are no longer emitted.
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

In [None]:
# Colab-only: mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the review dataset and show its (rows, columns) shape.
# The path is relative, so it depends on the current working directory
# (presumably /content, where the Drive mount lives — TODO confirm).
# df = pd.read_csv('drive/MyDrive/train.csv')
df = pd.read_csv('drive/MyDrive/reviews.csv')
df.shape

(358957, 6)

In [None]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,user_id,prod_id,rating,label,date,review
0,923,0,3.0,-1,2014-12-08,The food at snack is a selection of popular Gr...
1,924,0,3.0,-1,2013-05-16,This little place in Soho is wonderful. I had ...
2,925,0,4.0,-1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,926,0,4.0,-1,2011-07-28,This is a beautiful quaint little restaurant o...
4,927,0,4.0,-1,2010-11-01,Snack is great place for a casual sit down lu...


In [None]:
def make_id_dict(df):
    """Count how many rows each ``user_id`` and each ``prod_id`` appears in.

    The counts are taken from the column values, so the result does not depend
    on the DataFrame's index. (The earlier implementation looked rows up with
    ``df.user_id[idx]`` for ``idx in range(len(df))``, which is a *label*
    lookup and fails or miscounts on any frame without a default 0..n-1 index.)

    Args:
        df: DataFrame with ``user_id`` and ``prod_id`` columns.

    Returns:
        Tuple ``(user_id_dict, prod_id_dict)`` of plain dicts mapping each id
        to its number of occurrences. Rows whose id is missing (NaN) are not
        counted.
    """
    # value_counts does the whole tally in one vectorised pass; sort=False
    # skips the frequency sort that nobody needs for a lookup table.
    user_id_dict = df["user_id"].value_counts(sort=False).to_dict()
    prod_id_dict = df["prod_id"].value_counts(sort=False).to_dict()

    return user_id_dict, prod_id_dict

In [None]:
# Tally, over the full dataset, how many rows each user_id and each prod_id appears in.
user_id_dict, prod_id_dict = make_id_dict(df)

100%|██████████| 358957/358957 [00:13<00:00, 26099.72it/s]


In [None]:
# Per-row frequency features: for every review, how many rows share its
# user_id and how many share its prod_id.
# Series.map performs the dictionary lookup for the whole column at once and
# works on the column values, so it does not rely on the index being 0..n-1
# the way the previous `df.user_id[idx]` loop did.
user_id_count = df["user_id"].map(user_id_dict).tolist()
prod_id_count = df["prod_id"].map(prod_id_dict).tolist()

100%|██████████| 358957/358957 [00:07<00:00, 51204.27it/s]


In [None]:
# Attach the per-row id frequencies to the frame as two new columns.
df['user_id_count'] = user_id_count
df['prod_id_count'] = prod_id_count

In [None]:
# Check that the two count columns were added.
df.head()

Unnamed: 0,user_id,prod_id,rating,label,date,review,user_id_count,prod_id_count
0,923,0,3.0,-1,2014-12-08,The food at snack is a selection of popular Gr...,39,210
1,924,0,3.0,-1,2013-05-16,This little place in Soho is wonderful. I had ...,1,210
2,925,0,4.0,-1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...,2,210
3,926,0,4.0,-1,2011-07-28,This is a beautiful quaint little restaurant o...,1,210
4,927,0,4.0,-1,2010-11-01,Snack is great place for a casual sit down lu...,5,210


In [None]:
# Recode the labels to 0/1: a raw value of -1 becomes 0, anything else becomes 1.
new_label_lst = [0 if raw_label == -1 else 1 for raw_label in df.label]

In [None]:
# Overwrite the 'label' column with the recoded 0/1 values; assigning to an
# existing column keeps its position in the frame.
# The former `df.drop(['label'], axis=1)` line was removed: drop() returns a
# new frame and its result was discarded, so it changed nothing and only
# copied the whole DataFrame.
df['label'] = new_label_lst

In [None]:
# Confirm the label column now holds the recoded values.
df.head()

Unnamed: 0,user_id,prod_id,rating,label,date,review,user_id_count,prod_id_count
0,923,0,3.0,0,2014-12-08,The food at snack is a selection of popular Gr...,39,210
1,924,0,3.0,0,2013-05-16,This little place in Soho is wonderful. I had ...,1,210
2,925,0,4.0,0,2013-07-01,ordered lunch for 15 from Snack last Friday. ...,2,210
3,926,0,4.0,0,2011-07-28,This is a beautiful quaint little restaurant o...,1,210
4,927,0,4.0,0,2010-11-01,Snack is great place for a casual sit down lu...,5,210


In [None]:
# Shuffle all rows (frac=1 samples every row without replacement) and renumber
# the index from 0. No random_state is passed, so the order comes from NumPy's
# global generator.
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
# Carve fixed-size, non-overlapping positional slices out of the shuffled frame:
# rows 0-9,999 for training, the next 1,000 for validation, the next 2,000 for
# testing. Rows from position 13,000 onward are not part of this split.
df_train = df.iloc[:10000].reset_index(drop=True)
df_val = df.iloc[10000:11000].reset_index(drop=True)
df_test = df.iloc[11000:13000].reset_index(drop=True)

In [None]:
# Preview the training split.
df_train.head()

Unnamed: 0,user_id,prod_id,rating,label,date,review,user_id_count,prod_id_count
0,58479,251,4.0,1,2010-05-01,"Went on a Friday night at about 5:30 PM, it wa...",3,1834
1,39714,202,4.0,1,2014-10-26,"Nice ambiance, very nice staff and good food. ...",2,683
2,101723,523,5.0,1,2013-11-17,we went to the city for a long weekend and rea...,1,414
3,33349,100,5.0,1,2010-07-12,Foods tasted: Walnut french toast Blueberry pa...,2,2677
4,23432,305,5.0,1,2014-12-16,LOVE the new space. It wasn't too overly packe...,14,203


In [None]:
# Walk one training review through the text normalisation, printing it before
# and after.
sentence = df_train.review[0].lower()
print(sentence)
# Patterns are raw strings so that \d and \s reach the regex engine without
# Python's invalid-escape-sequence warning.
# 1) every run of digits becomes the upper-case placeholder NUMBER
#    (the text is already lower-cased, so the placeholder cannot collide).
sentence = re.sub(r'\d+', 'NUMBER', sentence)
# 2) drop everything except letters, digits and whitespace.
sentence = re.sub(r'[^\dA-Za-z\s]+', '', sentence)
# 3) placeholders glued together by step 2 (e.g. "5:30") collapse into one.
#    A group is used here; the old '[NUMBER]+' was a character class matching
#    any run of the letters N, U, M, B, E, R.
sentence = re.sub(r'(?:NUMBER)+', 'NUMBER', sentence)
print(sentence)

# print(sentence.split())

went on a friday night at about 5:30 pm, it was still early and plenty of seats were available.  its a very cool nondescript vibe in here, i liked it though.  for drinks i had the penicillin and the pomegranate sour, the former was tart and sweet. it was the better of the two.  husband had the pickle juice martini and that was really good. you really get like a nice mild pickle flavor as the drink finishes in your mouth. for food we just had a couple of appetizers: the hamachi with wasabi cream and soybeans and the pork buns.  both were delicious especially the pork buns.  i doused them with a liberal amount of sircacha and it did not make it incredibly spicy. it just added to the richness of the flavors.  very enjoyable cool place and i am looking forward to coming back here.
went on a friday night at about NUMBER pm it was still early and plenty of seats were available  its a very cool nondescript vibe in here i liked it though  for drinks i had the penicillin and the pomegranate sou

In [None]:
def make_word_dict(df):
    """Count word occurrences separately for label-1 rows and all other rows.

    Each review is lower-cased, digit runs are replaced by the token
    ``NUMBER``, every character that is not an ASCII letter or whitespace is
    removed, and the text is split on whitespace.

    Rows are read by value (not by index label), so the frame may carry any
    index. The earlier ``df.review[i]`` lookup over ``range(len(df))`` required
    a default 0..n-1 index.

    Args:
        df: DataFrame with a string ``review`` column and a ``label`` column.

    Returns:
        Tuple ``(real_review_dict, fake_review_dict)``: word -> count for rows
        whose label equals 1, and word -> count for every other row.
    """
    real_review_dict = dict()
    fake_review_dict = dict()

    for review, label in tqdm(zip(df["review"], df["label"]), total=len(df)):
        sentence = review.lower()
        # Raw-string patterns avoid Python's invalid-escape warning for \d, \s.
        sentence = re.sub(r'\d+', 'NUMBER', sentence)
        sentence = re.sub(r'[^A-Za-z\s]', '', sentence)
        # Collapse placeholders that the previous step glued together
        # ("5:30" -> "NUMBERNUMBER"). A group replaces the old '[NUMBER]+'
        # character class, which matched any run of the letters N,U,M,B,E,R.
        sentence = re.sub(r'(?:NUMBER)+', 'NUMBER', sentence)

        # Pick the destination once instead of duplicating the counting loop.
        counts = real_review_dict if label == 1 else fake_review_dict
        for word in sentence.split():
            counts[word] = counts.get(word, 0) + 1

    return real_review_dict, fake_review_dict

In [None]:
# Build the per-class word counts from the training split only.
real_review_dict, fake_review_dict = make_word_dict(df_train)

100%|██████████| 10000/10000 [00:01<00:00, 9236.81it/s]


In [None]:
def make_del_word_lst(threshold, fake_review_dict, real_review_dict):
    """List the words whose relative frequency is nearly equal in both classes.

    For every word present in both dictionaries, its count is divided by the
    total count of its dictionary. The smaller of the two shares divided by
    the larger gives a ratio in (0, 1]; the word is selected when that ratio
    is at least ``threshold``.

    Args:
        threshold: minimum smaller/larger share ratio for a word to be listed.
        fake_review_dict: word -> count mapping for one class.
        real_review_dict: word -> count mapping for the other class.

    Returns:
        List of selected words, in ``fake_review_dict`` iteration order.
    """
    total_real = sum(real_review_dict.values())
    total_fake = sum(fake_review_dict.values())

    def _share_ratio(word):
        # Smaller share over larger share, so the result never exceeds 1.
        real_share = real_review_dict[word] / total_real
        fake_share = fake_review_dict[word] / total_fake
        return min(real_share, fake_share) / max(real_share, fake_share)

    return [
        word
        for word in fake_review_dict
        if word in real_review_dict and _share_ratio(word) >= threshold
    ]

In [None]:
# Words whose share in the two classes differs by at most 10% (ratio >= 0.9)
# are collected for removal.
# All arguments are passed by keyword: the previous call placed positional
# arguments after `threshold=0.9`, which Python rejects as a SyntaxError.
del_word_lst = make_del_word_lst(
    threshold=0.9,
    fake_review_dict=fake_review_dict,
    real_review_dict=real_review_dict,
)

In [None]:
# Number of words selected for removal.
len(del_word_lst)

699

In [None]:
def make_review_lst(df, del_word_lst):
    """Normalise every review in ``df.review`` and remove the listed words.

    Each review is lower-cased, digit runs become the token ``<NUM>``, all
    other characters that are not ASCII letters or whitespace are removed, and
    words found in ``del_word_lst`` are dropped.

    Args:
        df: DataFrame with a string ``review`` column.
        del_word_lst: iterable of words to remove from the reviews.

    Returns:
        List of cleaned review strings, one per row and in row order. Every
        kept word is followed by a single space (so a non-empty result ends
        with a space, exactly as before).
    """
    # Hash lookup instead of rescanning the whole list for every single word.
    del_words = set(del_word_lst)

    new_review_lst = []
    for sentence in tqdm(df.review):
        sentence = sentence.lower()

        # Digits first become the upper-case sentinel NUM (the text is already
        # lower-cased, so it cannot collide with a real word). The sentinel
        # survives the punctuation strip, and runs glued together by that
        # strip ("5:30" -> "NUMNUM") are then collapsed into one '<NUM>'.
        # The old '[<NUM>]+' pattern was a character class, not a token match.
        sentence = re.sub(r'\d+', 'NUM', sentence)
        sentence = re.sub(r'[^A-Za-z\s]', '', sentence)
        sentence = re.sub(r'(?:NUM)+', '<NUM>', sentence)

        # NOTE(review): make_word_dict names the number token 'NUMBER', so a
        # 'NUMBER' entry in del_word_lst would never match '<NUM>' here —
        # confirm whether number tokens are meant to be removable.
        kept_words = [word for word in sentence.split() if word not in del_words]
        new_review_lst.append(''.join(word + ' ' for word in kept_words))

    return new_review_lst

In [None]:
# Clean the review text of each split with the same removal list, in
# train / validation / test order.
train_review_lst, val_review_lst, test_review_lst = (
    make_review_lst(split_df, del_word_lst)
    for split_df in (df_train, df_val, df_test)
)

100%|██████████| 10000/10000 [00:08<00:00, 1243.48it/s]
100%|██████████| 1000/1000 [00:00<00:00, 1273.03it/s]
100%|██████████| 2000/2000 [00:01<00:00, 1241.26it/s]


In [None]:
# Drop the columns considered unnecessary (the raw review text is replaced by
# its cleaned version afterwards).
df_train = df_train.drop(columns=['user_id', 'prod_id', 'date', 'review'])
df_val = df_val.drop(columns=['user_id', 'prod_id', 'date', 'review'])
df_test = df_test.drop(columns=['user_id', 'prod_id', 'date', 'review'])

In [None]:
# Add the cleaned review text back to each split as the 'review' column.
df_train['review'] = train_review_lst
df_val['review'] = val_review_lst
df_test['review'] = test_review_lst

In [None]:
# Preview the training split after cleaning.
df_train.head()

Unnamed: 0,rating,label,user_id_count,prod_id_count,review
0,4.0,1,3,1834,went on friday night <NUM> pm plenty seats wer...
1,4.0,1,2,683,nice ambiance very nice staff food place to br...
2,5.0,1,1,414,went to city long weekend read place on yelp j...
3,5.0,1,2,2677,tasted walnut french toast blueberry pancakes ...
4,5.0,1,14,203,love new wasnt too overly when came weekly gri...


In [None]:
# Show the full cleaned text of the first training review.
# df.comment_text[0]
df_train.review[0]

'went on friday night <NUM> pm plenty seats were its very cool nondescript vibe here liked though drinks penicillin pomegranate sour tart sweet husband pickle juice martini nice mild pickle flavor as drink finishes your mouth food just couple appetizers hamachi with wasabi cream soybeans pork buns were delicious especially pork buns doused with liberal sircacha did make spicy just added to richness flavors very enjoyable cool place am looking forward to coming back here '