# Preprocess

First, let's define a function that preprocesses sentences by stemming the words and removing stopwords.

In [1]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
# Download the 'punkt' and 'stopwords' NLTK packages (nltk reports them as
# up-to-date when they are already installed locally).
nltk.download('punkt')
nltk.download('stopwords')
# Stemmer instance shared by every call to preprocess().
porter_stemmer = PorterStemmer()
# English stopwords kept in a set so that membership tests are cheap.
en_stops = set(stopwords.words('english'))

# binary search over a sorted list, to make the vocabulary lookup efficient
def binary_search(e,l,inizio,fine):
  """Return True if ``e`` occurs in the sorted sequence ``l`` within [inizio, fine).

  Args:
    e: value to look for; must be comparable with the items of ``l``.
    l: sequence sorted in ascending order.
    inizio: index of the first position to inspect (inclusive).
    fine: index one past the last position to inspect (exclusive).

  Returns:
    True when ``e`` is found in ``l[inizio:fine]``, False otherwise. An empty
    range (including an empty ``l`` searched with ``0, 0``) yields False
    instead of raising IndexError.
  """
  from bisect import bisect_left

  # Nothing to search: covers the empty list and inverted bounds.
  if inizio>=fine:
    return False
  # bisect_left gives the leftmost insertion point for e inside the range;
  # e is present only if that slot is inside the range and already holds e.
  pos=bisect_left(l,e,inizio,fine)
  return pos<fine and l[pos]==e

# replace every character that is not a lowercase letter with a space
def prepreprocess(frase):
  """Return ``frase`` with every non-kept character replaced by a space.

  The kept characters are the lowercase letters a-z plus the accented vowels
  è, é, à, ò, ì, ù. Anything else (digits, punctuation, whitespace, uppercase
  letters) becomes a single space, so the output has the same length as the
  input.
  """
  kept=frozenset('abcdefghijklmnopqrstuvwxyzèéàòìù')
  return ''.join(ch if ch in kept else ' ' for ch in frase)

# preprocess: apply prepreprocess, then drop stopwords and unknown words and stem what remains
def preprocess(frase,en_words):
  """Clean a sentence and return its stemmed words as a quoted string.

  The sentence is lowercased, passed through prepreprocess() and split on
  single spaces. A token is kept only if it is longer than two characters,
  is not in ``en_stops`` and is found in ``en_words`` by binary_search().
  The kept tokens are stemmed with ``porter_stemmer``.

  Args:
    frase: the text to preprocess.
    en_words: sorted list of accepted words, searched with binary_search().

  Returns:
    ``repr()`` of the stems joined by single spaces, i.e. the joined text
    wrapped in quote characters.
  """
  vocabulary_size=len(en_words)
  tokens=prepreprocess(frase.lower()).split(' ')
  stems=[
    porter_stemmer.stem(token)
    for token in tokens
    if len(token)>2
    and token not in en_stops
    and binary_search(token,en_words,0,vocabulary_size)
  ]
  return repr(' '.join(stems))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Now let's load the data frame, which is saved as a CSV file.

In [5]:
import pandas as pd

# Load the reviews CSV into a pandas DataFrame.
df=pd.read_csv('../data/reviews_Video_Games_5.csv')

Now we can preprocess the whole data set using the previous function. The preprocessing is computed on the concatenation of the summary and the review text, and the result is stored in a new column ('text').

In [0]:
import csv

# en_words=sorted(open('/content/drive/My Drive/uni/ml/progetto/en_words.txt', 'r').read().lower().split('\n'))

# df['text']=df['summary']+' '+df['reviewText']
# df['text']=df['text'].apply(lambda x:preprocess(str(x.lower()),en_words))

# Sorted vocabulary of accepted words; preprocess() runs a binary search over
# it, so it has to stay sorted. The file is closed as soon as it has been read.
with open('../data/en_words.txt', 'r') as words_file:
  en_words=sorted(words_file.read().lower().split('\n'))

new_df=[]
# newline='' lets the csv module handle line endings itself, so quoted fields
# that contain line breaks are read back unchanged.
with open('../data/reviews_Video_Games_5.csv','rt',newline='') as csv_file:
  reader = csv.reader(csv_file)
  tags=next(reader)  # header row, reused below as the DataFrame column names
  for row in reader:
    # Columns 6 and 4 are concatenated and preprocessed (presumably summary and
    # reviewText -- confirm against the header); the result goes in column 9.
    # NOTE(review): this assumes every row already has a column at index 9.
    row[9]=preprocess(str(row[6]+' '+row[4]),en_words)
    new_df.append(row)

df=pd.DataFrame(new_df, columns=tags)

This only needs to be run the first time. The result of the previous code is a new data frame containing a new field that holds the result of the preprocessing. To avoid repeating it (the preprocessing is very slow), we save it to a CSV file from which the rest of the computation can start.

In [0]:
# Save the preprocessed frame without the index column.
# NOTE(review): this is the same path the reviews were read from, so the
# input file is overwritten.
df.to_csv('../data/reviews_Video_Games_5.csv', index=False)

Now we add a cleaning phase in which all reviews that did not receive a sufficient number of votes are discarded. If a review has received at least 10 votes, it is saved in a new data frame.

In [0]:
import csv

new_df=[]
threshold=10  # minimum number of votes a review needs in order to be kept

# The file is opened in a with-block so the handle is closed after reading;
# newline='' lets the csv module handle line endings inside quoted fields.
with open('../data/reviews_Video_Games_5.csv','rt',newline='') as csv_file:
  reader=csv.reader(csv_file)
  tags=next(reader)  # header row, reused below as the DataFrame column names
  for row in reader:
    # Column 3 is parsed as text: take the piece after the first comma and
    # drop its last character (presumably a "[helpful, total]" list whose
    # closing bracket is stripped -- confirm against the data).
    totVotes=int(row[3].split(',')[1][:-1])
    if totVotes>=threshold:
      new_df.append(row)

df=pd.DataFrame(new_df, columns=tags)
df.to_csv('../data/reviews_Video_Games_5-over10.csv', index=False)