# Create Data Set by TF-IDF model

We now need to create the vocabulary and start the counting process. We can use the CountVectorizer to create a vocabulary from all the text in our df['text'] followed by the counts of words in the vocabulary.

In [1]:
import pandas as pd

# Load the reviews CSV into a DataFrame; later cells read its 'text' column.
df=pd.read_csv('../data/reviews_Video_Games_5-over10.csv')

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def get_stop_words(stop_file_path):
  """Read a UTF-8 stop-word file and return its entries as a frozenset.

  Each line of the file contributes one entry, with surrounding
  whitespace (including the line terminator) stripped.
  """
  with open(stop_file_path, 'r', encoding="utf-8") as handle:
    entries = {line.strip() for line in handle}
  return frozenset(entries)

# Load the stop-word set from the stop-word file.
stopwords=get_stop_words('../data/stopwords.txt')

# Collect the review texts as a plain Python list of documents.
docs=df['text'].tolist()

# Build the vocabulary and the document-term count matrix.
# NOTE: no max_df is passed here, so no word is dropped because of its
# document frequency; only the stop words loaded above are excluded.
cv=CountVectorizer(stop_words=stopwords)
word_count_vector=cv.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


It's now time to compute the IDF values.

In [0]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# tfidf = TfidfVectorizer()
# features = tfidf.fit_transform(df['text'].tolist())

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer

# Configure the transformer with IDF weighting and IDF smoothing enabled.
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
# Fit on the word-count matrix produced by the CountVectorizer to compute the IDF values.
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

We are now ready to compute TF-IDF and then extract the top keywords from the TF-IDF vectors. First, let's split the reviews into a data set and a test set, so that top keywords can be extracted from the test documents.

In [0]:
import csv
import pandas as pd
import random

# Fixed seed so the data/test split is the same after every kernel restart.
SPLIT_SEED=42
# A row goes to the test set when a draw from 1..TEST_ONE_IN equals 1,
# i.e. roughly one row in forty is held out.
TEST_ONE_IN=40

# Dedicated generator: keeps the split reproducible without reseeding the
# global `random` module state.
split_rng=random.Random(SPLIT_SEED)

ds=[]
ts=[]

# The with-block guarantees the file handle is closed. newline='' is what the
# csv module asks for so that line breaks inside quoted fields are parsed
# correctly; utf-8 matches the default used by pd.read_csv on the same file.
with open('../data/reviews_Video_Games_5-over10.csv','rt',newline='',encoding='utf-8') as csv_file:
  reader = csv.reader(csv_file)
  # first row holds the column names
  tags=next(reader)
  for row in reader:
    if split_rng.randint(1,TEST_ONE_IN)>1:
      ds.append(row)
    else:
      ts.append(row)

dataset=pd.DataFrame(ds, columns=tags)
testset=pd.DataFrame(ts, columns=tags)

# get test docs into a list
docs_test=testset['text'].tolist()

The next step is to compute the tf-idf values for a given document in our test set, which generates a vector of tf-idf scores. Next, we sort the words in the vector in descending order of tf-idf value and then iterate over them to extract the top-n keywords. We are extracting keywords for the first document in our test set.

In [0]:
def sort_coo(coo_matrix):
  """Return (column index, value) pairs of a COO matrix, largest value first.

  Pairs with equal values are ordered by larger column index first.
  """
  pairs = list(zip(coo_matrix.col, coo_matrix.data))
  pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
  return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
  """Map the first `topn` items to {feature name: score rounded to 3 decimals}.

  feature_names -- sequence indexed by column position
  sorted_items  -- sequence of (column index, score) pairs, already ordered
  topn          -- how many leading pairs of sorted_items to keep
  """
  results = {}
  for col, weight in sorted_items[:topn]:
    rounded = round(weight, 3)
    results[feature_names[col]] = rounded
  return results

In [0]:
# dizionario={}
# for e in cv.get_feature_names():
#   dizionario[e]=0.0

Now we are ready to compute the tf-idf values for the words in each row. We create a list in which each element is a dictionary whose keys are words and whose values are the tf-idf values of those words. Each dictionary refers to a single review.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary terms indexed by column position. Fetched once: the fitted
# vocabulary does not change, so rebuilding this list for every document
# (as the loop used to do) is wasted work.
feature_names=cv.get_feature_names()

# One dictionary per review: {word: tf-idf score rounded to 3 decimals}.
dicts=[]
for i, doc in enumerate(df['text'].tolist()):
  # lightweight progress indicator
  if i%10000==0:
    print(i)
  #generate tf-idf for the given document
  tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))
  #sort the tf-idf vectors by descending order of scores
  sorted_items=sort_coo(tf_idf_vector.tocoo())
  #extract every scored term of the document (bound = number of scored terms,
  #rather than the character length of the text)
  keywords=extract_topn_from_vector(feature_names,sorted_items,len(sorted_items))
  # Every key of `keywords` comes from feature_names, so scanning the whole
  # vocabulary to copy the matching entries is unnecessary. Keys are stored
  # alphabetically, which matches the order a vocabulary scan yields
  # (assumes the vectorizer's feature list is sorted -- TODO confirm).
  d=dict(sorted(keywords.items()))
  dicts.append(d)

0
10000
20000
30000


In [8]:
# Vocabulary size: number of feature names held by the fitted CountVectorizer.
len(cv.get_feature_names())

25895

To avoid creating a data frame with one attribute per vocabulary word (25895 in the run shown above), we keep only the 100 words that appear in the largest number of reviews.

In [0]:
def min_list(l):
  """Return the smallest element of the sequence `l`.

  Raises IndexError when `l` is empty.
  """
  lowest=l[0]
  for position in range(1, len(l)):
    if l[position]<lowest:
      lowest=l[position]
  return lowest

def remove_value(d,v):
  """Delete, in place, the first entry of `d` whose value equals `v`.

  Only one entry is removed even if several share the value; `d` is left
  untouched when no value matches. Returns None.
  """
  for key, held in d.items():
    if held==v:
      d.pop(key)
      # stop immediately: the dict must not be iterated further after deletion
      return
      
# For every vocabulary word, count the number of per-review dictionaries
# in `dicts` that contain it.
keycount=dict.fromkeys(cv.get_feature_names(),0)
for d in dicts:
  for k in d:
    keycount[k]+=1

# Running selection of at most 100 words with the highest counts.
# minv tracks the smallest count currently held in the selection.
minv=0
top100={}
for k, count in keycount.items():
  if len(top100)>=100:
    if count<=minv:
      # not strictly better than the weakest selected word: skip it
      continue
    # make room by evicting one entry that holds the current minimum
    remove_value(top100,minv)
  top100[k]=count
  minv=min_list(list(top100.values()))

In [0]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# # tfidf = TfidfVectorizer()
# # features = tfidf.fit_transform(df['text'].tolist())

# # you only needs to do this once, this is a mapping of index to 
# feature_names=cv.get_feature_names()
 
# # get the document that we want to extract keywords from
# r=random.randint(0,len(ts))
# doc=docs_test[r]
 
# #generate tf-idf for the given document
# tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))
 
# #sort the tf-idf vectors by descending order of scores
# sorted_items=sort_coo(tf_idf_vector.tocoo())
 
# #extract only the top n; n here is 10
# keywords=extract_topn_from_vector(feature_names,sorted_items,len(doc))
 
# # now print the results
# print("\n=====Doc=====")
# print(r,len(doc.split(' ')),doc)
# print("\n===Keywords===")
# for k in keywords:
#   print(k,keywords[k])
# print(len(keywords))

We are now ready to create the data frame. Its first column is the review text; the second column is the utility, computed from the ratio between useful ratings and total ratings (1 if this ratio is greater than or equal to 0.7, 0 otherwise); the remaining columns hold the tf-idf value of each selected word in each review (0 if the word does not appear in the review text).

In [0]:
import ast
import csv
import pandas as pd

# Column positions read from each CSV row.
# NOTE(review): index 9 is presumably the 'text' column and index 3 a
# "[useful, total]" pair of rating counts -- confirm against the CSV header.
TEXT_COL=9
HELPFUL_COL=3
# A review is labelled 1 when its utility ratio reaches this value, else 0.
UTILITY_THRESHOLD=0.7

def utility(a,b):
  """Return the ratio a/b, or 0.0 when b is zero (avoids a division by zero)."""
  if b==0.0: return 0.0
  return a/b

dataframe=[]
# Output columns: review text, utility label, then one column per selected word.
tags=['text','utility']+list(top100.keys())

# The with-block guarantees the file handle is closed. newline='' is what the
# csv module asks for so that line breaks inside quoted fields are parsed
# correctly; utf-8 matches the default used by pd.read_csv on the same file.
with open('../data/reviews_Video_Games_5-over10.csv','rt',newline='',encoding='utf-8') as csv_file:
  reader = csv.reader(csv_file)
  next(reader)  # skip the header row
  for i, row in enumerate(reader):
    # ast.literal_eval parses Python literals only. It replaces eval(), which
    # would execute arbitrary expressions found in the data file. The field
    # is also parsed once per row instead of twice.
    votes=ast.literal_eval(row[HELPFUL_COL])
    u=utility(float(votes[0]),float(votes[1]))
    r=[row[TEXT_COL], 1 if u>=UTILITY_THRESHOLD else 0]
    # dicts[i] holds the tf-idf scores of row i; words absent from it get 0
    scores=dicts[i]
    r.extend(scores.get(word,0) for word in top100)
    dataframe.append(r)

This is only for the first time we create the data frame.

In [0]:
# Assemble the collected rows into a DataFrame whose columns are `tags`,
# then write it to CSV without the index column.
DF=pd.DataFrame(dataframe, columns=tags)
DF.to_csv('../data/reviews_Video_Games_5-over10-data_frame.csv', index=False)

In [6]:
# Alternative entry point: uncomment the two lines below to reload the saved
# CSV instead of using the DF built in the previous cell.
# import pandas as pd
# DF=pd.read_csv('../data/reviews_Video_Games_5-over10-data_frame.csv')
DF.head()

Unnamed: 0,text,utility,actual,anoth,back,bad,best,big,bit,buy,...,time,tri,turn,version,way,well,work,world,worth,year
0,'pay unlock content think instal game struggl ...,0,0.0,0.052,0.0,0.0,0.0,0.0,0.0,0.091,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,'awesom game crash frequent got version instea...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046,...,0.069,0.047,0.029,0.147,0.0,0.02,0.024,0.0,0.028,0.052
2,'step dirt terrif love play dirt thought graph...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.049,0.0,0.0,0.0,0.0
3,'fun pretti fun game buy car track onlin store...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138,...,0.052,0.0,0.0,0.0,0.064,0.0,0.0,0.0,0.0,0.0
4,'best graphic game far must gamer realli hit h...,1,0.0,0.0,0.0,0.0,0.105,0.0,0.0,0.148,...,0.0,0.0,0.0,0.0,0.0,0.044,0.0,0.0,0.0,0.0
