### Most Frequent Words System
Program that reads the dataset, preprocesses the data, and outputs the most frequent words in a subset of the dataset.

With the most frequent words we can build a description for a user to input to the recommendation system.

In [1]:
import json
from collections import defaultdict
import gzip
import pandas as pd
from lxml import html,etree
import numpy as np
import ipywidgets as widgets
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import os
from preprocess_data import pre_process_for_description

# Download the NLTK 'stopwords' package
# (nltk reports "already up-to-date" and does nothing more when it is installed)
nltk.download('stopwords')

# Download the NLTK 'punkt' package used for tokenization
# NOTE(review): neither package is used directly in this notebook; presumably
# pre_process_for_description relies on them — confirm in preprocess_data.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [445]:
### load the meta data
# Every line of the gzip archive holds one JSON document; parse them in file
# order and collect the resulting records in `data`.
data = []
with gzip.open('Dataset/meta_Software.json.gz') as archive:
    data.extend(json.loads(raw_line.strip()) for raw_line in archive)

# Alternative dataset (disabled):
# with gzip.open('Dataset/meta_CDs_and_Vinyl.json.gz') as f:
#     for l in f:
#         data.append(json.loads(l.strip()))

In [446]:
# pandas display settings: column width and number of rows shown
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_colwidth', 300)

# build the dataframe from the list of parsed records
df2 = pd.DataFrame.from_dict(data)

# list the column labels available in the dataset
print("Columns of the dataset: ", df2.columns)

# show dataframe with columns and rows


Columns of the dataset:  Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details'],
      dtype='object')


In [447]:
# Drop rows with no description: either an empty list, or a missing value
# (NaN/None), which has no length and would make a bare len() raise TypeError.
has_description = df2['description'].map(
    lambda d: hasattr(d, '__len__') and len(d) > 0
).astype(bool)
# .copy() makes df2 an independent frame, so the column assignments performed on
# it in the following cells do not trigger pandas' SettingWithCopyWarning.
df2 = df2[has_description].copy()


In [448]:

pd.set_option('display.max_rows', 20)


def merge_description_parts(parts):
    """Join the non-empty strings of one description into a single string.

    Parameters
    ----------
    parts : list of str, or str
        The fragments of a description. A value that is already a string is
        returned unchanged, so re-running this cell does not split an already
        merged description into space-separated characters.

    Returns
    -------
    str
        The non-empty fragments joined with a single space.
    """
    if isinstance(parts, str):
        return parts
    return " ".join(part for part in parts if part != "")


# each description is a list of strings: drop the empty strings and merge the
# remaining ones into a single string (one pass instead of two)
df2['description'] = df2['description'].apply(merge_description_parts)

df2.iloc[0].description


'<b>Latin rhythms that will get your kids singing in Spanish</b> <i>Sing, Watch, and Learn Spanish</i> helps your kids ages four through eight take a giant step in learning Spanish by combining two time-honored methods kids have always used to develop their language skills: Imitating other kids and singing along with simple melodies. This charming DVD contains 16 music videos featuring kids engaged in fun activities, from visiting animals at the zoo to comparing clothing sizes in grandmas closet. Each video features an original song of authentic Latin rhythms that gets kids singing along with the children on screen. As they watch, listen, and sing along, kids absorb 300 Spanish words, each of which is shouted out in a song and displayed as a subtitle on screen.'

In [450]:
def random_range(window_size, max_range):
    """Pick a random window of consecutive positions inside ``[0, max_range)``.

    Parameters
    ----------
    window_size : int
        Number of positions in the window; must satisfy
        ``1 <= window_size <= max_range``.
    max_range : int
        Number of available positions (valid positions are ``0 .. max_range - 1``).

    Returns
    -------
    tuple of (int, int)
        ``(a, b)`` where ``a`` is the first and ``b`` the last position of the
        window. Both bounds are inclusive, so ``b - a + 1 == window_size``.

    Raises
    ------
    ValueError
        If the window is empty or does not fit into ``max_range`` positions.
    """
    if window_size < 1 or window_size > max_range:
        raise ValueError(
            "window_size must be between 1 and max_range "
            "(got window_size=%r, max_range=%r)" % (window_size, max_range)
        )
    # randint excludes its upper bound, hence the +1: without it the last valid
    # start (max_range - window_size) could never be drawn and the call failed
    # outright when window_size == max_range.
    a = np.random.randint(0, max_range - window_size + 1)
    b = a + window_size - 1
    return a, b

print("Random range given the input numbers: ", random_range(10000, 23195))


Random range given the input numbers:  (11967, 21966)


In [451]:
# compute average length of description
# avr_for_row = df2.description.str.len()
# avg_len = avr_for_row.mean()
# print(avr_for_row)
# print("avg ", int(avg_len))
# print("max ", avr_for_row.max())
# print("min ",avr_for_row.min())
# null_string = df2.description.apply(lambda x: x == "")


In [452]:
# df3 = df2[:20]
df3 = df2
print("Number of descriptions in the dataset:", len(df3.description))

# apply pre-processing to all descriptions
# NOTE(review): pre_process_for_description presumably returns the list of
# tokens of one description — confirm in preprocess_data.
description_preprocessed = df3.description.apply(pre_process_for_description)
description_preprocessed = description_preprocessed.to_frame()

# pick a random window of 1000 descriptions; rnd holds the first and the last
# position of the window, both inclusive
rnd = random_range(1000, len(description_preprocessed.description))
print("Take the descriptions from the", rnd[0], "th description to the", rnd[1], "th description")

# rnd[1] is an inclusive end, so the positional slice has to stop at rnd[1] + 1;
# slicing up to rnd[1] silently dropped the last description of the window.
window = description_preprocessed.description.iloc[rnd[0]:rnd[1] + 1]
# keep only the descriptions that still contain something after pre-processing
words_array = [elem for elem in window if len(elem) > 0]

print("Array of transactions: ", words_array)
print("Array of transaction is stored in the file ./transactions.txt")
# write all transactions of the descriptions into a file; mode "w" truncates an
# existing file and the with-block closes (and flushes) the handle in all cases
with open("transactions.txt", "w") as transactions_file:
    transactions_file.write(str(words_array))
    transactions_file.write("\n")

Number of descriptions in the dataset: 23195
<built-in function len>
Take the descriptions from the 6665 th description to the 7664 th description
Array of transaction is stored in the file ./transactions.txt


1

In [453]:
# Report the case where the sampled window produced no usable transaction.
if not words_array:
    print("No transactions to be written")

# Show the collected transactions.
print(words_array)



In [454]:
# Encode the transactions as a table with one row per transaction and one
# column per item known to the encoder (te.columns_).
te = TransactionEncoder()
te.fit(words_array)
te_ary = te.transform(words_array)
dff = pd.DataFrame(data=te_ary, columns=te.columns_)
dff

Unnamed: 0,aaa,aac,aage,abandoned,abbreviations,abc,abcs,abilities,ability,able,...,zoo,zooka,zoology,zoom,zoomable,zoombini,zoombinis,zooming,zulu,zurg
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
994,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [457]:
# mine the itemsets whose support is at least min_support
frequent_items = apriori(dff, min_support=0.2, use_colnames=True)
# create column with length of itemsets
frequent_items['length'] = frequent_items['itemsets'].apply(len)
print(frequent_items)

# split the result into single words and itemsets of several words.
# No additional filter on frequent_items['support'] is applied here: the
# min_support=0.2 threshold was already passed to apriori above.
df_itemset = frequent_items[frequent_items['length'] == 1]
df_itemset_more = frequent_items[frequent_items['length'] > 1]

# flatten the one-word itemsets into a plain list of words
list_itemsets = [item for itemset in df_itemset.itemsets for item in itemset]
# turn the larger frozensets into ordinary sets (changes how they are printed)
list_itemsets_more = [set(itemset) for itemset in df_itemset_more.itemsets]

# create a string of the frequent items
string_itemsets = ", ".join(list_itemsets)
string_itemsets_more = ", ".join(str(x) for x in list_itemsets_more)

print(string_itemsets)
print(string_itemsets_more)

# write the frequent items into a file; mode "w" truncates an existing file and
# the with-block closes the handle even when a write fails
with open("frequent_items.txt", "w") as frequent_items_file:
    frequent_items_file.write(string_itemsets + " ")
    frequent_items_file.write(string_itemsets_more)


     support         itemsets  length
0   0.378758           (also)       1
1   0.281563             (cd)       1
2   0.218437       (complete)       1
3   0.350701         (create)       1
4   0.371743           (easy)       1
..       ...              ...     ...
21  0.215431            (web)       1
22  0.209419      (use, also)       2
23  0.241483      (use, easy)       2
24  0.202405  (new, features)       2
25  0.214429  (use, features)       2

[26 rows x 3 columns]
also, cd, complete, create, easy, even, features, get, help, includes, including, make, need, new, one, program, software, time, tools, use, using, web
{'use', 'also'}, {'use', 'easy'}, {'new', 'features'}, {'use', 'features'}
