<a href="https://colab.research.google.com/github/Dushanthimadhushika3/FP-Growth-Algorithm/blob/main/FP_Growth_Algorithm(Association).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
!pip install pyspark



In [69]:
import pandas as pd
import glob

from pyspark.sql import SparkSession

# Import the FPGrowth API from PySpark
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SQLContext

# Start (or reuse) a single-node Spark session named "Colab"; the Spark UI is
# bound to port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [70]:
# Collect every CSV file in the dataset folder. glob returns paths in arbitrary
# order, so sort them to make the load order (and row order) reproducible.
csv = sorted(glob.glob('/content/drive/MyDrive/L4S1/Big Data/Data/Datasets'+'/*.csv'))
if not csv:
    # Fail here with a clear message instead of a NameError on `df` later on.
    raise FileNotFoundError(
        'No CSV files found in /content/drive/MyDrive/L4S1/Big Data/Data/Datasets'
    )

# Read the data from the dataset: one frame holding the rows of *all* files.
# (Assigning inside a loop would keep only the last file that was read.)
# NOTE: every matched file is held in memory at once.
df = pd.concat((pd.read_csv(f) for f in csv), ignore_index=True)

In [59]:
# Preview the first five rows of the loaded event log to check the columns.
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-12-01 00:00:00 UTC,remove_from_cart,5712790,1487580005268456287,,f.o.x,6.27,576802932,51d85cb0-897f-48d2-918b-ad63965c12dc
1,2019-12-01 00:00:00 UTC,view,5764655,1487580005411062629,,cnd,29.05,412120092,8adff31e-2051-4894-9758-224bfa8aec18
2,2019-12-01 00:00:02 UTC,cart,4958,1487580009471148064,,runail,1.19,494077766,c99a50e8-2fac-4c4d-89ec-41c05f114554
3,2019-12-01 00:00:05 UTC,view,5848413,1487580007675986893,,freedecor,0.79,348405118,722ffea5-73c0-4924-8e8f-371ff8031af4
4,2019-12-01 00:00:07 UTC,view,5824148,1487580005511725929,,,5.56,576005683,28172809-7e4a-45ce-bab0-5efa90117cd5


In [71]:
# Build the purchase table: keep only rows whose event_type is 'purchase',
# project them onto (user_session, brand) and drop rows where either is missing.
is_purchase = df['event_type'] == 'purchase'
sc = df.loc[is_purchase, ['user_session', 'brand']].dropna()
print(sc.shape)

# Persist the filtered purchases (row index included) for the grouping step.
sc.to_csv('/content/purchase.csv')

(121481, 2)


In [72]:
# Group the purchased brands by session.
#
# Results left in the namespace:
#   data       dict: session id -> comma-joined brands, in file order
#              (duplicates within a session are kept at this stage)
#   dataframe  two-column pandas frame built from `data`
#   session_id_col_index / product_id_col_index
#              positions of the two columns in the CSV header
session_id_col_name = 'user_session'
product_id_col_name = 'brand'

# Let pandas parse the file rather than splitting lines on ',' by hand, so
# quoted fields that themselves contain a comma stay in the right column.
# dtype=str + keep_default_na=False keep every value as the literal text found
# in the file (e.g. a brand spelled "NA" is not turned into a missing value).
purchases = pd.read_csv('/content/purchase.csv', dtype=str, keep_default_na=False)

missing_cols = [
    name for name in (session_id_col_name, product_id_col_name)
    if name not in purchases.columns
]
if missing_cols:
    # Stop immediately: continuing would only produce an empty result and a
    # confusing failure further down the notebook.
    raise ValueError(
        'Required columns do not exist in the given csv file: ' + ', '.join(missing_cols)
    )

session_id_col_index = purchases.columns.get_loc(session_id_col_name)
product_id_col_index = purchases.columns.get_loc(product_id_col_name)

# sort=False keeps sessions in order of first appearance in the file.
data = (
    purchases
    .groupby(session_id_col_name, sort=False)[product_id_col_name]
    .agg(','.join)
    .to_dict()
)

# Create the pandas DataFrame
dataframe = pd.DataFrame(list(data.items()),columns=[session_id_col_name,product_id_col_name])
print(dataframe.shape)


(25240, 2)


In [73]:
# Preprocess the data
# Turn each session's comma-joined brand string (second column of `dataframe`)
# into a list of distinct brands, paired with a running integer id.
# The loop is driven by the frame itself, so it works for any number of
# sessions instead of a hard-coded row count.
# dict.fromkeys drops repeated brands while keeping first-seen order.
records = [
    [row_id, list(dict.fromkeys(joined.split(",")))]
    for row_id, joined in enumerate(dataframe.iloc[:, 1])
]

# Quick sanity check: show the item list of the last session (empty if none).
print(records[-1][1] if records else [])
# Create the pandas DataFrame
out = pd.DataFrame(records, columns = ['id', 'items'])
out.head()

['ingarden']


Unnamed: 0,id,items
0,0,"[runail, irisk, jessnail]"
1,1,[nagaraku]
2,2,"[cnd, beautix]"
3,3,"[runail, philips, irisk, ingarden, masura, sev..."
4,4,"[runail, bluesky, jessnail, gehwol]"


In [74]:
# Convert the pandas dataframe into spark dataframe
# SQLContext expects a SparkContext as its first argument (not a SparkSession);
# the existing session is passed explicitly so the context wraps it.
# The conversion below only needs `spark`; `sqlContext` is kept for any code
# that still refers to it.
sqlContext = SQLContext(spark.sparkContext, spark)
sf = spark.createDataFrame(out,["id", "items"])
sf.show()

+---+--------------------+
| id|               items|
+---+--------------------+
|  0|[runail, irisk, j...|
|  1|          [nagaraku]|
|  2|      [cnd, beautix]|
|  3|[runail, philips,...|
|  4|[runail, bluesky,...|
|  5|             [f.o.x]|
|  6|[cnd, orly, kinet...|
|  7|     [runail, irisk]|
|  8|[milv, kapous, es...|
|  9|[runail, roubloff...|
| 10|       [beauty-free]|
| 11|[runail, bpw.styl...|
| 12|[metzger, yoko, r...|
| 13|[runail, milv, ma...|
| 14|    [ingarden, emil]|
| 15|[runail, pnb, iri...|
| 16|[runail, art-visa...|
| 17|[runail, staleks,...|
| 18|[grattol, zinger,...|
| 19|[severina, runail...|
+---+--------------------+
only showing top 20 rows



In [75]:
# Configure the FP-Growth estimator: baskets are read from the "items" column,
# and both the support and the confidence thresholds are set to 0.01.
fpGrowth = FPGrowth(
    itemsCol="items",
    minSupport=0.01,
    minConfidence=0.01,
)

# Train on the Spark dataframe of baskets to obtain the fitted model
model = fpGrowth.fit(sf)
# Display the frequently occurring itemsets together with their frequencies
model.freqItemsets.show()

+--------------------+----+
|               items|freq|
+--------------------+----+
|               [cnd]| 686|
|             [f.o.x]| 432|
|             [smart]| 333|
|              [rosi]| 370|
|             [domix]|1650|
|    [domix, grattol]| 278|
|   [domix, ingarden]| 266|
|      [domix, irisk]| 696|
|[domix, irisk, ru...| 427|
|     [domix, runail]| 782|
|              [pole]| 985|
|       [pole, irisk]| 377|
|[pole, irisk, run...| 256|
|      [pole, runail]| 476|
|             [dewal]| 307|
|          [nagaraku]| 636|
|   [nagaraku, irisk]| 284|
|              [milv]|1269|
|       [milv, irisk]| 463|
|[milv, irisk, run...| 295|
+--------------------+----+
only showing top 20 rows



In [76]:
# Display the association rules generated by the fitted model. Each row lists
# the antecedent and consequent itemsets with the rule's confidence, lift and
# support (show() prints the first 20 rows by default).
model.associationRules.show()

+-------------------+-----------+-------------------+------------------+--------------------+
|         antecedent| consequent|         confidence|              lift|             support|
+-------------------+-----------+-------------------+------------------+--------------------+
| [severina, runail]|    [irisk]|  0.569371727748691|2.3059920424224907|0.017234548335974643|
|             [pole]|    [irisk]|0.38274111675126904|1.5501260890247162|0.014936608557844691|
|             [pole]|   [runail]|  0.483248730964467|1.5261759221150084|0.018858954041204436|
|         [nagaraku]|    [irisk]|0.44654088050314467|1.8085192271982304|0.011251980982567354|
|      [milv, irisk]|   [runail]| 0.6371490280777538|2.0122174009862994|0.011687797147385102|
|           [zinger]|    [irisk]| 0.3663461538461538|1.4837254369507258|0.015095087163232964|
|           [zinger]|   [runail]| 0.4307692307692308|1.3604373604373605| 0.01774960380348653|
|[ingarden, grattol]|    [irisk]| 0.4722222222222222|1.91253

In [77]:
# Apply the mined rules back to every basket and show the first 20 results.
# Per the Spark FPGrowth documentation, `prediction` holds the consequents of
# all rules whose antecedent items are all present in the basket; an empty
# list presumably means no rule applied to that basket.
model.transform(sf).show(20)

+---+--------------------+--------------------+
| id|               items|          prediction|
+---+--------------------+--------------------+
|  0|[runail, irisk, j...|[domix, pole, mil...|
|  1|          [nagaraku]|             [irisk]|
|  2|      [cnd, beautix]|                  []|
|  3|[runail, philips,...|[domix, pole, lia...|
|  4|[runail, bluesky,...|[irisk, domix, po...|
|  5|             [f.o.x]|                  []|
|  6|[cnd, orly, kinet...|                  []|
|  7|     [runail, irisk]|[domix, pole, mil...|
|  8|[milv, kapous, es...|[runail, concept,...|
|  9|[runail, roubloff...|[irisk, bpw.style...|
| 10|       [beauty-free]|                  []|
| 11|[runail, bpw.styl...|[domix, pole, lia...|
| 12|[metzger, yoko, r...|[domix, pole, mil...|
| 13|[runail, milv, ma...|[domix, pole, lia...|
| 14|    [ingarden, emil]|[domix, masura, g...|
| 15|[runail, pnb, iri...|[domix, pole, mil...|
| 16|[runail, art-visa...|[domix, pole, mil...|
| 17|[runail, staleks,...|[domix, pole, 