<a href="https://colab.research.google.com/github/ElenaSerbuValentina/Apriori_spark/blob/main/Apriori_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PYSPARK SETUP


In [4]:
# Download Java Virtual Machine (JVM)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download Spark
!wget -q https://dlcdn.apache.org/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz
# Unzip the file
!tar xf spark-3.4.0-bin-hadoop3.tgz

In [5]:
import os

# Tell PySpark/findspark where the Java runtime and the Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.4.0-bin-hadoop3",
})

In [6]:
# Install library for finding Spark
!pip install -q findspark
# Import the libary
import findspark
# Initiate findspark
findspark.init()
# Check the location for Spark
findspark.find()

'/content/spark-3.4.0-bin-hadoop3'

In [7]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Application-level configuration; only the application name is set here.
conf = SparkConf().setAppName('test_mba')

# Create (or reuse) a Hive-enabled session built from that configuration.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .config(conf=conf)
    .getOrCreate()
)

# The SparkContext is needed later for the RDD API (e.g. `sc.parallelize`).
sc = spark.sparkContext

# DATA IMPORT

In [None]:
from google.colab import files
import pandas as pd
import numpy as np
import os

In [None]:
# Kaggle API credentials for the `kaggle` command-line client.
# SECURITY: never commit real credentials to a notebook. Any key that was
# previously hardcoded in this cell must be treated as leaked and revoked in
# the Kaggle account settings.
# Values already exported in the environment are reused; otherwise they are
# requested interactively so they never end up in the saved file.
import os
from getpass import getpass

for _name in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
    if not os.environ.get(_name):
        os.environ[_name] = getpass(f"{_name}: ")

In [None]:
# Download the `xhlulu/medal-emnlp` Kaggle dataset into the working directory
# and extract the archive (`--unzip`). Uses the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables set in the previous cell.
!kaggle datasets download -d xhlulu/medal-emnlp --unzip

Downloading medal-emnlp.zip to /content
100% 6.82G/6.82G [01:28<00:00, 124MB/s]
100% 6.82G/6.82G [01:28<00:00, 82.7MB/s]


# IMPORTS

In [None]:
# Clone the project repository so that `Apriori_spark.utils.functions`
# (the Preprocess and apriori helpers imported below) is importable.
!git clone https://github.com/ElenaSerbuValentina/Apriori_spark.git

Cloning into 'Apriori_spark'...
remote: Enumerating objects: 46, done.[K
remote: Counting objects: 100% (46/46), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 46 (delta 16), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (46/46), 235.66 KiB | 7.36 MiB/s, done.
Resolving deltas: 100% (16/16), done.


In [8]:
import itertools
from itertools import combinations, product
from pyspark.sql import DataFrame
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

In [9]:
from Apriori_spark.utils.functions import Preprocess, apriori

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# DATA PREPARATION

In [None]:
#read data
# Load the downloaded CSV as a Spark DataFrame: the first row supplies the
# column names and the column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('/content/full_data.csv')
)

In [None]:
# Work on a small random sample of the rows; the fixed seed makes the sample
# reproducible.
# NOTE: `df` is rebound to its own sample, so re-running this cell without
# re-running the read cell above samples the already-sampled data again.
df = df.sample(fraction=0.0001,seed= 143)

In [None]:
# Keep only the first column of every Row, giving an RDD with one raw text
# string per record.
rdd = df.rdd.map(lambda row: row[0])

In [None]:
# Inspect the first record to check what the raw text looks like
rdd.take(1)

['nonoccluded virions of a nuclear polyhedrosis virus of the alfalfa looper autographa californica found in the medium of cell cultures of INF fall armyworm spodopter frugiperda and in the hemolymph of infected s frugiperda larvae were partially characterized by biological chemical and physical methods also the rate of appearance of the virions was studied in cell culture and the host insect to determine maximum virion production virions obtained from both sources were heatsensitive acidlabile and inactivated by several organic solvents the nonoccluded virions found in the insect cell culture fluid and in the hemolymph were identical and both were enveloped nucleocapsids visualization of the fragilely enveloped nucleocapsid was accomplished only after fixation with glutaraldehyde differences between the nonoccluded and occluded virions of nuclear polyhedrosis viruses are discussed']

In [None]:
# Total number of transactions (one per record in `rdd`); used later to turn a
# relative minimum support into an absolute count.
total_count = rdd.count()
total_count

14539

In [None]:
# Initial preprocessing stage: apply `Preprocess.preprocess` to every raw text
# record. The transformation is lazy; nothing runs until an action is called.
# NOTE(review): the exact cleaning steps live in Apriori_spark.utils.functions
# and are not visible here.
rdd_preprocessed = rdd.map(Preprocess.preprocess)

In [None]:
# Helper object that provides the encoder used for the Apriori input.
processor = Preprocess()

# Replace the items of every transaction with integer ids (cheaper to compare
# than strings) and keep `num2word`, the lookup used later to translate the
# ids of the frequent itemsets back into words.
rdd_encoded, num2word = processor.encoder(rdd_preprocessed)

# Configure the Apriori runner: minimum support, total number of transactions,
# maximum basket size, minimum confidence for the association rules, and the
# id -> word vocabulary used to decode the final frequent itemsets.
APRIORI = apriori(
    minSupportpercent=0.1,
    total_transactions=total_count,
    maximumBasketSize=3,
    min_confidence=0.5,
    vocab=num2word,
)

In [None]:
# From here on `rdd` refers to the encoded transactions. They are cached
# because each step below (singletons, pairs, triplets) scans them again.
rdd = rdd_encoded.cache()
rdd

PythonRDD[23] at RDD at PythonRDD.scala:53

# FIND SINGLETONS

In [None]:
# Absolute support threshold: 10% of all transactions.
minSupport = 0.1 * total_count

# Count every item over all transactions and keep those reaching the threshold.
# NOTE(review): this counts item occurrences; it equals the number of
# transactions containing the item only if a transaction never repeats an
# item - confirm that the encoder de-duplicates.
singletons = (
    rdd.flatMap(list)
    .map(lambda item: (item, 1))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda pair: pair[1] >= minSupport)
)

In [None]:
# Fetch up to 20 frequent items and translate their integer ids back to words.
words_singleton = [
    (num2word[item], count)
    for item, count in singletons.take(20)
]

In [None]:
# Display the decoded (word, count) pairs.
# NOTE(review): the saved output below lists more than the 20 entries that
# `take(20)` can return, so it appears to come from an earlier version of the
# previous cell - re-run to refresh.
words_singleton

[('treatment', 2349),
 ('also', 2975),
 ('used', 2453),
 ('high', 1812),
 ('using', 2763),
 ('study', 4466),
 ('c', 1761),
 ('t0', 1828),
 ('found', 2175),
 ('case', 1602),
 ('time', 1759),
 ('result', 3836),
 ('human', 1731),
 ('showed', 1852),
 ('two', 2546),
 ('mechanism', 1539),
 ('however', 2415),
 ('suggest', 1456),
 ('increased', 1977),
 ('observed', 1650),
 ('role', 1790),
 ('present', 2002),
 ('control', 1755),
 ('protein', 2033),
 ('increase', 1539),
 ('activity', 2012),
 ('associated', 1836),
 ('p', 1672),
 ('effect', 3210),
 ('significantly', 1682),
 ('disease', 1937),
 ('well', 1666),
 ('analysis', 2082),
 ('patient', 3569),
 ('group', 1593),
 ('response', 1654),
 ('method', 1530),
 ('data', 1934),
 ('system', 1687),
 ('factor', 1584),
 ('clinical', 1626),
 ('cell', 3346),
 ('compared', 1798),
 ('level', 2054),
 ('change', 1656),
 ('one', 2226),
 ('different', 1982),
 ('significant', 1790),
 ('may', 2828)]

# FIND FREQUENT PAIRS

In [None]:
# Ids of the frequent single items. A new name is used instead of rebinding
# `singletons`, so this cell can be re-run safely and the (item, count) RDD
# stays valid for the cells that decode and display it.
frequent_items = singletons.keys()

# Candidate pairs: every 2-combination of the frequent items.
candidates1 = list(combinations(frequent_items.toLocalIterator(), 2))

# Pre-build one frozenset per candidate so it is not recreated for every
# transaction during the scan below.
candidate_pair_sets = [(pair, frozenset(pair)) for pair in candidates1]

# Minimum number of transactions a pair must appear in to be kept.
MIN_PAIR_COUNT = 100


def _pairs_in_transaction(sentence):
    """Return ``(pair, 1)`` for every candidate pair fully contained in ``sentence``."""
    basket = set(sentence)  # built once per transaction, not once per candidate
    return [(pair, 1) for pair, pair_set in candidate_pair_sets if pair_set <= basket]


# Filtering phase: count how many transactions contain each candidate and keep
# only the pairs that are really frequent.
combined_2 = (
    rdd.flatMap(_pairs_in_transaction)
    .reduceByKey(lambda x, y: x + y)
    .filter(lambda item: item[1] >= MIN_PAIR_COUNT)
)

In [None]:
# Fetch up to 20 frequent pairs and translate their integer ids back to words.
# A bare int key is decoded directly; any other key is decoded element-wise
# into a tuple of words.
words_2 = [
    (tuple(num2word[element] for element in itemset), count)
    if type(itemset) != int
    else (num2word[itemset], count)
    for itemset, count in combined_2.take(20)
]

In [None]:
# Display the decoded ((word, word), count) entries
words_2

[(('also', 'effect'), 798),
 (('result', 'increased'), 691),
 (('however', 'may'), 648),
 (('study', 'analysis'), 783),
 (('effect', 'patient'), 595),
 (('study', 'well'), 596),
 (('using', 'result'), 956),
 (('effect', 'significantly'), 584),
 (('increased', 'cell'), 627),
 (('using', 'two'), 590),
 (('result', 'observed'), 605),
 (('study', 'may'), 1000),
 (('effect', 'different'), 534),
 (('cell', 'different'), 523),
 (('study', 'however'), 940),
 (('however', 'effect'), 665),
 (('study', 'activity'), 665),
 (('study', 'increased'), 733),
 (('observed', 'cell'), 543),
 (('study', 'significantly'), 735),
 (('effect', 'compared'), 573),
 (('level', 'may'), 510),
 (('found', 'may'), 530),
 (('study', 'effect'), 1248),
 (('result', 'activity'), 683),
 (('observed', 'effect'), 539),
 (('used', 'may'), 515),
 (('increased', 'increase'), 548),
 (('protein', 'cell'), 948),
 (('using', 'study'), 1054),
 (('case', 'patient'), 735),
 (('also', 'cell'), 979),
 (('increase', 'effect'), 612),
 ((

# FIND FREQUENT TRIPLETS

In [None]:
# Build the candidate itemsets of size k from the frequent pairs.
# NOTE(review): `getCombinations` is defined in Apriori_spark.utils.functions;
# the cell below iterates its result and treats each element as an iterable of
# item ids - confirm the exact return type there.
k=3
candidates = APRIORI.getCombinations(combined_2,k)

In [None]:
# A triplet is kept only when it appears in strictly more than this many
# transactions.
MIN_TRIPLET_COUNT = 10


def _triplets_in_transaction(sentence):
    """Return ``(candidate, 1)`` for every candidate fully contained in ``sentence``."""
    basket = set(sentence)  # built once per transaction, not once per candidate
    return [
        (tuple(candidate), 1)
        for candidate in candidates
        if set(candidate).issubset(basket)
    ]


# Count how many transactions contain each candidate triplet and keep the
# frequent ones.
combined_3 = (
    rdd.flatMap(_triplets_in_transaction)
    .reduceByKey(lambda x, y: x + y)
    .filter(lambda item: item[1] > MIN_TRIPLET_COUNT)
)

In [None]:
# Fetch up to 20 frequent triplets and translate their integer ids back to
# words. A bare int key is decoded directly; any other key is decoded
# element-wise into a tuple of words.
word_3 = [
    (tuple(num2word[element] for element in itemset), count)
    if type(itemset) != int
    else (num2word[itemset], count)
    for itemset, count in combined_3.take(20)
]

In [None]:
# Display the decoded ((word, word, word), count) entries
word_3

[(('change', 'using', 'analysis'), 80),
 (('change', 'mechanism', 'result'), 91),
 (('patient', 'disease', 'may'), 238),
 (('found', 'cell', 'disease'), 103),
 (('found', 'patient', 'clinical'), 111),
 (('human', 'cell', 'response'), 148),
 (('suggest', 'study', 'result'), 321),
 (('effect', 'control', 'however'), 116),
 (('c', 'increased', 'cell'), 121),
 (('change', 'increased', 'also'), 116),
 (('c', 'associated', 'study'), 74),
 (('also', 'clinical', 'treatment'), 113),
 (('analysis', 'study', 'however'), 187),
 (('two', 'using', 'effect'), 140),
 (('activity', 'study', 'however'), 155),
 (('two', 'change', 'also'), 94),
 (('one', 'study', 'data'), 152),
 (('activity', 'using', 'however'), 90),
 (('associated', 'activity', 'effect'), 88),
 (('analysis', 'showed', 'result'), 190),
 (('found', 'patient', 'result'), 153),
 (('also', 'increase', 'treatment'), 106),
 (('change', 'cell', 'showed'), 90),
 (('activity', 'patient', 'treatment'), 88),
 (('activity', 'cell', 'result'), 294),


# GENERALIZED APRIORI ALGORITHM

In [None]:
# Run the generalized Apriori implementation from the helper class on the
# cached, encoded transactions, passing an empty RDD as `support`.
# NOTE(review): presumably `support` is the accumulator the frequent itemsets
# are added to - confirm in Apriori_spark.utils.functions.
freq_rdd = APRIORI.Apriori(
    support=sc.parallelize([]),
    rdd=rdd,
)

---set up complete---
---singletones found!---
---first candidates found!---
starting 2 items in basket loop
added frequent baskets with 2 items
computing candidates for next iteration
found candidates for 3 items in basket
starting 3 items in basket loop
added frequent baskets with 3 items
computing candidates for next iteration
found candidates for 4 items in basket
starting 4 items in basket loop
added frequent baskets with 4 items


In [None]:
# Bring all frequent itemsets to the driver as a Python list
results = freq_rdd.collect()

In [None]:
# Translate the integer-encoded results back to words. A bare int key is a
# single item; any other key is decoded element-wise into a tuple of words.
decoded = [
    (tuple(num2word[element] for element in entry[0]), entry[1])
    if type(entry[0]) != int
    else (num2word[entry[0]], entry[1])
    for entry in results
]

In [None]:
# Display every decoded frequent itemset with its count (the full list, so the
# output can be long)
decoded

[('treatment', 2349),
 ('also', 2975),
 ('used', 2453),
 ('high', 1812),
 ('using', 2763),
 ('study', 4466),
 ('c', 1761),
 ('t0', 1828),
 ('found', 2175),
 ('case', 1602),
 ('time', 1759),
 ('result', 3836),
 ('human', 1731),
 ('showed', 1852),
 ('two', 2546),
 ('mechanism', 1539),
 ('however', 2415),
 ('suggest', 1456),
 ('increased', 1977),
 ('observed', 1650),
 ('role', 1790),
 ('present', 2002),
 ('control', 1755),
 ('protein', 2033),
 ('increase', 1539),
 ('activity', 2012),
 ('associated', 1836),
 ('p', 1672),
 ('effect', 3210),
 ('significantly', 1682),
 ('disease', 1937),
 ('well', 1666),
 ('analysis', 2082),
 ('patient', 3569),
 ('group', 1593),
 ('response', 1654),
 ('method', 1530),
 ('data', 1934),
 ('system', 1687),
 ('factor', 1584),
 ('clinical', 1626),
 ('cell', 3346),
 ('compared', 1798),
 ('level', 2054),
 ('change', 1656),
 ('one', 2226),
 ('different', 1982),
 ('significant', 1790),
 ('may', 2828),
 (('using', 'significantly'), 445),
 (('also', 'effect'), 798),
 ((

# ASSOCIATION RULES

In [None]:
# Generate association rules from the collected (still integer-encoded)
# frequent itemsets.
# NOTE(review): `to_decode=True` presumably makes the helper translate ids to
# words using the vocabulary given to the constructor - confirm in
# Apriori_spark.utils.functions.
rules_df = APRIORI.generate_association_rules(results, to_decode=True)

In [None]:
# Preview at most the first 20 rules
rules_df.head(20)

Unnamed: 0,antecedent,consequent,support,confidence,interest
0,"(human, protein, response)","(cell,)",0.00392,0.838235,0.608096
1,"(human, protein, role)","(cell,)",0.005434,0.759615,0.529476
2,"(associated, human, protein)","(cell,)",0.003508,0.75,0.51986
3,"(increased, present, using)","(study,)",0.003301,0.738462,0.431288
4,"(mechanism, protein, treatment)","(cell,)",0.003095,0.737705,0.507565
5,"(factor, human, protein)","(cell,)",0.004471,0.730337,0.500197
6,"(activity, human, increase)","(cell,)",0.003164,0.730159,0.500019
7,"(activity, human, response)","(cell,)",0.002889,0.724138,0.493998
8,"(human, protein, showed)","(cell,)",0.004058,0.719512,0.489373
9,"(factor, human, role)","(cell,)",0.003645,0.716216,0.486077


In [None]:
# Save the rules to a CSV file in the Colab working directory.
# NOTE(review): assuming `rules_df` is a pandas DataFrame, the default
# `to_csv` settings also write the row index as the first column.
rules_df.to_csv('/content/rules.csv')