In [11]:
# association rules for imdb movie review data
import os

DATASET = "DATASET/IMDB Dataset.csv"

if not os.path.exists(DATASET):
    print("Dataset not found")
    print("Download the dataset from https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews and place it in the DATASET folder")
    exit(1)

import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

nltk.download('stopwords')
nltk.download('punkt')

data = pd.read_csv(DATASET)
data.head()

# print data info
print(data.info())



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ben\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ben\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None


In [12]:
# preprocessing

stop_words = set(stopwords.words('english'))

def clean_text(text):
    # to lower case
    text = text.lower()
    # remove anything that is not a letter
    text = ''.join([w for w in text if w.isalpha() or w == ' '])

    # remove stopwords
    word_tokens = word_tokenize(text)
    text = [w for w in word_tokens if not w in stop_words]

    # only take first 10 and last 10 words
    # the idea is to keep the most important words. The beginning and end of the review is usually the most important
    # and contain general information about the movie. Specific details are usually in the middle,
    # and will not be useful for our analysis

    # remove the words movie, film, etc
    text = [w for w in text if w not in ['movie', 'film', 'movies', 'films']]

    if len(text) > 20:
        text = text[:10] + text[-10:]

    return text

data['tokens'] = data['review'].apply(clean_text)

# print data info
print(data.info())

# print data
print(data.head())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
 2   tokens     50000 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB
None
                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                              tokens  
0  [one, reviewers, mentioned, watching, oz, epis...  
1  [wonderful, little, production, br, br, filmin...  
2  [thought, wonderful, way, spend, time, hot, su...  
3  [basically, theres, family, litt

In [13]:
review_tokens = data['tokens'].tolist()

# for testing, only use first 1000 reviews
review_tokens = review_tokens[:1000]

print(len(review_tokens))
print(review_tokens[0])

1000
['one', 'reviewers', 'mentioned', 'watching', 'oz', 'episode', 'youll', 'hooked', 'right', 'exactly', 'oz', 'may', 'become', 'comfortable', 'uncomfortable', 'viewingthats', 'get', 'touch', 'darker', 'side']


In [14]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

te = TransactionEncoder()
te_ary = te.fit(review_tokens).transform(review_tokens)
df = pd.DataFrame(te_ary, columns=te.columns_)
print(df.head())

   aaargh  aamir     ab  abbott  abhorrent  ability  abilitybr   able  \
0   False  False  False   False      False    False      False  False   
1   False  False  False   False      False    False      False  False   
2   False  False  False   False      False    False      False  False   
3   False  False  False   False      False    False      False  False   
4   False  False  False   False      False    False      False  False   

   abominable  abomination  ...   zidi  zingers  zodiac  zombie  \
0       False        False  ...  False    False   False   False   
1       False        False  ...  False    False   False   False   
2       False        False  ...  False    False   False   False   
3       False        False  ...  False    False   False    True   
4       False        False  ...  False    False   False   False   

   zombierelated  zombies  zoology     zp  zucker  zzzzzzzzzzzzzzzzzz  
0          False    False    False  False   False               False  
1          Fal

In [20]:
# apriori to find frequent itemsets
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
print(frequent_itemsets)

# association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.4)
print(rules)

     support             itemsets
0      0.023         (absolutely)
1      0.011             (across)
2      0.027             (acting)
3      0.022             (action)
4      0.018             (actors)
..       ...                  ...
499    0.010    (watch, watching)
500    0.010      (ever, one, br)
501    0.010    (ever, one, seen)
502    0.012   (ever, one, worst)
503    0.014  (ever, worst, seen)

[504 rows x 2 columns]
      antecedents consequents  antecedent support  consequent support  \
0       (believe)      (cant)               0.022               0.034   
1          (read)      (book)               0.027               0.032   
2          (else)        (br)               0.019               0.268   
3         (waste)      (dont)               0.028               0.078   
4          (ever)       (one)               0.059               0.200   
5          (ever)     (worst)               0.059               0.042   
6         (worst)      (ever)               0.042        

In [17]:
# fp growth
from mlxtend.frequent_patterns import fpgrowth

frequent_itemsets_fp = fpgrowth(df, min_support=0.01, use_colnames=True)
print(frequent_itemsets_fp)

rules_fp = association_rules(frequent_itemsets_fp, metric="confidence", min_threshold=0.4)
print(rules_fp)

     support        itemsets
0      0.200           (one)
1      0.061           (get)
2      0.053      (watching)
3      0.026           (may)
4      0.015         (right)
..       ...             ...
499    0.012    (funny, one)
500    0.011    (book, read)
501    0.010   (one, horror)
502    0.012      (br, else)
503    0.010  (havent, seen)

[504 rows x 2 columns]
      antecedents consequents  antecedent support  consequent support  \
0      (favorite)       (one)               0.019               0.200   
1          (must)       (see)               0.034               0.120   
2       (believe)      (cant)               0.022               0.034   
3           (ive)      (seen)               0.035               0.074   
4           (ive)       (one)               0.035               0.200   
5         (worst)       (one)               0.042               0.200   
6         (worst)      (seen)               0.042               0.074   
7          (ever)     (worst)               