In [133]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import string
from fim import fpgrowth, apriori
import spacy
from spacy.lang.en import stop_words
nlp = spacy.load('en_core_web_sm')

In [134]:
data = {
    "id": [1, 2, 3, 4, 5],
    "review": [
        "The movie was fantastic! The plot was engaging and the acting was top-notch.",
        "I hated the movie. The plot was boring and predictable, and the acting was terrible.",
        "What a great experience! The cinematography was stunning and the direction was superb.",
        "The film was a waste of time. It lacked depth and the characters were uninteresting.",
        "Absolutely loved it! Brilliant performances and a powerful story."
    ]
}

df = pd.DataFrame(data)
# df = pd.read_csv('YoutubeCommentsDataSet.csv')
# df.drop(columns=['Sentiment'], inplace=True)
# df.columns = ['review']
df.head()

Unnamed: 0,id,review
0,1,The movie was fantastic! The plot was engaging...
1,2,I hated the movie. The plot was boring and pre...
2,3,What a great experience! The cinematography wa...
3,4,The film was a waste of time. It lacked depth ...
4,5,Absolutely loved it! Brilliant performances an...


In [135]:
# Lista simple de stopwords
# basic_stopwords = {
#     "the", "was", "and", "a", "of", "to", "it", "i", "is", "in"
# }

stop_words = stop_words.STOP_WORDS
punct = set(string.punctuation)

In [136]:
def preprocess(text):
    tokens = text.lower().translate(str.maketrans('', '', string.punctuation)).split()
    return list(set([w for w in tokens if w not in stop_words and w.isalpha()]))

df["transaction"] = df["review"].apply(preprocess)
transactions = df["transaction"].tolist()
for i, t in enumerate(transactions):
    print(f"{i+1}: {t}")

1: ['engaging', 'topnotch', 'movie', 'plot', 'fantastic', 'acting']
2: ['predictable', 'terrible', 'boring', 'movie', 'plot', 'acting', 'hated']
3: ['direction', 'superb', 'great', 'stunning', 'cinematography', 'experience']
4: ['characters', 'time', 'waste', 'film', 'uninteresting', 'depth', 'lacked']
5: ['story', 'performances', 'loved', 'powerful', 'brilliant', 'absolutely']


In [137]:
# Itemsets frecuentes con soporte mínimo de 2
r = fpgrowth(transactions, supp=10)
df = pd.DataFrame(r)
df.columns = ['Itemset', 'Support']

df.sort_values(by='Support', ascending=False, inplace=True)
df

Unnamed: 0,Itemset,Support
0,"(movie,)",2
1,"(plot, movie)",2
2,"(plot,)",2
3,"(acting, movie, plot)",2
4,"(acting, movie)",2
...,...,...
431,"(absolutely, loved)",1
432,"(absolutely, story, performances)",1
433,"(absolutely, story)",1
434,"(absolutely, performances)",1


In [138]:
r = fpgrowth(transactions, target='r', supp=20, report='aSc')
df = pd.DataFrame(r)
df.columns = ['Consecuente', 'Antecedente', 'Freq', 'Freq(%)', 'Conf']
df.sort_values(by='Conf', ascending=False, inplace=True)
df

Unnamed: 0,Consecuente,Antecedente,Freq,Freq(%),Conf
0,movie,"(plot,)",2,40.0,1.0
1,plot,"(movie,)",2,40.0,1.0
2,movie,"(acting,)",2,40.0,1.0
3,acting,"(movie,)",2,40.0,1.0
4,movie,"(acting, plot)",2,40.0,1.0
...,...,...,...,...,...
1377,powerful,"(absolutely, brilliant)",1,20.0,1.0
1378,brilliant,"(absolutely, powerful)",1,20.0,1.0
1379,absolutely,"(brilliant, powerful)",1,20.0,1.0
1380,brilliant,"(absolutely,)",1,20.0,1.0


In [139]:
from ortools.linear_solver import pywraplp

In [140]:
# Convertir las reglas a estructura manejable
rules = fpgrowth(transactions, target='r', supp=20, report='aSc')
rule_data = [
    {"ante": r[0], "cons": r[1], "support": r[2], "conf": r[3]}
    for r in rules if r[3] >= 0.5  # opcional: filtrar confianza mínima
]

solver = pywraplp.Solver.CreateSolver("SCIP")
x = [solver.BoolVar(f"x_{i}") for i in range(len(rule_data))]

# Maximizar la suma de la confianza de las reglas seleccionadas
solver.Maximize(solver.Sum(rule_data[i]["conf"] * x[i] for i in range(len(rule_data))))

# Restringimos a seleccionar máximo 5 reglas
solver.Add(solver.Sum(x) <= 5)

# Resolver
status = solver.Solve()

# Mostrar reglas seleccionadas
if status == pywraplp.Solver.OPTIMAL:
    print("=== Reglas seleccionadas ===")
    for i in range(len(rule_data)):
        if x[i].solution_value() > 0:
            r = rule_data[i]
            print(f"{r['ante']} => {r['cons']} | Soporte: {r['support']}%, Confianza: {r['conf']}")
else:
    print("No se encontró solución óptima.")

=== Reglas seleccionadas ===
movie => ('plot',) | Soporte: 2%, Confianza: 40.0
plot => ('movie',) | Soporte: 2%, Confianza: 40.0
movie => ('acting',) | Soporte: 2%, Confianza: 40.0
acting => ('movie',) | Soporte: 2%, Confianza: 40.0
movie => ('acting', 'plot') | Soporte: 2%, Confianza: 40.0
