In [33]:
import pandas as pd
import numpy as np
import math
import re
import matplotlib.pyplot as plt

In [None]:
import pandas as pd

# ✅ Load the movie catalogue from movie_title.csv: one row per movie (id, year, title).
# Only the first three comma-separated fields of each line are read, so anything after
# a third comma is ignored.
# NOTE(review): this presumably cuts titles that contain a comma -- confirm that is acceptable.
movies_raw_df = pd.read_csv(
    "movie_title.csv",
    delimiter=",",
    usecols=[0, 1, 2],
    names=["movie_id", "year", "title"],
    encoding="ISO-8859-1",
    dtype=str,  # keep every field as text so malformed ids can be filtered before casting
)

# ✅ Clean `movie_id`: strip whitespace, keep purely numeric ids, cast to int.
movie_ids = movies_raw_df["movie_id"].str.strip()
# `.str.isdigit()` yields a missing value for a missing id; `.eq(True)` maps that to
# False so the mask is strictly boolean and such rows are dropped instead of making
# the boolean indexing below raise.
has_numeric_id = movie_ids.str.isdigit().eq(True)
# Build a new frame in one chain rather than assigning into a filtered slice, which
# is what triggers pandas' SettingWithCopyWarning.
movies_df = (
    movies_raw_df.assign(movie_id=movie_ids)
    .loc[has_numeric_id]
    .astype({"movie_id": int})
)

# ✅ Parse combined_data_1.txt into [cust_id, rating, movie_id] records.
# A line ending in ":" is treated as a movie header; every following
# "cust_id,rating,date" line is attributed to that movie (the date is discarded).
ratings_list = []
current_movie = None  # id from the latest valid header; None until one is seen
with open("combined_data_1.txt", "r", encoding="ISO-8859-1") as ratings_file:
    for raw_line in ratings_file:
        record = raw_line.strip()

        if record.endswith(":"):
            header = record[:-1].strip()
            # A non-numeric header resets the current movie, so its ratings are skipped.
            current_movie = int(header) if header.isdigit() else None
            continue

        if current_movie is None:
            continue

        fields = record.split(",")
        if len(fields) != 3:
            continue

        try:
            ratings_list.append([int(fields[0]), int(fields[1]), current_movie])
        except ValueError:
            # Non-integer customer id or rating: ignore this line.
            pass

# ✅ Materialise the parsed records as a DataFrame
ratings_df = pd.DataFrame(
    ratings_list,
    columns=["cust_id", "rating", "movie_id"],
)

# ✅ Attach movie details to every rating. The left join keeps ratings whose movie_id
# has no catalogue entry; their year/title get placeholder text, and a combined
# "title (year)" label is added.
merged_df = (
    ratings_df.merge(movies_df, how="left", on="movie_id")
    .fillna({"year": "Unknown", "title": "Unknown Movie"})
    .assign(moviename_year=lambda frame: frame["title"] + " (" + frame["year"] + ")")
)

# ✅ Keep only the columns used downstream
final_df = merged_df.loc[:, ["cust_id", "rating", "moviename_year"]]

# ✅ Quick look at the result
print(final_df.head())

# ✅ Persist the merged data
final_df.to_csv("final_merged_data.csv", encoding="utf-8", index=False)


In [None]:

# Summary of final_df: column names, dtypes, non-null counts and memory usage.
final_df.info()

In [None]:

# Remove the rating column from final_df.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a second
# time raises KeyError because `rating` has already been dropped.
final_df = final_df.drop(columns=["rating"], errors="ignore")
final_df.info()

In [None]:
# Group by customer: one row per cust_id whose single column holds the list of
# moviename_year values belonging to that customer.
pt = final_df.pivot_table(
    index="cust_id",
    values=["moviename_year"],
    aggfunc={"moviename_year": list},
)

In [None]:
# Display the per-customer pivot table.
pt

In [None]:

# Plain Python list with one inner list of titles per row of `pt`.
ls_movies_percust = list(pt["moviename_year"])
# Preview only the first few baskets: a Python list repr is never truncated, so echoing
# the whole list would print every row's titles into the notebook output.
ls_movies_percust[:5]

In [None]:

# Number of baskets (inner lists) in ls_movies_percust.
len(ls_movies_percust)

In [None]:

# Size of the largest basket.
max(map(len, ls_movies_percust))

In [None]:
!pip install mlxtend

In [None]:

from mlxtend.preprocessing import TransactionEncoder

# Encode the baskets as a table: one row per basket, columns named after the items the
# encoder learned during fit.
trans_en = TransactionEncoder()
trans_en.fit(ls_movies_percust)
trans_array = trans_en.transform(ls_movies_percust)
df_eclat = pd.DataFrame(data=trans_array, columns=trans_en.columns_)

In [None]:

# Column dtypes of the encoded table (taken from its first five rows).
df_eclat.head(5).dtypes

In [None]:

# Show every item column for the first few rows. The display override is scoped to this
# one output: setting display.max_rows / display.max_columns to None globally makes every
# later cell render its frames and series in full.
with pd.option_context("display.max_columns", None):
    display(df_eclat.head())  # `display` is the built-in provided by IPython/Jupyter

In [None]:

# Sum down each column: the total for every item across all rows.
items_total = df_eclat.sum(axis="index")
items_total

In [None]:

# Sum across each row: the number of items in every transaction.
items_per_transaction = df_eclat.sum(axis="columns")
items_per_transaction

In [None]:
# Check the Python type of the per-row totals.
type(items_per_transaction)

In [None]:
from mlxtend.frequent_patterns import apriori
import pandas as pd

# Tunable parameters for the frequent-itemset search (adjust to the dataset / memory budget).
sample_fraction = 0.1     # share of rows kept
sample_seed = 42          # fixed seed so the sample is reproducible
min_item_occurrence = 10  # an item must occur in at least this many sampled rows
min_support = 0.2         # raising this reduces memory usage
max_itemset_len = 2       # caps itemset size to prevent an explosion in combinations

# Work on a random subset of the rows to reduce the dataset size.
df_eclat_sampled = df_eclat.sample(frac=sample_fraction, random_state=sample_seed)

# Drop items that occur too rarely in the sample. The comparison is inclusive so that
# an item with exactly `min_item_occurrence` occurrences is kept, as the name promises.
item_counts = df_eclat_sampled.sum()
df_eclat_sampled = df_eclat_sampled.loc[:, item_counts >= min_item_occurrence]

# Run Apriori with memory optimization
df_frequent_itemsets = apriori(
    df_eclat_sampled,
    min_support=min_support,
    use_colnames=True,
    low_memory=True,  # Enables memory-efficient processing
    max_len=max_itemset_len,
)

# Display results
print(df_frequent_itemsets.head())


In [None]:

# Display the full table of frequent itemsets returned by apriori.
df_frequent_itemsets