In [None]:
import pandas as pd
import numpy as np
import wikipedia

In [None]:
# Evaluate the imported wikipedia package's version string so the cell displays it.
wikipedia.__version__

In [None]:
# The ratings file is read as tab-separated with no header row, so the
# column names are supplied explicitly.
data_cols = ['user id', 'movie id', 'rating', 'timestamp']
data = pd.read_csv(
    '../data/ml-100k/u.data',
    names=data_cols,
    header=None,
    sep='\t',
)
data.head()

In [None]:
# Column names for the pipe-separated movie file (read without a header row):
# descriptive fields first, then one indicator column per genre.
item_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
item = pd.read_csv(
    '../data/ml-100k/u.item',
    names=item_cols,
    header=None,
    sep='|',
    encoding='latin-1',
)
item.head()

In [None]:
# Genre indicator columns of `item`, in the same order as they appear in the
# movie file. Every named genre must be listed here: any genre left out would
# make movies that only belong to it fall through to 'unknown'.
genre_columns = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]


def get_genres(row):
    """Return the genres flagged on one movie row as a single string.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series for one row of ``item``)
        Must be indexable by every name in ``genre_columns``; a value equal
        to 1 means the movie belongs to that genre.

    Returns
    -------
    str
        The flagged genre names joined by ``', '`` in ``genre_columns``
        order, or ``'unknown'`` when no listed genre is flagged.
    """
    genres = [genre for genre in genre_columns if row[genre] == 1]
    return ', '.join(genres) if genres else 'unknown'

# Build the combined genre label for every movie, one row at a time.
item['Genres'] = item.apply(get_genres, axis='columns')
item.head()

In [None]:
# Keep only the movie identifier, its title and the combined genre label.
item = item.loc[:, ['movie id', 'movie title', 'Genres']]
item.head()

In [None]:
# Attach each movie's title and genres to its ratings, then order the rows
# by user and movie with a fresh 0..n-1 index.
df = (
    data.merge(item, on='movie id')
    .sort_values(['user id', 'movie id'])
    .reset_index(drop=True)
)
df.head()

In [None]:
# Mean rating within each (user id, movie id) group, broadcast back onto
# every row of that group.
# NOTE(review): if each user/movie pair occurs only once in the ratings, this
# mean is identical to `rating` - confirm whether a per-movie or per-user
# average was intended instead.
df['avg_rating'] = df.groupby(['user id', 'movie id'])['rating'].transform('mean')

# Rename with an explicit old -> new mapping rather than assigning a positional
# list to `df.columns`, so a change in column order can never mislabel a column.
# 'rating', 'timestamp' and 'avg_rating' already have their final names.
df = df.rename(columns={
    'user id': 'user_id',
    'movie id': 'movie_id',
    'movie title': 'movie_title',
    'Genres': 'genres',
})
df.head()

In [None]:
# Write the merged ratings table to disk without the row index.
processed_csv_path = '../data/processed_movie100k.csv'
df.to_csv(processed_csv_path, index=False)

### Add wiki summary

In [None]:
# Reload the processed ratings table. The path is relative, like the other
# data paths in this notebook, so the cell does not depend on one machine's
# directory layout.
movie100k = pd.read_csv('../data/processed_movie100k.csv')
print(movie100k.shape)
# Last expression of the cell, so the preview is actually rendered.
movie100k.head()

In [None]:
def _wikipedia_query_for(title):
    """Turn a dataset movie title into a Wikipedia search string.

    Apostrophes and trailing whitespace are removed. If the title contains a
    parenthesised part, ' film' is inserted before the LAST closing
    parenthesis only (e.g. 'Toy Story (1995)' -> 'Toy Story (1995 film)'),
    so any earlier parenthesised group is left untouched. Otherwise
    ' (film)' is appended.
    """
    query = title.replace("'", "").rstrip()
    if "(" in query and ")" in query:
        head, _, tail = query.rpartition(")")
        return f"{head} film){tail}"
    return query + " (film)"


# One record per movie title whose summary could be fetched.
movie_data = []

# np.unique yields each distinct title once, in sorted order.
for title in np.unique(movie100k['movie_title']):
    try:
        # Fetching the one-sentence summary for the formatted title
        summary = wikipedia.summary(_wikipedia_query_for(title), sentences=1)
        movie_data.append({'movie_title': title, 'wiki_summary': summary})
    except wikipedia.DisambiguationError as e:
        # Handle disambiguation: pick the first option that mentions 'film'
        # and contains the title text preceding its first " (".
        base_title = title.split(" (")[0]
        relevant_page = next(
            (option for option in e.options if "film" in option and base_title in option),
            None,
        )
        if relevant_page:
            try:
                summary = wikipedia.summary(relevant_page, sentences=1)
                movie_data.append({'movie_title': title, 'wiki_summary': summary})
            except Exception as ex:
                print(f"Failed to fetch summary for {title}: {ex}")
        else:
            print(f"No relevant Wikipedia page found for {title}")
    except Exception as ex:
        # Best effort: report the failure and continue with the next title.
        print(f"Failed to fetch summary for {title}: {ex}")

# Creating the DataFrame after collecting all data. Explicit columns keep the
# schema stable even when no summary could be fetched at all.
movie_wiki = pd.DataFrame(movie_data, columns=['movie_title', 'wiki_summary'])

In [None]:
# Preview the first rows of the collected title/summary table.
movie_wiki.head()

In [None]:
# Save the summaries next to the other data files. The path is relative so
# the notebook does not depend on one machine's directory layout.
movie_wiki.to_csv('../data/movie_wiki.csv', index=False)