# **Preprocessing**

In [None]:
import pandas as pd
from google.colab import files

# Load the dataset
df = pd.read_csv('ratings.csv')

# Remove duplicates
df.drop_duplicates(keep='first', inplace=True)

# Handle null values
df.dropna(inplace=True)

# Remove timestamp column if it exists
if 'timestamp' in df.columns:
    df.drop(columns=['timestamp'], inplace=True)

# Ensure ratings are in integer form
if 'rating' in df.columns:
    df['rating'] = df['rating'].astype(int)

# Encode categorical variables if needed
if 'product_category' in df.columns:
    df = pd.get_dummies(df, columns=['product_category'], drop_first=True)

# Save the preprocessed data to a new CSV file
df.to_csv('preprocessed_amazon_movie_ratings.csv', index=False)

# Zip and download the file
!zip preprocessed_data.zip preprocessed_amazon_movie_ratings.csv
files.download('preprocessed_data.zip')


  adding: preprocessed_amazon_movie_ratings.csv (deflated 71%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **Creating the user-item matrix**

In [None]:
import pandas as pd
import numpy as np

# Build a small demonstration user-item matrix (users x movies) from the raw
# ratings file and print a random sub-matrix with a few cells blanked out.

SEED = 42          # fixed seed so the sampled matrix is the same on every run
SUBSET_SIZE = 8    # side length of the displayed sub-matrix
rng = np.random.default_rng(SEED)

# Load the data
data = pd.read_csv('ratings.csv')

data = (
    data
    # A user-item matrix has one cell per (user, movie) pair, so that pair is
    # the duplicate key. Deduplicating on (userId, rating) would keep only one
    # movie per distinct score a user has given and discard the rest.
    .drop_duplicates(subset=['userId', 'movieId'])
    # A row without a rating carries no information; dropping it avoids
    # inventing a 0-star rating.
    .dropna(subset=['rating'])
    .assign(
        # NOTE(review): astype(int) truncates fractional ratings (2.5 -> 2,
        # 0.5 -> 0) -- confirm that truncation, not rounding, is intended.
        rating=lambda frame: frame['rating'].astype(int),
        # Convert 'timestamp' from UNIX seconds to datetime.
        timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'),
    )
)

# Create the user-item matrix; unrated (user, movie) pairs are NaN here.
user_item_matrix = data.pivot_table(index='userId', columns='movieId', values='rating')

# For demonstration only: replace every NaN with a random integer in 1..5.
# A full random frame combined with `where` is vectorised and avoids the
# deprecated per-cell DataFrame.applymap.
random_fill = pd.DataFrame(
    rng.integers(1, 6, size=user_item_matrix.shape),
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)
user_item_matrix = user_item_matrix.where(user_item_matrix.notna(), random_fill)

# Select a random square subset, clamped so a small dataset does not raise.
n_users = min(SUBSET_SIZE, user_item_matrix.shape[0])
n_movies = min(SUBSET_SIZE, user_item_matrix.shape[1])
random_user_ids = rng.choice(user_item_matrix.index, n_users, replace=False)
random_movie_ids = rng.choice(user_item_matrix.columns, n_movies, replace=False)
# .copy() makes the subset an independent frame that is safe to modify.
user_item_matrix_subset = user_item_matrix.loc[random_user_ids, random_movie_ids].copy()

# Blank out 5 or 6 distinct cells. Sampling flat positions without replacement
# guarantees the requested number of NaNs (independent draws could collide).
num_nan_values = min(int(rng.choice([5, 6])), user_item_matrix_subset.size)
flat_positions = rng.choice(user_item_matrix_subset.size, size=num_nan_values, replace=False)
nan_indices = [
    (
        user_item_matrix_subset.index[int(pos) // n_movies],
        user_item_matrix_subset.columns[int(pos) % n_movies],
    )
    for pos in flat_positions
]
for row, col in nan_indices:
    user_item_matrix_subset.at[row, col] = np.nan

# Display the subset with the title
print("8x8 User-Item Matrix:")
print(user_item_matrix_subset)

  user_item_matrix = user_item_matrix.applymap(lambda x: np.random.randint(1, 6) if pd.isna(x) else x)


8x8 User-Item Matrix:
movieId  2273  1241  2533  1717  230   4069  2066  728 
userId                                                 
304       2.0   NaN   1.0   NaN   1.0   4.0   1.0   1.0
388       5.0   3.0   1.0   2.0   NaN   4.0   2.0   2.0
220       1.0   4.0   1.0   NaN   4.0   4.0   1.0   3.0
289       3.0   4.0   4.0   0.0   2.0   5.0   3.0   4.0
628       4.0   5.0   2.0   2.0   4.0   4.0   4.0   3.0
158       3.0   2.0   3.0   3.0   2.0   2.0   1.0   5.0
397       4.0   2.0   1.0   4.0   1.0   1.0   2.0   NaN
37        2.0   3.0   4.0   5.0   4.0   2.0   2.0   2.0


# **Computing the average rating**

In [None]:
import pandas as pd

# Read the ratings file into a DataFrame.
ratings_df = pd.read_csv('ratings.csv')  # adjust the path if the file lives elsewhere

# Show the first rows as a quick check that the file loaded correctly.
preview = ratings_df.head()
print(preview)

# Mean of the 'rating' column (the column name must match the dataset).
rating_column = ratings_df['rating']
average_rating = rating_column.mean()

# Report the result rounded to two decimal places.
print(f"Average Rating: {average_rating:.2f}")



   userId  movieId  rating   timestamp
0       1       31     2.5  1260759144
1       1     1029     3.0  1260759179
2       1     1061     3.0  1260759182
3       1     1129     2.0  1260759185
4       1     1172     4.0  1260759205
Average Rating: 3.54
