In [5]:
# Core Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats
from scipy.stats import randint, uniform

# Sklearn
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split , RandomizedSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import sklearn.metrics as metrics
from sklearn.pipeline import Pipeline
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestRegressor  # For regression
from sklearn.ensemble import RandomForestClassifier  # For classification
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.neighbors import KNeighborsRegressor

pd.options.display.float_format = '{:.2f}'.format

## IMPORT & EXPLORE

In [6]:
import tensorflow_datasets as tfds

data_dir = tfds.core.constants.DATA_DIR
print("TFDS Data Directory:", data_dir)

TFDS Data Directory: C:\Users\User\tensorflow_datasets


In [26]:
import tensorflow_datasets as tfds

dataset_name = "movielens/1m-ratings"

# Reload from local storage (does not redownload)
movielens_data = tfds.load(dataset_name, split="train", as_supervised=False, data_dir="~/.tensorflow_datasets")

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\User\.tensorflow_datasets\movielens\1m-ratings\0.1.1...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling C:\Users\User\.tensorflow_datasets\movielens\1m-ratings\incomplete.GBIPFF_0.1.1\movielens-train.tfre…

[1mDataset movielens downloaded and prepared to C:\Users\User\.tensorflow_datasets\movielens\1m-ratings\0.1.1. Subsequent calls will reuse this data.[0m


In [28]:
data = [example for example in tfds.as_numpy(movielens_data)]

In [30]:
movielens_data = pd.DataFrame(data)

In [34]:
movielens_data.head(3)

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,movie_title,timestamp,user_gender,user_id,user_occupation_label,user_occupation_text,user_rating,user_zip_code
0,35.0,"[0, 7]",b'3107',b'Backdraft (1991)',977432193,True,b'130',18,b'technician/engineer',5.0,b'50021'
1,25.0,[7],b'2114',"b'Outsiders, The (1983)'",965932967,False,b'3829',0,b'academic/educator',4.0,b'22307'
2,18.0,"[4, 15]",b'256',b'Junior (1994)',1012103552,False,b'1265',21,b'writer',1.0,b'49321'


In [43]:

# Now,  let's split whole dataset into 3 parts 

movielens_user_data = movielens_data[['user_id','user_zip_code','user_gender',
                                      'bucketized_user_age','user_occupation_label','user_occupation_text']]

movielens_movies_data = movielens_data[['movie_id','movie_genres','movie_title']]


movielens_ratings_data = movielens_data[['movie_id','user_id','user_rating']]

## CLEAN & PREPARE

##### HELPFUL FUNCTIONS

In [None]:
# let's extract year from the movielens_movies_data

import re

# Function to extract year from movie title
def extract_year(title):
    match = re.search(r'\((\d{4})\)', title.decode('utf-8'))
    return int(match.group(1)) if match else None


movielens_movies_data['year'] = movielens_movies_data['movie_title'].apply(extract_year)


In [7]:
def fix_title_format(title):
    # Remove year from the title
    title = re.sub(r'\s*\(\d{4}\)$', '', title).strip()
    
    # Fix cases like "Spy Who Loved Me, The" → "The Spy Who Loved Me"
    match = re.match(r'^(.*),\s*(The|A|An)$', title)
    if match:
        title = f"{match.group(2)} {match.group(1)}"
    
    return title

In [9]:
def cleaner(df, 
            unneeded_columns=None, 
            date_columns=None, 
            date_format = None , 
            to_category_columns=None, 
            to_cat_code_columns = None , 
            outlier_columns=None):

    """ 
    This function get pandas DataFrame and attributes like 'date_columns' and 'to_category_columns' to manipulate ,
    clean and prepare dataset for usage.
    
    """
    
    # Convert specified columns to datetime
    if date_columns:
        for date_column in date_columns:
            if date_column in df.columns:
                df[date_column] = pd.to_datetime(df[date_column], format = date_format ,errors='coerce')

    # Convert specified columns to category
    if to_category_columns:
        for category_column in to_category_columns:
            if category_column in df.columns:
                df[category_column] = df[category_column].astype('category')
                
        if to_cat_code_columns:
            for column in to_cat_code_columns:
                df[f'{column}_code'] = df[column].cat.codes

    # Handle outliers using the IQR method
    if outlier_columns:
        for outlier_column in outlier_columns:
            if outlier_column in df.columns:
                iqr = df[outlier_column].quantile(0.75) - df[outlier_column].quantile(0.25)
                lower_bound = df[outlier_column].quantile(0.25) - (1.5 * iqr)
                upper_bound = df[outlier_column].quantile(0.75) + (1.5 * iqr)
                df = df[(df[outlier_column] >= lower_bound) & (df[outlier_column] <= upper_bound)]

     # Drop unnecessary columns
    if unneeded_columns:
        df = df.drop(columns=unneeded_columns)  # Use 'errors="ignore"' to avoid errors if columns don't exist.

    # Rename columns
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace(' ', '_')
    df.columns = df.columns.str.replace('(', '', regex=False)
    df.columns = df.columns.str.replace(')', '', regex=False)
    df.columns = df.columns.str.replace('-', '_', regex=False)
    df.columns = df.columns.str.replace('.', '', regex=False)
    df.columns = df.columns.str.replace('/', '', regex=False)
    df.columns = df.columns.str.replace('\\', '', regex=False)
    df.columns = df.columns.str.replace('%', '_percent', regex=False)
    
    return df

#### Merging with IMBD data

##### IMDB DATA IMPORT

In [11]:
# importing imdb data 

imdb_titles_data = pd.read_csv(r"C:\Users\User\Desktop\MovieLens Project\data\title.basics.tsv.gz",sep = '\t')


  imdb_titles_data = pd.read_csv(r"C:\Users\User\Desktop\MovieLens Project\data\title.basics.tsv.gz",sep = '\t')


In [13]:
imdb_ratings_data = pd.read_csv(r"C:\Users\User\Desktop\MovieLens Project\data\title.ratings.tsv.gz",sep = '\t')

##### PREPARE MovieLens DATA TO MERGE

In [None]:
# Extract the title without the year
movielens_movies_data['clean_movie_title'] = movielens_movies_data['movie_title'].apply(fix_title_format)


In [27]:
imdb_titles_data = imdb_titles_data[(imdb_titles_data['titleType'] == 'movie')] # keep only information about movies

In [29]:
imdb_titles_data['startYear'] = pd.to_numeric(imdb_titles_data['startYear'], errors='coerce')
imdb_titles_data['startYear'] = imdb_titles_data['startYear'].astype('Int64')  # Keeps NaNs as null values

In [142]:
# Let's make first merge with IMDB titles data 

movielens_movies_extended_data = pd.merge(movielens_movies_data,
                                          imdb_titles_data,
                                          right_on = ['primaryTitle','startYear'],
                                          left_on = ['clean_movie_title','year'])

In [148]:
# Now, let's do some basic cleaning 

movielens_movies_extended_data = cleaner( df = movielens_movies_extended_data, 
                                          unneeded_columns=['movie_genres','primaryTitle','endYear',
                                                            'titleType','originalTitle','startYear',
                                                            ], 
                                          date_columns=None, 
                                          date_format = None , 
                                          to_category_columns= ['genres'], 
                                          to_cat_code_columns = ['genres'] , 
                                          outlier_columns=None )

In [28]:
# We also do this basic cleaning with prepared function with IMDB ratings data

imdb_ratings_data = cleaner( df = imdb_ratings_data, 
                                  unneeded_columns=None, 
                                  date_columns=None, 
                                  date_format = None , 
                                  to_category_columns=None, 
                                  to_cat_code_columns =None, 
                                  outlier_columns=None )

In [165]:
# Let's convert float to int if possible to save memory and speed up merging

for column in movielens_movies_extended_data[['year','isadult','runtimeminutes']]:

    movielens_movies_extended_data[column] = pd.to_numeric(movielens_movies_extended_data[column], errors='coerce')
    movielens_movies_extended_data[column] = movielens_movies_extended_data[column].astype('Int64')  # Keeps NaNs as null values

In [36]:
# let's also add new extra features 

movielens_movies_extended_data = pd.merge(movielens_movies_extended_data,
                                          imdb_ratings_data)


In [23]:
movielens_user_data['user_zip_code'] = (
    movielens_user_data['user_zip_code']
    .astype(str)  # Convert bytes to string
    .str.extract(r"b'(\d+)'")  # Extract only numeric part
    .astype('float')  # Convert to numbers (or NaN if extraction fails)
)


In [25]:
movielens_user_data['bucketized_user_age'] = movielens_user_data['bucketized_user_age'].astype('int')

##### Combine data into general dataset

In [23]:
def clean_bytes(df):
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].astype(str).str.strip("b'")  # Remove "b'" from byte strings
    return df

# Apply function to all DataFrames
movielens_movies_extended_data = clean_bytes(movielens_movies_extended_data)
movielens_ratings_data = clean_bytes(movielens_ratings_data)
movielens_user_data = clean_bytes(movielens_user_data)

Note : In order to collect data which can be used in our task , we must keep in mind potential sizes of our final , combined dataset which can be 10 - 20 times larget than original user-ratings data. To decrease time and memory consuming , i will only use random subsamples without replacement of the original datasets.

In [29]:
movielens_movies_extended_data_40_percent_sample = movielens_movies_extended_data.sample(frac = 0.4,
                                                                                         replace = False,
                                                                                         random_state = 42)

In [31]:
movielens_ratings_data_40_percent_sample = movielens_ratings_data.sample(frac = 0.4,
                                                                         replace = False,
                                                                         random_state = 42)

In [33]:
movielens_user_data_40_percent_sample = movielens_user_data.sample(frac = 0.4,
                                                                   replace = False,
                                                                   random_state = 42)

In [35]:
movielens_user_movies_sample_interaction_data = pd.merge( movielens_ratings_data_40_percent_sample, 
                                                          movielens_movies_extended_data_40_percent_sample ).drop_duplicates()

In [37]:
movielens_user_movies_sample_interaction_data = pd.merge( movielens_user_movies_sample_interaction_data ,
                                                          movielens_user_data_40_percent_sample ).drop_duplicates()                                     

In [59]:
movielens_user_movies_sample_interaction_data.sample(5)

Unnamed: 0,movie_id,user_id,user_rating,movie_title,year,clean_movie_title,tconst,isadult,runtimeminutes,genres,genres_code,averagerating,numvotes,user_zip_code,user_gender,bucketized_user_age,user_occupation_label,user_occupation_text
50844381,514,2271,3.0,"Ref, The (1994)",1994,The Ref,tt0110955,0,97.0,"Comedy,Crime,Drama",144,6.9,27501,13210.0,True,50,14,sales/marketing
18818271,3113,1737,4.0,End of Days (1999),1999,End of Days,tt0146675,0,122.0,"Action,Fantasy,Horror",48,5.8,118794,46614.0,True,35,21,writer
26752435,1527,173,3.0,"Fifth Element, The (1997)",1997,The Fifth Element,tt0119116,0,126.0,"Action,Adventure,Sci-Fi",12,7.6,521774,45237.0,True,25,11,other/not specified
2229873,2026,5675,4.0,Disturbing Behavior (1998),1998,Disturbing Behavior,tt0134619,0,84.0,"Horror,Mystery,Sci-Fi",311,5.6,26119,30030.0,True,35,14,sales/marketing
46246957,225,999,4.0,Disclosure (1994),1994,Disclosure,tt0109635,0,128.0,"Drama,Thriller",290,6.2,55920,62558.0,True,25,15,scientist


Now , we have this combined user-item interaction like dataset , it an be used to build models