Vilane MT 22329057

In [1]:
!wget -q -O ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o ml-100k.zip # 1) download and unzip MovieLens 100k

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test         


In [None]:
# 2) basic imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)


In [None]:
# 3) load the files (u.data, u.user, u.item)
ratings_cols = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_cols, engine='python')


# New Section

In [None]:
print("\n=== Ratings Dataset Preview ===")
print(ratings.head())
print(ratings.info())
print(ratings.columns)


=== Ratings Dataset Preview ===
   user_id  movie_id  rating  timestamp
0      196       242       3  881250949
1      186       302       3  891717742
2       22       377       1  878887116
3      244        51       2  880606923
4      166       346       1  886397596
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   movie_id   100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB
None
Index(['user_id', 'movie_id', 'rating', 'timestamp'], dtype='object')


In [None]:
users_cols = ['user_id','age','gender','occupation','zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=users_cols, engine='python')

In [None]:
print("\n=== Users Dataset Preview ===") #view
print(users.head())
print(users.info())
print(users.columns)



=== Users Dataset Preview ===
   user_id  age gender  occupation zip_code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     943 non-null    int64 
 1   age         943 non-null    int64 
 2   gender      943 non-null    object
 3   occupation  943 non-null    object
 4   zip_code    943 non-null    object
dtypes: int64(2), object(3)
memory usage: 37.0+ KB
None
Index(['user_id', 'age', 'gender', 'occupation', 'zip_code'], dtype='object')


In [None]:
# u.item has 5 metadata cols + 19 genre flags (pipe-separated). Provide genre names:
genres = ['unknown','Action','Adventure','Animation',"Children's",'Comedy','Crime','Documentary',
          'Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
item_cols = ['movie_id','title','release_date','video_release_date','IMDb_URL'] + genres
movies = pd.read_csv('ml-100k/u.item', sep='|', names=item_cols, encoding='latin-1', engine='python')


In [None]:
print("\n=== Movies Dataset Preview ===")
print(movies.head())
print(movies.info())
print(movies.columns)


=== Movies Dataset Preview ===
   movie_id              title release_date  video_release_date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDb_URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
4  http://us.imdb.com/M/title-exact?Copycat%20(1995)        0       0   

   Adventure  Animation  Children's  ...  Fantasy  Film-Noir  Horror  Musical  \
0          0          1           1  ...     

In [None]:
# 4) quick merge to create one flat table for modeling
df = ratings.merge(users, on='user_id', how='left').merge(movies, on='movie_id', how='left')

In [None]:
print("\n=== Merged Dataset Preview ===") #view
print(df.head())
print(df.info())
print(df.columns)



=== Merged Dataset Preview ===
   user_id  movie_id  rating  timestamp  age gender  occupation zip_code  \
0      196       242       3  881250949   49      M      writer    55105   
1      186       302       3  891717742   39      F   executive    00000   
2       22       377       1  878887116   25      M      writer    40206   
3      244        51       2  880606923   28      M  technician    80525   
4      166       346       1  886397596   47      M    educator    55113   

                        title release_date  ...  Fantasy Film-Noir  Horror  \
0                Kolya (1996)  24-Jan-1997  ...        0         0       0   
1    L.A. Confidential (1997)  01-Jan-1997  ...        0         1       0   
2         Heavyweights (1994)  01-Jan-1994  ...        0         0       0   
3  Legends of the Fall (1994)  01-Jan-1994  ...        0         0       0   
4         Jackie Brown (1997)  01-Jan-1997  ...        0         0       0   

   Musical  Mystery  Romance  Sci-Fi  Thri

In [None]:
# 5) create binary target: liked if rating >= 4
df['liked'] = (df['rating'] >= 4).astype(int)


In [None]:
# 6) feature engineering (simple, beginner-friendly features)
# a) user features: age (keep), gender -> numeric
df['gender_num'] = df['gender'].map({'M':1, 'F':0})

In [None]:
# b) movie year: extract year from title (e.g., "Toy Story (1995)")
import re
def extract_year(title):
    m = re.search(r'\((\d{4})\)', str(title))
    return int(m.group(1)) if m else np.nan

In [None]:
df['movie_year'] = df['title'].apply(extract_year)


In [None]:
# c) genre_count (how many genres the movie has)
df['genre_count'] = df[genres].sum(axis=1)


In [None]:
# d) movie and user average ratings (use group transforms so each row gets the avg)
df['movie_avg_rating'] = df.groupby('movie_id')['rating'].transform('mean')
df['user_avg_rating'] = df.groupby('user_id')['rating'].transform('mean')


In [None]:
# 7) select features and handle missing values
features = ['age','gender_num','movie_year','genre_count','movie_avg_rating','user_avg_rating']
X = df[features].copy()
y = df['liked']

In [None]:
# fill or impute missing movie_year with median (simple)
X['movie_year'] = X['movie_year'].fillna(X['movie_year'].median())
X = X.fillna(X.median())

In [None]:
# 8) train/test split (stratify to keep class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

In [None]:
# 9) Logistic Regression (scale numeric features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_scaled, y_train)

y_pred_lr = lr.predict(X_test_scaled)
y_proba_lr = lr.predict_proba(X_test_scaled)[:,1]


In [None]:
# 10) Random Forest (doesn't require scaling)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:,1]


In [None]:
# 11) evaluation helper
def print_metrics(name, y_true, y_pred, y_proba):
    print(f"=== {name} ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1:", f1_score(y_true, y_pred))
    print("ROC AUC:", roc_auc_score(y_true, y_proba))
    print()

print_metrics("Logistic Regression", y_test, y_pred_lr, y_proba_lr)
print_metrics("Random Forest", y_test, y_pred_rf, y_proba_rf)


=== Logistic Regression ===
Accuracy: 0.71375
Precision: 0.7207824364476725
Recall: 0.7885327313769752
F1: 0.7531369927989306
ROC AUC: 0.7821243242217881

=== Random Forest ===
Accuracy: 0.69635
Precision: 0.7163869181519295
Recall: 0.7476297968397291
F1: 0.7316749878496002
ROC AUC: 0.7615480850579509



In [None]:
# 12) feature importances (from Random Forest)
importances = pd.Series(rf.feature_importances_, index=features).sort_values(ascending=False)
print("Random Forest feature importances:\n", importances)

Random Forest feature importances:
 movie_avg_rating    0.359325
user_avg_rating     0.311004
age                 0.152410
movie_year          0.112531
genre_count         0.049325
gender_num          0.015404
dtype: float64
