In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
#import of review data
cols = ["user id",
        "item id",
        "rating",
        "timestamp"]
#encoding using ISO-8859-1 is used because utf-8 does not support all the characters in movie names
df_data = pd.read_csv("ml-100k/u.data",sep="\t",names=cols,header=None,encoding="ISO-8859-1")

In [3]:
#import of moviedata
cols = ["movie id",
        "movie title",
        "release date",
        "video release date",
        "IMDb URL",
        "unknown",
        "Action",
        "Adventure",
        "Animation",
        "Children's",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film-Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci-Fi",
        "Thriller",
        "War",
        "Western"]

df_movie = pd.read_csv("ml-100k/u.item",sep="|",names=cols,header=None,encoding="ISO-8859-1")

In [4]:
#import of user data
cols = ["user id",
        "age",
        "gender",
        "occupation",
        "zip code"]
df_user = pd.read_csv("ml-100k/u.user",sep="|",names=cols,header=None,encoding="ISO-8859-1")

In [5]:
#frequency binning the ages into age groups as it will be easier for future analysis
df_user['age_group'] = pd.qcut(df_user['age'],q=10,precision=0)

In [6]:
df = pd.merge(pd.merge(df_data,
                  df_user[["user id",
                           "age_group",
                           "gender",
                           "occupation"]],
                  on='user id',
                  how='left'),
              df_movie.drop(['IMDb URL'],axis=1),
              left_on = 'item id',
              right_on = 'movie id',
              how ='left')

In [7]:
def release_year(row):
    return str(row['release date'])[-4:]
    
def release_month(row):
    return str(row['release date'])[3:6]
def release_date(row):
    return str(row['release date'])[:2]

df['release_year'] =  df.apply(lambda row: release_year(row), axis=1)
df = df.query("release_year != 'nan'")
df['release_year'] = df['release_year'].astype(int)
df['release_month'] =  df.apply(lambda row: release_month(row), axis=1)
df['release_day'] =  df.apply(lambda row: release_date(row), axis=1)

df = df.drop(['release date'],axis=1)

In [8]:
df['release_year'] = pd.cut(df['release_year'],bins=20)

In [9]:
df.drop(['user id',
         'item id',
         'timestamp',
         'movie id',
         'movie title',
         'video release date',
         'release_day',
         'release_month'
#          'release_year'
        ],
       axis=1,
       inplace=True)

(99991, 24)

In [10]:

# df = df.query("release_month != ''")
# df = df.query("release_month != 'eb-'")
df = df.query("unknown != '1'")

(99990, 24)

In [11]:
df_dummies = pd.get_dummies(df['age_group'], prefix = 'age_group')
df = pd.concat([df, df_dummies], axis=1)
df_dummies = pd.get_dummies(df['gender'], prefix = 'gender')
df = pd.concat([df, df_dummies], axis=1)
df_dummies = pd.get_dummies(df['occupation'], prefix = 'occupation')
df = pd.concat([df, df_dummies], axis=1)
df_dummies = pd.get_dummies(df['release_year'], prefix = 'release_year')
df = pd.concat([df, df_dummies], axis=1)
# df_dummies = pd.get_dummies(df['release_month'], prefix = 'release_month')
# df = pd.concat([df, df_dummies], axis=1)

df.drop(['age_group',
         'gender',
         'occupation',
         'unknown',
         'release_year',
#          'release_month'
        ], 
        axis=1,
        inplace=True)

In [12]:
train, test = train_test_split(df, test_size=0.1)

y_train = train.rating
x_train = train.drop(['rating'],axis=1)
y_test = test.rating
x_test = test.drop(['rating'],axis=1)


In [13]:
lr = LogisticRegression(max_iter=1000)
lr.fit(x_train,y_train)
y_pred= lr.predict(x_test)

total = 0
correct = 0
for i in range(len(y_pred)):
    if y_pred[i]==y_test.iloc[i]:
        correct+=1
    total+=1
    
print('Accuracy: ',correct/total)

Accuracy:  0.3615361536153615


In [14]:
rfm = RandomForestClassifier(n_estimators=50, oob_score=70, n_jobs=-1,
                            random_state=101,max_features=None, min_samples_leaf=30)
rfm.fit(x_train,y_train)
y_pred=rfm.predict(x_test)

total = 0
correct = 0

for i in range(len(y_pred)):
    if y_pred[i]==y_test.iloc[i]:
        correct+=1
    total+=1
    
print('Accuracy: ',correct/total)

Accuracy:  0.3844384438443844


In [15]:
dtree = DecisionTreeClassifier(max_depth=100, random_state=101,
                              max_features=None, min_samples_leaf=200)
dtree.fit(x_train,y_train)
y_pred=dtree.predict(x_test)

total = 0
correct = 0

for i in range(len(y_pred)):
    if y_pred[i]==y_test.iloc[i]:
        correct+=1
    total+=1
    
print('Accuracy: ',correct/total)

Accuracy:  0.36973697369736974
