Setup

In [89]:
from google.colab import drive
import pandas as pd

'''
Test data recorded from Matt's private BeefBeef server

Description of training data from University of Minnesota:
  These files contain 1,000,209 anonymous ratings of approximately 3,900 movies
  made by 6,040 MovieLens users who joined MovieLens in 2000.
'''
drive.mount('/content/drive')
test_path = '/content/drive/My Drive/matt_beefbeef_server/out.csv'
train_path = '/content/drive/My Drive/matt_beefbeef_server/ratings.csv'

test = pd.read_csv(test_path)
train = pd.read_csv(train_path, delimiter = '\t')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Examine test data

In [91]:
# Get table size
def tablesize():
  print(f'Rows:  {test.shape[0]}', f'\nColumns:  {test.shape[1]}')


# Find date ranges in data
def timeinterval():
  test['DateCreated'] = pd.to_datetime(test['DateCreated'])

  earliest = (test['DateCreated'].min()).strftime('%d %B %Y')
  latest = (test['DateCreated'].max()).strftime('%d %B %Y')

  print(f'\nShowing data from: \n>  {earliest} \nto \n>  {latest}')


# See data features
def column_names():
  print('\nColumn names:')
  for column in test.columns:
    print('>  ', column)


# Basic descriptive stuff
def user_movie_unique():
  print('\nUsers:  ', test['UserId'].nunique())
  print('Movies:', test['ItemName'].nunique())


tablesize()
timeinterval()
column_names()
user_movie_unique()

test = test.iloc[:, [1, 4]]
test.head()

Rows:  2099 
Columns:  9

Showing data from: 
>  15 May 2023 
to 
>  21 October 2023

Column names:
>   DateCreated
>   UserId
>   ItemId
>   ItemType
>   ItemName
>   PlaybackMethod
>   ClientName
>   DeviceName
>   PlayDuration

Users:   13
Movies: 1066


Unnamed: 0,UserId,ItemName
0,a31bac6c6edf406b9d2031d8de86f0d0,Bottle Rocket
1,a31bac6c6edf406b9d2031d8de86f0d0,Bottle Rocket
2,a31bac6c6edf406b9d2031d8de86f0d0,Bottle Rocket
3,3687fd7ac2444e43853b01d6f6540ad9,Ghost in the Shell
4,3687fd7ac2444e43853b01d6f6540ad9,Ghost in the Shell


Examine train data

In [74]:
train.head()

Unnamed: 0,1,1.1,5,874965758
0,1,2,3,876893171
1,1,3,4,878542960
2,1,4,3,876893119
3,1,5,3,889751712
4,1,7,4,875071561
