# Task 1: Movie Recommendation
## Subtask 1: Data Loading and Data Preparation

In [1]:
import numpy as np
import pandas as pd
import h5py
from tqdm import tqdm

from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import copy


In [2]:
# reading csv files
def loadData():
    colnamesMovies=['MovieID', 'Titel', 'Genres'] 
    movies =  pd.read_csv('movies.dat', sep="::", skiprows=0, engine='python', names=colnamesMovies, encoding='latin').to_numpy()

    colnamesRating=['UserID', 'MovieID', 'Rating', 'Timestamp'] 
    ratings =  pd.read_csv('ratings.dat', sep="::", skiprows=0, engine='python', names=colnamesRating)

    colnamesUsers=['UserID', 'Gender', 'Age', 'Occupation', 'ZIP'] 
    users =  pd.read_csv('users.dat', sep="::", skiprows=0, engine='python', names=colnamesUsers).to_numpy()
    return movies, ratings, users


In [3]:
genrePositions = dict([
    ('Action', '0'),
    ('Adventure', '1'),
    ('Animation', '2'),
    ('Children\'s', '3'),
    ('Comedy', '4'),
    ('Crime', '5'),
    ('Documentary', '6'),
    ('Drama', '7'),
    ('Fantasy', '8'),
    ('Film-Noir', '9'),
    ('Horror', '10'),
    ('Musical', '11'),
    ('Mystery', '12'),
    ('Romance', '13'),
    ('Sci-Fi', '14'),
    ('Thriller', '15'),
    ('War', '16'),
    ('Western', '17')
])

In [4]:
def constructData(movies, ratings, users):
    
    #decode genres in to individual Pandadf columns  
    genres = np.zeros([movies.shape[0], 18])
    #loop over every movie
    for i, eachMovie in enumerate(movies):   
        #split the single  genres of a movie
        genresPerson = (eachMovie[2].split("|"))
        #set the corresponding column to 1 if the movie belongs to the genre
        for eachGenre in genresPerson:
            genres[i,int(genrePositions.get(eachGenre))] = 1
    
    #cast to pandas
    genresDf = pd.DataFrame(genres, columns = ['Action','Adventure','Animation','Children\'s','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western'])
    genresDf['MovieID'] =  movies[:,0]  
    
    #decode occupations in to individual Pandadf colloumns 
    ocupations = np.zeros([users.shape[0], 21])
    for i, user in enumerate(users):
        ocupations[i,int(user[3])-1] = 1
    #cast to pandas
    ocupationsDf = pd.DataFrame(ocupations, columns = ['other', 'academic/educator', 'artist', 'clerical/admin', 'college/grad student', 'customer service', 'doctor/health care', 'executive/managerial', 'farmer', 'homemaker',"K-12 student", 'lawyer', 'programmer', 'retired','sales/marketing','scientist','self-employed','technician/engineer','tradesman/craftsman', 'unemployed', 'writer'])
    ocupationsDf['UserID'] =  users[:,0]  
    
    #casting movies and users into dict for better data combination
    moviesTitel = {movie[0]:movie[1] for movie in movies}
    usersGender = {user[0]:user[1] for user in users}
    usersAge = {user[0]:user[2] for user in users}
    
    #Creating final dataframe 
    # dicts with the mapping command and merge df into each other
    ratings['Movie Titel'] = ratings['MovieID'].map(moviesTitel)
    ratings = pd.merge(ratings,genresDf, on='MovieID')
    ratings['Gender'] = ratings['UserID'].map(usersGender)
    ratings['Age'] = ratings['UserID'].map(usersAge)
    ratings = pd.merge(ratings,ocupationsDf, on='UserID')
    
    #enable better model training by working with integer
    ratings.loc[ratings["Gender"] == "M", "Gender"] = 2
    ratings.loc[ratings["Gender"] == "F", "Gender"] = 1
    
    
    #remove users with less then 100 rtings 
    byUserID = ratings.groupby('UserID').aggregate(np.count_nonzero)
    tags = byUserID[byUserID.MovieID >= 100].index
    ratings = (ratings[ratings['UserID'].isin(tags)])
    
    #sorted by user ID to enable easy train test split
    ratings = ratings.sort_values('UserID')        

    #delete not needed features
    del ratings['MovieID']
    del ratings['Timestamp']
    del ratings['Movie Titel']

    #Train test split
    testData = ratings.loc[ratings['UserID'] <= 1000]
    del testData['UserID']
    trainData = ratings.loc[ratings['UserID'] >= 1000]
    del trainData['UserID']

    
    #separating the labels from the corresponding features
    trainData = trainData.to_numpy().astype('float64')  
    testData = testData.to_numpy().astype('float64')  
    X_test = testData[:,1:]
    y_test = testData[:,0]
    X_train = trainData[:,1:]
    y_train = trainData[:,0]
    return X_test, y_test, X_train, y_train


In [5]:
movies, ratings, users = loadData() #loading Data

In [6]:
X_test, y_test, X_train, y_train = constructData(movies, ratings, users) # constructing Data

In [7]:
print(X_test.shape)
print(y_test.shape)
print(X_train.shape)
print(y_train.shape)

(128025, 41)
(128025,)
(719277, 41)
(719277,)


In the report, describe the feature vectors you created. Explain how you represented features and why? <br>
Age, Gender, Occupation, and Genre are kept as features. Occupation and Genres however are split up into a feature for every possible vlaule. This separation is done to enable easier training to put additional weight on these important features. <br>
How many samples do your training and test data contain? <br>
The Dataset has 128025 Test samples and 719277 training samples.

## Subtask 2: Basic Movie Recommendation
### linear SVM

In [8]:
# Fit SVC with default hyperparamters
svc = svm.SVC(kernel="linear", max_iter = 20)
svc.fit(X_train, y_train)
Y_preds_train = svc.predict(X_train)
Y_preds_test  = svc.predict(X_test)


# Define hyperparamters
paramGrid = {
    "C":      [ 10**-4, 10**-3, 10**-2, 10**-1] # Regularization term
}

#gridsearch
gridSvc = GridSearchCV(svc, paramGrid, cv=5) 
gridSvc.fit(X_train, y_train) 

print("Best parameter of SVC:"+ str(gridSvc.best_params_))

#get best estimator
svcBest = gridSvc.best_estimator_ 

#get scores
scoreTrain = svcBest.score(X_train, y_train) 
scoreTest  = svcBest.score(X_test, y_test) 

print("Tain accuracy: " + str(scoreTrain))
print("Test accuarcy: " +str(scoreTest))




Best parameter of SVC:{'C': 0.1}
Tain accuracy: 0.2497076925857493
Test accuarcy: 0.2538488576449912


### multi-layer perceptron classier

In [None]:
mlp = MLPClassifier(max_iter = 20)

paramGrid = {
        "hidden_layer_sizes":      [ (10,2),(10,1),(10,3),(50,1),(50,2)], # Regularization term
    }
    
gridMlp = GridSearchCV(mlp, paramGrid, cv=5, n_jobs=-1) 
gridMlp.fit(X_train, y_train) 

print(f"Best parameters of MLP:", gridMlp.best_params_)

mlpBest = gridMlp.best_estimator_ # Extract best model

scoreTrain = mlpBest.score(X_train, y_train) 
scoreTest  = mlpBest.score(X_test, y_test) 


print("Tain accuracy: " + str(scoreTrain))
print("Test accuarcy: " +str(scoreTest))

For SVM Model with the Regularization term C = {0.0001, 0.001, 0.01, 0.1} where tried and for die multi-layer-perceptron networks with the hidden layer shapes [ (10,2),(10,1),(10,3),(50,1),(50,2)] where trained.

In [None]:
#count the occurence of each possible Value of y 
df = pd.DataFrame(y_test, columns = ['y'])
occur = df.groupby(['y']).size()
# display occurrences of a particular column
print("Appearence of Labels in Test Dataset: ")
display(occur)
best = 100/y_test.shape[0]*occur[4] 
print("Test performance that can be achieved using a constant prediction (in this case 4) is: "+str(best))


### Subtask 3: Classifer Evaluation I

In [None]:
#Confusion Matrix:

def confusionMtx(yPredict, yTrue):
    #create empty confusion Mtx
    res = np.zeros([5,5] )
    # Labes of 1-5 come in quit handy to define confusion Matrix by Indices. Add +1 for every Datapoint
    for i in range(yPredict.shape[0]):
        res[int(yTrue[i])-1, int(yPredict[i])-1] += 1
    return res

def reconstrucAccOfConfusionMtx(cMtx, dataLen):
    rightClassified = 0
    #summ over the diagonal elements
    for i in range(cMtx.shape[0]):
        rightClassified += cMtx[i,i]
        #return acc
    return 100/dataLen * rightClassified/100

print("SVC confusion Matrix:")
print(confusionMtx(svcBest.predict(X_test), y_test))
print("MLP confusion Matrix:")
print(confusionMtx(mlpBest.predict(X_test), y_test))

print("reconstrucet accuaciy of SVC confusion Matrix:")
print(reconstrucAccOfConfusionMtx(confusionMtx(svcBest.predict(X_test), y_test), y_test.shape[0]))
print("reconstrucet accuaciy of MLP confusion Matrix:")
print(reconstrucAccOfConfusionMtx(confusionMtx(mlpBest.predict(X_test), y_test), y_test.shape[0] ))

In the obtain confusion Matrix the structure becomes clear that the SVC classifier predicts ratings to be to hight and the MLP Matrix predicts Values to be to low. This probably relates to the low amount of learning iterations. However, doing more learning iterations would extend the runtime to high. If you have any hint for me on how I could decrease my training Runtime I would be very thankful.