# Task for Today  

***

## Movie Preference Prediction  

Given *data about someone's personality*, let's try to classify **how strongly they will enjoy watching movies on a recommendation list**.  
  
We will use logistic regression, support vector machine, and neural network models to make our predictions.

# Getting Started

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [2]:
# Load the personality dataset; the relative path assumes the CSV sits under ../input/.
data = pd.read_csv('../input/top-personality-dataset/2018-personality-data.csv')

In [3]:
# Preview the raw frame (pandas truncates the display to the first and last rows/columns).
data

Unnamed: 0,userid,openness,agreeableness,emotional_stability,conscientiousness,extraversion,assigned metric,assigned condition,movie_1,predicted_rating_1,...,movie_9,predicted_rating_9,movie_10,predicted_rating_10,movie_11,predicted_rating_11,movie_12,predicted_rating_12,is_personalized,enjoy_watching
0,8e7cebf9a234c064b75016249f2ac65e,5.0,2.0,3.0,2.5,6.5,serendipity,high,77658,4.410466,...,120138,4.244817,121372,4.396004,127152,4.120456,95311,4.053847,4,4
1,77c7d756a093150d4377720abeaeef76,7.0,4.0,6.0,5.5,4.0,all,default,94959,4.207280,...,56782,4.019599,5618,3.963953,969,4.174188,1232,4.334877,2,3
2,b7e8a92987a530cc368719a0e60e26a3,4.0,3.0,4.5,2.0,2.5,serendipity,medium,110501,4.868064,...,2288,4.823212,3307,4.676756,1172,4.649281,1212,4.744990,2,2
3,92561f21446e017dd6b68b94b23ad5b7,5.5,5.5,4.0,4.5,4.0,popularity,medium,2905,4.526371,...,3030,4.425689,1281,4.479921,940,4.355061,905,4.317927,3,3
4,030001ac2145a938b07e686a35a2d638,5.5,5.5,3.5,4.5,2.5,popularity,medium,2905,4.526371,...,3030,4.425689,1281,4.479921,940,4.355061,905,4.317927,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1829,cff910b71f09b3120289ff6b461a9e03,5.5,3.5,2.5,4.0,5.5,popularity,low,108979,4.246346,...,6643,4.409108,115122,3.960470,7700,4.178546,67997,4.085300,3,3
1830,1ab3a4c2921d8da640854819b0f6cfce,4.0,3.5,4.5,4.0,2.5,serendipity,high,93040,4.227140,...,5618,4.149697,903,4.116152,38061,4.155210,1197,4.045751,3,4
1831,a06386edadf3bc614dadb7044708c46c,6.0,3.0,5.5,3.5,6.0,serendipity,low,106173,3.935297,...,26519,3.998642,89707,4.144870,2571,3.860041,108709,3.899857,3,4
1832,bad56d9506832cd79d874a6b66b3d813,5.0,3.5,1.5,3.5,2.5,serendipity,medium,6874,4.241766,...,92259,4.819710,32,4.059369,3730,4.427336,3435,4.844386,4,4


In [4]:
# Summarise each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1834 entries, 0 to 1833
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   userid                1834 non-null   object 
 1    openness             1834 non-null   float64
 2    agreeableness        1834 non-null   float64
 3    emotional_stability  1834 non-null   float64
 4    conscientiousness    1834 non-null   float64
 5    extraversion         1834 non-null   float64
 6    assigned metric      1834 non-null   object 
 7    assigned condition   1834 non-null   object 
 8    movie_1              1834 non-null   int64  
 9    predicted_rating_1   1834 non-null   float64
 10   movie_2              1834 non-null   int64  
 11   predicted_rating_2   1834 non-null   float64
 12   movie_3              1834 non-null   int64  
 13   predicted_rating_3   1834 non-null   float64
 14   movie_4              1834 non-null   int64  
 15   predicted_rating_4  

# Preprocessing

In [5]:
# Count missing values per column to decide whether any imputation is needed.
data.isna().sum()

userid                  0
 openness               0
 agreeableness          0
 emotional_stability    0
 conscientiousness      0
 extraversion           0
 assigned metric        0
 assigned condition     0
 movie_1                0
 predicted_rating_1     0
 movie_2                0
 predicted_rating_2     0
 movie_3                0
 predicted_rating_3     0
 movie_4                0
 predicted_rating_4     0
 movie_5                0
 predicted_rating_5     0
 movie_6                0
 predicted_rating_6     0
 movie_7                0
 predicted_rating_7     0
 movie_8                0
 predicted_rating_8     0
 movie_9                0
 predicted_rating_9     0
 movie_10               0
 predicted_rating_10    0
 movie_11               0
 predicted_rating_11    0
 movie_12               0
 predicted_rating_12    0
 is_personalized        0
 enjoy_watching         0
dtype: int64

In [6]:
# Inspect the exact column labels: most carry a leading space (and ' enjoy_watching '
# a trailing one too), so later cells must reference them verbatim.
data.columns

Index(['userid', ' openness', ' agreeableness', ' emotional_stability',
       ' conscientiousness', ' extraversion', ' assigned metric',
       ' assigned condition', ' movie_1', ' predicted_rating_1', ' movie_2',
       ' predicted_rating_2', ' movie_3', ' predicted_rating_3', ' movie_4',
       ' predicted_rating_4', ' movie_5', ' predicted_rating_5', ' movie_6',
       ' predicted_rating_6', ' movie_7', ' predicted_rating_7', ' movie_8',
       ' predicted_rating_8', ' movie_9', ' predicted_rating_9', ' movie_10',
       ' predicted_rating_10', ' movie_11', ' predicted_rating_11',
       ' movie_12', ' predicted_rating_12', ' is_personalized',
       ' enjoy_watching '],
      dtype='object')

In [7]:
# Drop the user identifier plus the twelve (movie id, predicted rating) column pairs.
# The labels are generated rather than typed out; the leading space is part of each name.
data = data.drop(
    columns=['userid'] + [
        f' {kind}_{slot}'
        for slot in range(1, 13)
        for kind in ('movie', 'predicted_rating')
    ]
)

In [8]:
# Preview the frame after dropping the identifier and movie/rating columns.
data

Unnamed: 0,openness,agreeableness,emotional_stability,conscientiousness,extraversion,assigned metric,assigned condition,is_personalized,enjoy_watching
0,5.0,2.0,3.0,2.5,6.5,serendipity,high,4,4
1,7.0,4.0,6.0,5.5,4.0,all,default,2,3
2,4.0,3.0,4.5,2.0,2.5,serendipity,medium,2,2
3,5.5,5.5,4.0,4.5,4.0,popularity,medium,3,3
4,5.5,5.5,3.5,4.5,2.5,popularity,medium,2,3
...,...,...,...,...,...,...,...,...,...
1829,5.5,3.5,2.5,4.0,5.5,popularity,low,3,3
1830,4.0,3.5,4.5,4.0,2.5,serendipity,high,3,4
1831,6.0,3.0,5.5,3.5,6.0,serendipity,low,3,4
1832,5.0,3.5,1.5,3.5,2.5,serendipity,medium,4,4


## Encoding

In [9]:
# For every object-typed (string) column, list its distinct values so we can
# choose an encoding for each one.
{
    name: list(data[name].unique())
    for name, dtype in data.dtypes.items()
    if dtype == 'object'
}

{' assigned metric': [' serendipity', ' all', ' popularity', ' diversity'],
 ' assigned condition': [' high', ' default', ' medium', ' low']}

In [10]:
# Most frequent value of the assigned-condition column.
data[' assigned condition'].mode()

0     high
dtype: object

In [11]:
# Rank order handed to ordinal_encode: a value's position in this list becomes its integer code.
# The leading spaces match the raw values in the CSV.
# NOTE(review): placing ' default' between ' medium' and ' high' is an assumption about
# what "default" means in the experiment — confirm against the dataset description.
condition_ordering = [' low', ' medium', ' default', ' high']

In [12]:
def ordinal_encode(df, column, ordering):
    """Return a copy of `df` with `column` replaced by integer ranks.

    Each value is mapped to its position in `ordering` (0 for the first
    entry, 1 for the second, ...). The input frame is left untouched.

    Raises:
        ValueError: if the column holds a value that is absent from `ordering`.
    """
    encoded = df.copy()
    # list.index does the value -> rank lookup for each cell.
    encoded[column] = encoded[column].apply(ordering.index)
    return encoded

def onehot_encode(df, column, prefix):
    """Return a copy of `df` with `column` replaced by 0/1 indicator columns.

    One indicator column named `<prefix>_<value>` is appended for every
    distinct value in `column`, and the original column is removed.
    The input frame is left untouched.

    Args:
        df: source DataFrame.
        column: label of the categorical column to expand.
        prefix: string prepended to each generated column name.

    Returns:
        A new DataFrame with the remaining original columns followed by the
        integer indicator columns.
    """
    df = df.copy()
    # dtype=int pins the indicators to 0/1 integers. pandas >= 2.0 would
    # otherwise emit booleans, which no longer matches the 0/1 table this
    # notebook displays.
    dummies = pd.get_dummies(df[column], prefix=prefix, dtype=int)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

In [13]:
# Encode both categorical columns in one chain: the condition becomes an
# integer rank, the metric becomes 'm_'-prefixed indicator columns.
data = (
    data
    .pipe(ordinal_encode, ' assigned condition', condition_ordering)
    .pipe(onehot_encode, ' assigned metric', 'm')
)

In [14]:
# Preview the fully numeric frame after encoding.
data

Unnamed: 0,openness,agreeableness,emotional_stability,conscientiousness,extraversion,assigned condition,is_personalized,enjoy_watching,m_ all,m_ diversity,m_ popularity,m_ serendipity
0,5.0,2.0,3.0,2.5,6.5,3,4,4,0,0,0,1
1,7.0,4.0,6.0,5.5,4.0,2,2,3,1,0,0,0
2,4.0,3.0,4.5,2.0,2.5,1,2,2,0,0,0,1
3,5.5,5.5,4.0,4.5,4.0,1,3,3,0,0,1,0
4,5.5,5.5,3.5,4.5,2.5,1,2,3,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1829,5.5,3.5,2.5,4.0,5.5,0,3,3,0,0,1,0
1830,4.0,3.5,4.5,4.0,2.5,3,3,4,0,0,0,1
1831,6.0,3.0,5.5,3.5,6.0,0,3,4,0,0,0,1
1832,5.0,3.5,1.5,3.5,2.5,1,4,4,0,0,0,1


## Splitting and Scaling

In [15]:
# Separate the target from the features. The label really is ' enjoy_watching '
# with a space on both sides, exactly as it appears in the CSV header.
X = data.copy()
y = X.pop(' enjoy_watching ')

In [16]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Training

In [18]:
# Three baseline classifiers, all trained and evaluated on the same split.
log_model = LogisticRegression()
svm_model = SVC(C=1.0)
# `(16)` is just the int 16; the one-element tuple `(16,)` states the intended
# single 16-unit hidden layer explicitly. The MLP's weight initialisation and
# batch shuffling are random, so it is seeded to keep the reported accuracy
# reproducible from run to run.
ann_model = MLPClassifier(hidden_layer_sizes=(16,), random_state=42)

log_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
ann_model.fit(X_train, y_train)

# For classifiers, .score() is the mean accuracy on the given data.
log_acc = log_model.score(X_test, y_test)
svm_acc = svm_model.score(X_test, y_test)
ann_acc = ann_model.score(X_test, y_test)



# Results

In [19]:
# The model names serve as both the x-axis categories and the colour key,
# so they are defined once and reused.
model_names = ['Logistic Regression', 'Support Vector Machine', 'Neural Network']

fig = px.bar(
    x=model_names,
    y=[log_acc, svm_acc, ann_acc],
    color=model_names,
    labels={'x': "Model", 'y': "Accuracy"},
    title="Model Accuracy"
)

fig.show()

In [20]:
# Presumably the chance-level accuracy for five equally likely rating classes,
# kept as a baseline for the scores below — TODO confirm the number of classes.
1/5

0.2

In [21]:
# Report each model's test accuracy, one line per model.
for label, acc in (
    ("Logistic Regression:", log_acc),
    ("Support Vector Machine:", svm_acc),
    ("Neural Network:", ann_acc),
):
    print(label, acc)

Logistic Regression: 0.5027223230490018
Support Vector Machine: 0.484573502722323
Neural Network: 0.4827586206896552


# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/SFV-zQMuR0o