## 1.

In [2]:
import pandas as pd
from ast import literal_eval
from sklearn.model_selection import train_test_split

# read from file
data_frame = pd.read_csv("../ProjectResources/movies/movies_metadata.csv", low_memory=False)

#create working copy
work_copy = data_frame.copy()
work_copy['budget'] = pd.to_numeric(work_copy['budget'], errors='coerce')

# clean the data
work_copy.dropna(axis='index', how='any', subset=['budget','vote_average', 'revenue', 'genres'], inplace=True)
work_copy = work_copy[['budget','genres', 'vote_average', 'revenue']]
# drop data with bad values
work_copy = work_copy.drop(work_copy[work_copy.revenue == 0].index)

# oject under genres can have multiple values, expand the dataset by creating multiple rows for the
# same movie but with a single genre. These single genre items will be in {id: #, name: "genre_name"} format
work_copy['genres'] = work_copy['genres'].apply(literal_eval)
work_copy = work_copy.explode("genres")
work_copy.dropna(axis='index', how='any', subset=['genres'], inplace=True)
work_copy['genres'] = [d.get('name') for d in work_copy['genres']]

# create training and test sets
train_set, test_set = train_test_split(work_copy, test_size=0.2, random_state=777)

## 2.
Features chosen for x are revenue and vote_average. Feature chosen for y is genre. Genre is a clear categorical option to chose from, although the data inside of this feature might be hard to pull from because it's in the form of an object. I chose the x's because I think they will correlate well to genre.
Edit: cleaned the objects into single strings.

## 3.

In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
X = train_set[["revenue", 'vote_average']]
y = train_set['genres']

tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X,y)

In [11]:
from sklearn.metrics import confusion_matrix
y_predicted = tree_classifier.predict(X)
matrix = confusion_matrix(y, y_predicted)
print(matrix)

[[1371    0    1    1    0    0    4    0    0    0    0    0    0    0
     1    0    0    0    0    0]
 [ 394  470    0    0    0    0    3    1    0    0    0    0    0    0
     0    0    0    0    0    0]
 [  45   96  183    0    0    0    2    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [ 264  123   63 1640    0    0    7    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [ 270   21    2  114  450    0    3    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [   3    3    0    8    5  162    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [ 343  105   30  530  282   10 1642    0    1    0    0    0    0    0
     0    0    0    0    0    0]
 [  44  138  106  121    3    5   41   71    0    0    0    0    0    0
     0    0    0    0    0    0]
 [ 122   95   46   89    4    0   62   16   62    0    0    0    0    0
     0    0    0    0    0    0]
 [  10    1    0   17    1    6   21    0    1   11    

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print ("Accuracy is ", accuracy_score(y, y_predicted))

# We have to specify how to combine for the multiclassifications
print ("Precision is ", precision_score(y, y_predicted, average="weighted"))
print ("Sensitivity is ", recall_score(y, y_predicted, average="weighted"))
print ("F1 is ", f1_score(y, y_predicted, average="weighted"))

Accuracy is  0.4683698296836983


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision is  0.5935301231494724
Sensitivity is  0.4683698296836983
F1 is  0.4168616121682307


## 4.
Results are higher than expected, giving the model about a 50% accuracy to be able to guess a genre based on it's revenue and vote_average.
The predictive power measured in F1 score show the model to have around 42%.


## 5.

In [27]:
from sklearn.svm import SVC
X = train_set[["revenue", 'vote_average']]
y = train_set['genres']

svc_classifier = SVC()
svc_classifier.fit(X,y)

In [28]:
predicted = svc_classifier.predict(X)

In [29]:
from sklearn import metrics
print(
    f"Classification report for classifier {svc_classifier}:\n"
    f"{metrics.classification_report(y, predicted)}\n"
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for classifier SVC():
                 precision    recall  f1-score   support

         Action       0.15      0.05      0.07      1378
      Adventure       0.21      0.15      0.17       868
      Animation       0.00      0.00      0.00       326
         Comedy       0.15      0.05      0.07      2097
          Crime       0.00      0.00      0.00       860
    Documentary       0.00      0.00      0.00       181
          Drama       0.21      0.95      0.35      2943
         Family       0.00      0.00      0.00       529
        Fantasy       0.00      0.00      0.00       496
        Foreign       0.00      0.00      0.00        68
        History       0.00      0.00      0.00       238
         Horror       0.00      0.00      0.00       593
          Music       0.00      0.00      0.00       205
        Mystery       0.00      0.00      0.00       455
        Romance       0.00      0.00      0.00      1164
Science Fiction       0.00      0.00      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [30]:
print ("Accuracy is ", accuracy_score(y, predicted))

# We have to specify how to combine for the multiclassifications
print ("Precision is ", precision_score(y, predicted, average="weighted"))
print ("Sensitivity is ", recall_score(y, predicted, average="weighted"))
print ("F1 is ", f1_score(y, predicted, average="weighted"))

Accuracy is  0.20823195458231955
Precision is  0.12351111706707471


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Sensitivity is  0.20823195458231955
F1 is  0.09610103580320917


<!-- ## 6.
The svc default chosen and used is rbf, with is stated to be useful for non-linear and multidimentional data. Despited that it shows a low accuracy and an extremely low F1-score. Significanly worse than the decision tree model. -->