## 1.

In [1]:
import pandas as pd
from ast import literal_eval
from sklearn.model_selection import train_test_split

# read from file
data_frame = pd.read_csv("../ProjectResources/movies/movies_metadata.csv", low_memory=False)

#create working copy
work_copy = data_frame.copy()
work_copy['budget'] = pd.to_numeric(work_copy['budget'], errors='coerce')

# clean the data
work_copy.dropna(axis='index', how='any', subset=['budget','vote_count','vote_average', 'revenue', 'genres', 'runtime'], inplace=True)
work_copy = work_copy[['budget','genres','vote_count', 'vote_average', 'revenue', "runtime"]]
# drop data with bad values
work_copy = work_copy.drop(work_copy[work_copy.revenue == 0].index)
work_copy = work_copy.drop(work_copy[work_copy.budget == 0].index)
work_copy = work_copy.drop(work_copy[work_copy.vote_count == 0].index)

# oject under genres can have multiple values, expand the dataset by creating multiple rows for the
# same movie but with a single genre. These single genre items will be in {id: #, name: "genre_name"} format
work_copy['genres'] = work_copy['genres'].apply(literal_eval)
work_copy = work_copy.explode("genres")
work_copy.dropna(axis='index', how='any', subset=['genres'], inplace=True)
work_copy['genres'] = [d.get('name') for d in work_copy['genres']]

# Encode the genres into integers so they can be used by the KMeans classifier.
from sklearn.preprocessing import LabelEncoder
work_copy['genre_int'] = LabelEncoder().fit_transform(work_copy['genres'])

# create training and test sets
train_set, test_set = train_test_split(work_copy, test_size=0.2, random_state=27)

## 2.
Features chosen for x are revenue and vote_average. Feature chosen for y is genre. Genre is a clear categorical option to chose from, although the data inside of this feature might be hard to pull from because it's in the form of an object. I chose the x's because I think they will correlate well to genre.
Edit: cleaned the objects into single strings.

## 3.

In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
X = train_set[["revenue", 'vote_average']]
y = train_set['genres']

tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X,y)

In [3]:
from sklearn.metrics import confusion_matrix
y_predicted = tree_classifier.predict(X)
matrix = confusion_matrix(y, y_predicted)
print(matrix)

[[1139    0    0    1    0    0    6    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [ 354  431    0    1    0    0    2    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [  22   89  114    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [ 195  117   40 1115    1    0    3    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [ 258   14    1   88  332    0    2    0    0    0    0    0    0    0
     0    0    0    1    0    0]
 [   2    0    0    2    0   42    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [ 268   95   19  343  204    4 1148    0    0    0    0    0    0    0
     0    0    0    1    0    0]
 [  47  131   75   92    1    2   18   48    0    0    0    0    0    0
     0    0    0    0    0    0]
 [ 104   95   22   66    4    0   43   20   52    0    0    0    0    0
     0    0    0    1    0    0]
 [   5    0    0    8    1    1    8    0    0    3    

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print ("Accuracy is ", accuracy_score(y, y_predicted))

# We have to specify how to combine for the multiclassifications
print ("Precision is ", precision_score(y, y_predicted, average="weighted"))
print ("Sensitivity is ", recall_score(y, y_predicted, average="weighted"))
print ("F1 is ", f1_score(y, y_predicted, average="weighted"))

Accuracy is  0.452629699080275
Precision is  0.5769544629937213


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Sensitivity is  0.452629699080275
F1 is  0.40208521692157645


## 4.
Results are higher than expected, giving the model about a 45% accuracy to be able to guess a genre based on it's revenue and vote_average.
The predictive power measured in F1 score show the model to have around 40%.


## 5.

In [5]:
from sklearn.svm import SVC
X = train_set[["revenue", 'vote_average']]
y = train_set['genres']

svc_classifier = SVC()
svc_classifier.fit(X,y)

In [6]:
predicted = svc_classifier.predict(X)

In [7]:
from sklearn import metrics
print(
    f"Classification report for classifier {svc_classifier}:\n"
    f"{metrics.classification_report(y, predicted)}\n"
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for classifier SVC():
                 precision    recall  f1-score   support

         Action       0.14      0.05      0.08      1146
      Adventure       0.20      0.20      0.20       788
      Animation       0.00      0.00      0.00       225
         Comedy       0.16      0.02      0.04      1471
          Crime       0.00      0.00      0.00       696
    Documentary       0.00      0.00      0.00        46
          Drama       0.20      0.94      0.33      2082
         Family       0.00      0.00      0.00       414
        Fantasy       0.00      0.00      0.00       407
        Foreign       0.00      0.00      0.00        26
        History       0.00      0.00      0.00       191
         Horror       0.00      0.00      0.00       452
          Music       0.00      0.00      0.00       142
        Mystery       0.00      0.00      0.00       352
        Romance       0.00      0.00      0.00       815
Science Fiction       0.00      0.00      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
print ("Accuracy is ", accuracy_score(y, predicted))

# We have to specify how to combine for the multiclassifications
print ("Precision is ", precision_score(y, predicted, average="weighted"))
print ("Sensitivity is ", recall_score(y, predicted, average="weighted"))
print ("F1 is ", f1_score(y, predicted, average="weighted"))

Accuracy is  0.19698187338155193
Precision is  0.0866157365268207


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Sensitivity is  0.19698187338155193
F1 is  0.08844553949959813


The svc default chosen and used is rbf, with is stated to be useful for non-linear and multidimentional data. Despited that it shows a low accuracy and an extremely low F1-score. Significanly worse than the decision tree model.

## 6.

In [9]:
X = test_set[["revenue", 'vote_average']]
y = test_set['genres']

tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X,y)

y_predicted = tree_classifier.predict(X)

print("Decision tree")
print ("Accuracy is ", accuracy_score(y, y_predicted))

# We have to specify how to combine for the multiclassifications
print ("Precision is ", precision_score(y, y_predicted, average="weighted"))
print ("Sensitivity is ", recall_score(y, y_predicted, average="weighted"))
print ("F1 is ", f1_score(y, y_predicted, average="weighted"))
print()
svc_classifier = SVC()
svc_classifier.fit(X,y)

predicted = svc_classifier.predict(X)

print("svc rbf")
print ("Accuracy is ", accuracy_score(y, predicted))

# We have to specify how to combine for the multiclassifications
print ("Precision is ", precision_score(y, predicted, average="weighted"))
print ("Sensitivity is ", recall_score(y, predicted, average="weighted"))
print ("F1 is ", f1_score(y, predicted, average="weighted"))

Decision tree
Accuracy is  0.8071428571428572
Precision is  0.8337005161801178
Sensitivity is  0.8071428571428572
F1 is  0.8053953555607325

svc rbf
Accuracy is  0.20035714285714284
Precision is  0.13487869903459537
Sensitivity is  0.20035714285714284
F1 is  0.10093731934976077


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 7.
The predictive power of revenue and vote_average to predict genre gave surprising results. The svc model gave consistent and poor results, notibly an F1 score that is close to zero for the both the training and test set. The Decision tree performed somewhat poorly with an accuracy of 45% on the training set and a 40% F1-score. Which seems high. For the test set the accuracy and F1 score both rose all the way to 81%.