In [1]:
import pandas as pd
from ast import literal_eval
from sklearn.model_selection import train_test_split

# read from file
data_frame = pd.read_csv("../ProjectResources/movies/movies_metadata.csv", low_memory=False)

#create working copy
work_copy = data_frame.copy()
work_copy['budget'] = pd.to_numeric(work_copy['budget'], errors='coerce')

# clean the data
work_copy.dropna(axis='index', how='any', subset=['budget','vote_count','vote_average', 'revenue', 'genres'], inplace=True)
work_copy = work_copy[['budget','genres','vote_count', 'vote_average', 'revenue']]
# drop data with bad values
work_copy = work_copy.drop(work_copy[work_copy.revenue == 0].index)
work_copy = work_copy.drop(work_copy[work_copy.budget == 0].index)
work_copy = work_copy.drop(work_copy[work_copy.vote_count == 0].index)

# oject under genres can have multiple values, expand the dataset by creating multiple rows for the
# same movie but with a single genre. These single genre items will be in {id: #, name: "genre_name"} format
work_copy['genres'] = work_copy['genres'].apply(literal_eval)
work_copy = work_copy.explode("genres")
work_copy.dropna(axis='index', how='any', subset=['genres'], inplace=True)
work_copy['genres'] = [d.get('name') for d in work_copy['genres']]

# Encode the genres into integers.
from sklearn.preprocessing import LabelEncoder
work_copy['genre_int'] = LabelEncoder().fit_transform(work_copy['genres'])

# create training and test sets
train_set, test_set = train_test_split(work_copy, test_size=0.2, random_state=27)

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

model = Pipeline([
    ('addQuad', PolynomialFeatures(degree=2)),
    ('scale', StandardScaler()),
    ('svm', SVC())
    ]
)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# model_logreg = LogisticRegression(solver="lbfgs")
model_svm = SVC(kernel="rbf")
model_random_forest = RandomForestClassifier(n_estimators=10)
model_decision_tree = DecisionTreeClassifier()

ensemble = VotingClassifier(
    voting= "hard",
    estimators = [
        # ("logreg", model_logreg),
        ("svm", model_svm),
        ("randfor", model_random_forest),
        ("desitree", model_decision_tree),
        ('quad', model)
    ]
)

In [4]:
from sklearn.metrics import accuracy_score, f1_score
X = train_set[["vote_average", "revenue", "budget", "vote_count"]]
y = train_set['genre_int']

ensemble.fit(X, y)
y_pred = ensemble.predict(X)
print("Ensemble has accuracy ", accuracy_score(y, y_pred))
print("Ensemble has f1_score ", f1_score(y, y_pred, average="weighted"))

X_test = test_set[["vote_average", "revenue", "budget", "vote_count"]]
y_test = test_set['genre_int']

ensemble = ensemble = VotingClassifier(
    voting= "hard",
    estimators = [
        # ("logreg", model_logreg),
        ("svm", model_svm),
        ("randfor", model_random_forest),
        ("desitree", model_decision_tree),
        ('quad', model)
    ]
)
ensemble.fit(X_test, y_test)
y_test_pred = ensemble.predict(X_test)
print("Ensemble on test has accuracy ", accuracy_score(y_test, y_test_pred))
print("Ensemble on test has f1_score ", f1_score(y_test, y_test_pred, average="weighted"))

Ensemble has accuracy  0.3460714285714286
Ensemble has f1_score  0.2721182570622817
Ensemble on test has accuracy  0.5392857142857143
Ensemble on test has f1_score  0.49371688364882094


The ensemble built from the Voting classifier performs in the midrange of the models tested overall. 

In [5]:
print("One train set")
for classifier in [model_svm, model_random_forest, model_decision_tree, model]:
    classifier.fit(X, y)
    y_pred = classifier.predict(X)

    print(classifier.__class__.__name__, " has accuracy ", accuracy_score(y, y_pred))
    print(classifier.__class__.__name__, " has f1_score ", f1_score(y, y_pred, average="weighted"))

print()
print("One Test set")
for classifier in [model_svm, model_random_forest, model_decision_tree, model]:
    classifier.fit(X_test, y_test)
    y__test_pred = classifier.predict(X_test)

    print(classifier.__class__.__name__, " on test has accuracy ", accuracy_score(y_test, y_test_pred))
    print(classifier.__class__.__name__, " on test has f1_score ", f1_score(y_test, y_test_pred, average="weighted"))
    
    

One train set
SVC  has accuracy  0.19955357142857144
SVC  has f1_score  0.09515548506853343
RandomForestClassifier  has accuracy  0.4523214285714286
RandomForestClassifier  has f1_score  0.45109121540882713
DecisionTreeClassifier  has accuracy  0.4533035714285714
DecisionTreeClassifier  has f1_score  0.4014888659594106
Pipeline  has accuracy  0.225625
Pipeline  has f1_score  0.1355428957138484

One Test set
SVC  on test has accuracy  0.5392857142857143
SVC  on test has f1_score  0.49371688364882094
RandomForestClassifier  on test has accuracy  0.5392857142857143
RandomForestClassifier  on test has f1_score  0.49371688364882094
DecisionTreeClassifier  on test has accuracy  0.5392857142857143
DecisionTreeClassifier  on test has f1_score  0.49371688364882094
Pipeline  on test has accuracy  0.5392857142857143
Pipeline  on test has f1_score  0.49371688364882094
