# Predikcia zárobku filmov
## Zdroj dát: [Kaggle](https://www.kaggle.com/stephanerappeneau/350-000-movies-from-themoviedborg) (ThemovieDB)

## TODOs
* Nahradit unknown budget values medianom/priemerom
* Normalizovat hodnoty na skale <0, 1> - pre ktore stlpce?
* Binarizacia kategorickych stlpcov
* Pridat do LR vsetky stlpce (fixnut errory s hodnotami)
* Confusion metrics (matrix?)

## Odovzdanie
* Rozsah 3-5 stran
* Formatovanie
* Spomenut ine prace, clanky alebo blogy - ako bol uz problem rieseny
* Opisovat predspracovanie a outlierov
* Slepe ulicky, neuspesne verzie
* Opis pouzitych metod a prvotne experimenty
* Scenar pouzitia
* Metriky
    * Classification report
    * Accuracy je nepodstatna
    * Coefficient of determination (R2)
* Dokopy 6 kapitol + posledna kapitola zoznam pouzitej literatury


In [10]:
import pandas as pd
import numpy as np
import matplotlib.mlab as mlab
import traceback
import re

from matplotlib import pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.utils import shuffle
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import LabelEncoder
    
    
def load_dataset(filepath, data_delimiter):
    """Load a delimited text file into a pandas DataFrame.

    On success the source path and the resulting shape are printed. On
    failure the traceback and a short message are printed instead of
    raising, and ``None`` is returned.

    Args:
        filepath: Path of the CSV file to read.
        data_delimiter: Field separator handed to ``pandas.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when reading failed.
    """
    try:
        df = pd.read_csv(filepath, delimiter=data_delimiter, low_memory=False)
    except Exception:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate to the caller.
        traceback.print_exc()
        print("Failed to load the dataset from {}".format(filepath))
        return None

    # Reporting happens outside the try block: only the read itself is
    # expected to fail.
    print("Successfully loaded the dataset from {}".format(filepath))
    print("Rows: " + str(df.shape[0]) + ", columns: " + str(df.shape[1]))
    return df
   
   
def filter_by_bottom_threshold(df, colname, threshold, equal=False):
    """Keep only the rows whose ``colname`` value lies above ``threshold``.

    Args:
        df: DataFrame to filter; it is not modified.
        colname: Name of the column that is compared.
        threshold: Lower bound for the kept values.
        equal: When true, rows equal to ``threshold`` are kept as well.

    Returns:
        A filtered DataFrame. The number of removed rows is printed.
    """
    print("Dropping '{}' values below the following threshold: {}".format(colname, threshold))

    column = df[colname]
    keep_mask = (column >= threshold) if equal else (column > threshold)
    kept = df[keep_mask]

    removed_count = df.shape[0] - kept.shape[0]
    print("Dropped {} entries".format(removed_count))
    return kept


def filter_by_upper_threshold(df, colname, threshold, equal=False):
    """Keep only the rows whose ``colname`` value lies below ``threshold``.

    Args:
        df: DataFrame to filter; it is not modified.
        colname: Name of the column that is compared.
        threshold: Upper bound for the kept values.
        equal: When true, rows equal to ``threshold`` are kept as well.

    Returns:
        A filtered DataFrame. The number of removed rows is printed.
    """
    print("Dropping '{}' values above the following threshold: {}".format(colname, threshold))

    column = df[colname]
    keep_mask = (column <= threshold) if equal else (column < threshold)
    kept = df[keep_mask]

    removed_count = df.shape[0] - kept.shape[0]
    print("Dropped {} entries".format(removed_count))
    return kept
    
    
def drop_nan_values(df, colnames, keep_zeros=True):
    """Drop rows that have NaN (and optionally non-positive) values.

    Args:
        df: DataFrame to clean; it is not modified.
        colnames: List of column names that must be present in every kept row.
        keep_zeros: When false, rows are additionally required to have a
            value strictly greater than 0 in each of ``colnames`` (so zeros
            and negative values are removed too).

    Returns:
        The cleaned DataFrame. The total number of removed rows is printed.
    """
    # Take the baseline BEFORE dropping anything so that the reported count
    # includes the NaN rows, as the message promises.
    rows_before = df.shape[0]

    df = df.dropna(axis=0, how='any', subset=colnames)

    if not keep_zeros:
        for col in colnames:
            df = df[df[col] > 0]

    print("Dropped {} zero/NaN entries".format(rows_before - df.shape[0]))
    return df
    

def drop_outliers_iqb(df, colnames):
    """Remove rows that fall outside the 1.5 * IQR fences of the given columns.

    The columns are processed one after another; the quartiles of each
    column are computed on the rows that survived the previous columns.

    Args:
        df: DataFrame to filter; it is not modified.
        colnames: Iterable of column names to check.

    Returns:
        A DataFrame containing only rows whose value in every listed column
        lies within ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds inclusive).
    """
    for col in colnames:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1

        # A boolean mask instead of a concatenated query string: it also
        # works for column names that are not valid Python identifiers
        # (e.g. names containing spaces).
        df = df[df[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
    return df


def show_user_ratings_histogram(df):
    """Show a normalised histogram of the ``vote_average`` column.

    The values are binned into 10 bins over the range 0-10 and the figure is
    displayed immediately.

    Args:
        df: DataFrame with a ``vote_average`` column.
    """
    # ``density`` replaces the ``normed`` keyword, which matplotlib
    # deprecated and later removed from ``hist``.
    df['vote_average'].plot.hist(alpha=0.5, bins=10, range=(0, 10), density=True)
    plt.xlabel("Vote_average")
    plt.ylabel("Percentage")
    plt.show()
    
    
def show_scatter_plot(df, col1, col2):
    """Display a scatter plot of two DataFrame columns.

    Args:
        df: DataFrame holding both columns.
        col1: Column drawn on the x axis; its capitalised name is the x label.
        col2: Column drawn on the y axis; its capitalised name is the y label.
    """
    x_values, y_values = df[col1], df[col2]
    plt.scatter(x_values, y_values, color='navy', alpha=0.5)

    for set_label, column_name in ((plt.xlabel, col1), (plt.ylabel, col2)):
        set_label(column_name.capitalize())

    plt.show()
    

def show_scatter_matrices_plot(df):
    """Draw, save and show a scatter matrix of all columns in ``df``.

    The figure is written to ``scatter_matrix.png`` (300 dpi) before it is
    displayed.

    Args:
        df: DataFrame whose columns are plotted against each other.
    """
    axes_grid = scatter_matrix(df, alpha=0.3, figsize=(12, 12), diagonal='kde', grid=True, edgecolors='blue')

    for axis in axes_grid.reshape(-1):
        # Small, rotated axis labels
        for label in (axis.xaxis.label, axis.yaxis.label):
            label.set_size(8)
            label.set_rotation(45)

        # Offset the rotated y label so it does not overlap the plot area
        axis.get_yaxis().set_label_coords(-0.5, 0.5)

        # Hide all ticks
        axis.set_xticks(())
        axis.set_yticks(())

    plt.savefig("scatter_matrix.png", dpi=300)
    plt.show()
    
def binarize_categorical_feature(df, colname):
    """Expand a '|'-separated categorical column into per-category columns.

    Every cell of ``df[colname]`` is split on '|'. The resulting
    (row number, category) pairs are pivoted into a table with one row per
    row number and one column per category, aggregated with ``len`` and
    filled with 0 where a pair does not occur. All pivot columns except the
    first one are concatenated in front of ``df`` and ``colname`` is dropped.

    Note: ``df[colname]`` is overwritten (NaN -> '') before the new frame is
    built, so the DataFrame passed in by the caller is modified as well.

    Args:
        df: DataFrame holding the column to expand.
        colname: Name of the column with '|'-separated category strings.

    Returns:
        A new DataFrame: the per-category columns first, followed by the
        original columns without ``colname``.
    """
    categories = []
    
    # Replace NaN by an empty string so every cell can be split
    df[colname] = df[colname].fillna('')
    
    # Collect one [row number, category] pair per category of every row.
    # NOTE(review): df[colname][i] looks the row up by index label while i is
    # a position - this assumes a default 0..n-1 index; confirm for frames
    # that were filtered or shuffled beforehand.
    for i in range(len(df[colname])):
        for element in re.split("\|", df[colname][i]):
            categories.append([i, element])
    
    # Pivot the pairs into a row-number x category table
    feature_with_num_label = pd.DataFrame.from_records(categories, columns=['number_label', 'category'])
    binary_feature = feature_with_num_label.pivot_table(
        index=['number_label'], columns=['category'], aggfunc=[len], fill_value=0)
    
    # Use plain string column names instead of tuples such as (len, category)
    binary_feature.columns = [column[1] for column in binary_feature.columns]
    
    # Prepend the pivot columns to the dataframe, skipping the first one.
    # NOTE(review): presumably the skipped column is the '' category created
    # by the NaN fill above - confirm; if no cell is empty, a real category
    # column would be dropped instead.
    df = pd.concat([binary_feature.iloc[:, 1:], df], axis=1)
    
    # Remove the original column from the dataframe
    df = df.drop(colname, axis=1)
    
    return df

def get_polynomial_features(df, deg = 2):
    """Build polynomial and interaction features from the columns of ``df``.

    Args:
        df: DataFrame of input features.
        deg: Degree passed to ``PolynomialFeatures``.

    Returns:
        A new DataFrame with the transformed features, whose columns are
        named by the transformer from the original column names.
    """
    transformer = PolynomialFeatures(degree=deg).fit(df)

    # Newer scikit-learn releases only provide get_feature_names_out, older
    # ones only get_feature_names; use whichever this installation offers.
    if hasattr(transformer, 'get_feature_names_out'):
        column_names = transformer.get_feature_names_out(df.columns)
    else:
        column_names = transformer.get_feature_names(df.columns)

    return pd.DataFrame(transformer.transform(df), columns=column_names)

def encode_strings(df, columns):
    """Replace the values of the given columns by integer labels.

    Missing values are turned into empty strings first, then each column is
    label-encoded independently. The columns are overwritten in ``df``.

    Args:
        df: DataFrame to modify in place.
        columns: Iterable of column names to encode.

    Returns:
        The same DataFrame object, for convenient chaining.
    """
    encoder = LabelEncoder()
    for name in columns:
        without_gaps = df[name].fillna('')
        df[name] = encoder.fit_transform(without_gaps)

    return df
    
    
def separate_dates(df, output_path='data/themoviedb/moviesKnownRevenue.csv'):
    """Add ``release_year``/``release_month``/``release_day`` columns to ``df``.

    The ``release_date`` column is parsed once and its year, month and day
    parts are stored (downcast to the smallest integer type where possible)
    as new columns of ``df``, which is modified in place. The extended frame
    is then written as a ';'-separated CSV file.

    Args:
        df: DataFrame with a ``release_date`` column.
        output_path: Destination of the CSV export. Defaults to the path the
            original implementation always wrote to; pass ``None`` to skip
            the export.

    Returns:
        The same (modified) DataFrame object.
    """
    print('Splitting dates into d/m/y columns')

    # Parse the dates a single time and reuse the result for all three parts.
    release_dates = pd.DatetimeIndex(df['release_date'])
    for part in ('year', 'month', 'day'):
        df['release_' + part] = pd.to_numeric(getattr(release_dates, part), downcast='integer')

    if output_path is not None:
        df.to_csv(output_path, sep=';')
    print ('Date splitting complete')
    return df
    
def show_yearly_revenue(df):
    """Plot the mean ``revenue`` per ``release_year`` as a line chart.

    Rows whose ``release_year`` is not greater than 0 are ignored.

    Args:
        df: DataFrame with ``release_year`` and ``revenue`` columns.
    """
    has_known_year = df['release_year'] > 0
    mean_revenue_by_year = df[has_known_year].groupby('release_year')['revenue'].mean()

    plt.plot(mean_revenue_by_year, color='navy')
    plt.xlabel('Year')
    plt.ylabel('AVG movie revenue')

    plt.show()

    
def get_simple_lr_model(data, target, train_size=7000, test_size=1500):
    """Fit a linear regression on a head/tail split and report its quality.

    The first ``train_size`` rows are used for training and the following
    ``test_size`` rows for testing; the rows are taken in their current
    order and the features are used unscaled. Coefficients, mean squared
    error and R2 score are printed, and a scatter of predicted vs. real
    values is drawn on the current matplotlib figure. The figure is not
    shown here, so repeated calls keep drawing onto the same figure.

    Args:
        data: DataFrame of feature columns.
        target: Series with the value to predict, aligned with ``data``.
        train_size: Number of leading rows used for fitting.
        test_size: Number of rows after the training rows used for scoring.

    Returns:
        A ``(r2, mse)`` tuple computed on the test rows.

    Raises:
        ValueError: If the split leaves the training or the test part empty.
    """
    features = data.values.reshape(data.shape[0], data.shape[1])
    labels = target.values.reshape(target.shape[0], 1)

    test_end = train_size + test_size
    x_train, x_test = features[:train_size], features[train_size:test_end]
    y_train, y_test = labels[:train_size], labels[train_size:test_end]

    # Fail early with a clear message instead of an error from deep inside
    # scikit-learn when the data is too short for the requested split.
    if len(x_train) == 0 or len(x_test) == 0:
        raise ValueError(
            "Not enough rows ({}) for train_size={} and test_size={}".format(
                features.shape[0], train_size, test_size))

    lm = linear_model.LinearRegression()
    lm.fit(x_train, y_train)

    y_pred = lm.predict(x_test)

    # Compute each metric once and reuse it for printing and returning
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # The coefficients
    print('Coefficients:', lm.coef_)
    # The mean squared error
    print("Mean squared error: %g" % mse)
    # Coefficient of determination: 1 is perfect prediction
    print('Variance score: %.5f' % r2)

    # Plot outputs. A single colour is passed: a two-element colour list
    # does not match the number of points, which newer matplotlib rejects.
    plt.scatter(y_pred, y_test, linewidth=1, alpha=0.75, color='navy')

    plt.xlabel('Predicted values')
    plt.ylabel('Real values')

    plt.xticks(())
    plt.yticks(())

    return (r2, mse)
    
    
df = load_dataset('data/themoviedb/moviesKnownRevenue.csv', ';')

# load_dataset signals a failure by returning None; stop with a clear message
# instead of failing on the first DataFrame access below.
if df is None:
    raise RuntimeError('The dataset could not be loaded, see the traceback printed above')

# Drop zero/NaN value rows
# df = drop_nan_values(df, ['revenue'], False)

# Drop unnecessary columns
# df = df.drop(['id'], axis=1)

# NOTE(review): 'popularity' presumably uses decimal commas in the CSV and is
# therefore read as text - confirm. Convert it to real floats first so that
# it takes part in the median imputation below.
df['popularity'] = df['popularity'].replace(',', '.', regex=True).astype(float)

# Fill missing numeric values with the column median. numeric_only=True is
# required because the frame still contains text columns at this point.
df = df.fillna(df.median(numeric_only=True))
# df[['vote_average', 'vote_count']] = df[['vote_average', 'vote_count']].replace([0, 0], [df['vote_average'].mean(), df['vote_count'].mean()])

df = encode_strings(df, ['original_language', 'production_countries', 'spoken_languages'])

# Binarize genres
#pd.set_option('display.max_columns', 50)
df = binarize_categorical_feature(df, 'genres')
#print ("Dataframe:")
#display(df)

# Polynomial features (computed here but not yet fed into the model below)
poly_df = get_polynomial_features(df[['production_companies_number', 'budget', 'vote_average', 'vote_count', 'original_language']], 2)

##############################

# User ratings histogram
# show_user_ratings_histogram(df)

# Scatter plot of revenue & budget
# show_scatter_plot(df, 'budget', 'revenue')

# Scatter matrices
# show_scatter_matrices_plot(df)

# Yearly revenue bar chart
# show_yearly_revenue(df)

# A basic linear regression model, averaged over N runs. The rows are
# reshuffled before every run (when N > 1) because the model always trains on
# the leading rows of the frame.
feature_columns = [
    'budget', 'vote_count', 'popularity', 'runtime',
    'production_companies_number', 'production_countries_number', 'spoken_languages_number',
    'release_year',
]
avg_score, avg_mse, N = 0, 0, 100

for _ in range(N):
    if N > 1:
        df = shuffle(df)
    results = get_simple_lr_model(df[feature_columns], df['revenue'])
    avg_score += results[0]
    avg_mse += results[1]

avg_score /= N
avg_mse /= N

print('**** AVG SCORE: {} ****'.format(avg_score))
print('**** AVG MSE: {} ****'.format(avg_mse))

Successfully loaded the dataset from data/themoviedb/moviesKnownRevenue.csv
Rows: 8584, columns: 25
('Coefficients:', array([[  1.77691314e+00,   7.25005780e+04,   8.72881363e+05,
         -1.97231581e+03,  -3.77200274e+06,  -4.19539639e+06,
          7.88472839e+05,  -2.28755553e+05]]))
Mean squared error: 5.03485e+15
Variance score: 0.72308
('Coefficients:', array([[  1.77265957e+00,   7.24923350e+04,   2.36977302e+06,
          1.16233389e+04,  -3.91977016e+06,  -4.38977688e+06,
          2.64682438e+05,  -2.57515909e+05]]))
Mean squared error: 3.66108e+15
Variance score: 0.73996
('Coefficients:', array([[  1.71108570e+00,   7.30753037e+04,   3.05481525e+06,
         -1.40812365e+04,  -4.05388807e+06,  -3.16106048e+06,
         -9.27974732e+04,  -2.66352760e+05]]))
Mean squared error: 4.96606e+15
Variance score: 0.69536
('Coefficients:', array([[  1.69117974e+00,   7.51306784e+04,   1.81864051e+06,
          9.03607619e+03,  -3.81726385e+06,  -3.88621013e+06,
         -4.65736322e+0

('Coefficients:', array([[  1.69412323e+00,   7.17720068e+04,   2.80559835e+06,
         -2.12164357e+04,  -3.28797370e+06,  -4.00017066e+06,
          1.49057109e+05,  -2.49683343e+05]]))
Mean squared error: 6.19043e+15
Variance score: 0.73011
('Coefficients:', array([[  1.71633207e+00,   7.60066424e+04,  -4.81054643e+05,
         -9.62199250e+03,  -3.63568607e+06,  -2.35350156e+06,
          7.90729862e+05,  -2.55987374e+05]]))
Mean squared error: 7.88382e+15
Variance score: 0.70575
('Coefficients:', array([[  1.73581299e+00,   6.98435530e+04,   1.89431184e+06,
         -2.01087239e+03,  -3.78189387e+06,  -3.51411938e+06,
          3.64834554e+04,  -2.36241842e+05]]))
Mean squared error: 4.35814e+15
Variance score: 0.75794
('Coefficients:', array([[  1.63492110e+00,   7.20072950e+04,   2.68198412e+06,
         -1.24268641e+04,  -4.04193022e+06,  -2.10564624e+06,
          6.88745245e+05,  -2.64222956e+05]]))
Mean squared error: 5.14206e+15
Variance score: 0.77024
('Coefficients:', ar

('Coefficients:', array([[  1.72512102e+00,   6.65789704e+04,   3.21719060e+06,
         -1.80816660e+02,  -3.51640374e+06,  -3.21420478e+06,
         -1.22334227e+06,  -2.09635159e+05]]))
Mean squared error: 7.01499e+15
Variance score: 0.70813
('Coefficients:', array([[  1.69139815e+00,   7.52687683e+04,   1.81649144e+06,
         -2.13397448e+04,  -4.07599723e+06,  -2.78842111e+06,
          2.71459459e+05,  -2.27282208e+05]]))
Mean squared error: 4.25376e+15
Variance score: 0.75398
('Coefficients:', array([[  1.72412731e+00,   7.04654830e+04,   2.54794056e+06,
         -3.96229137e+03,  -4.19284820e+06,  -3.25296945e+06,
         -4.43933351e+04,  -2.22834514e+05]]))
Mean squared error: 6.4537e+15
Variance score: 0.68668
('Coefficients:', array([[  1.71202396e+00,   7.26311625e+04,   2.94879281e+06,
          1.41548923e+03,  -4.10292017e+06,  -3.49477212e+06,
         -1.44394235e+05,  -2.37602445e+05]]))
Mean squared error: 4.16962e+15
Variance score: 0.75943
('Coefficients:', arr