# Rating prediction

In [127]:
# Import necessary libraries
import pandas as pd
import numpy as np
import json
import duckdb

# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import preprocessing

In [128]:
# create a connection to a file called 'imdb_reviews.db'
con = duckdb.connect('imdb_reviews.db')

In [129]:
con.execute("DESCRIBE").fetchdf()

Unnamed: 0,table_name,column_names,column_types,temporary
0,budget_movie_data,"[release_date, movie, production_budget, domes...","[TIMESTAMP, VARCHAR, VARCHAR, VARCHAR, VARCHAR...",False
1,dummie_director,"[movie, director_1_dum, director_2_dum, direct...","[VARCHAR, UTINYINT, UTINYINT, UTINYINT, UTINYI...",False
2,dummie_director_all,"[movie, director_nm0000005, director_nm0000008...","[VARCHAR, UTINYINT, UTINYINT, UTINYINT, UTINYI...",False
3,dummie_director_top_5,"[movie, director_1_dum, director_2_dum, direct...","[VARCHAR, UTINYINT, UTINYINT, UTINYINT, UTINYI...",False
4,dummie_writer,"[movie, writer_1_dum, writer_2_dum, writer_3_d...","[VARCHAR, UTINYINT, UTINYINT, UTINYINT, UTINYI...",False
5,dummie_writer_all,"[movie, writer_nm0000005, writer_nm0000019, wr...","[VARCHAR, UTINYINT, UTINYINT, UTINYINT, UTINYI...",False
6,dummie_writer_top_5,"[movie, writer_1_dum, writer_2_dum, writer_3_d...","[VARCHAR, UTINYINT, UTINYINT, UTINYINT, UTINYI...",False
7,imdb_review_test,"[tconst, primaryTitle, originalTitle, startYea...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR, ...",False
8,imdb_review_train,"[tconst, primaryTitle, originalTitle, startYea...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR, ...",False
9,imdb_review_validation,"[tconst, primaryTitle, originalTitle, startYea...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR, ...",False


# Loading train, validation and test set

In [130]:
# Load train data set
df_train = con.execute("""SELECT imdb_review_train.tconst,
                            imdb_review_train.startYear,
                            imdb_review_train.primaryTitle,
 
                           imdb_review_train.runtimeMinutes,
                           imdb_review_train.numVotes,
                           
                           oscars.year_film,
                           oscars.film,
                           oscars.name,
                           oscars.winner,
                           
                           dummie_writer_all.*,
                           dummie_director_all.*,
                           
                           budget_movie_data.production_budget,
                           budget_movie_data.domestic_gross,
                           budget_movie_data.worldwide_gross,
                           
                           imdb_review_train.label
                           
                    FROM imdb_review_train
                    
                    -- Merge the oscar information
                    LEFT JOIN oscars
                    ON imdb_review_train.startYear = oscars.year_film 
                    AND imdb_review_train.primaryTitle = oscars.film
                    
                    -- Merge the writer information
                    LEFT JOIN dummie_writer_all
                    ON imdb_review_train.tconst = dummie_writer_all.movie  
                    
                    -- Merge the director information
                    LEFT JOIN dummie_director_all
                    ON imdb_review_train.tconst = dummie_director_all.movie  
                    
                    -- Merge the budget information
                    LEFT JOIN budget_movie_data
                    ON imdb_review_train.startYear = budget_movie_data.release_year 
                    AND imdb_review_train.primaryTitle = budget_movie_data.movie
                    
                    """).fetchdf()
df_train

Unnamed: 0,tconst,startYear,primaryTitle,runtimeMinutes,numVotes,year_film,film,name,winner,movie,...,director_nm9825418,director_nm9902054,director_nm9942830,director_nm9955258,director_nm9958352,director_nm9985316,production_budget,domestic_gross,worldwide_gross,label
0,tt0056217,1962,the man who shot liberty valance,123,73331.0,1962.0,the man who shot liberty valance,edith head,False,tt0056217,...,0.0,0.0,0.0,0.0,0.0,0.0,"$3,200,000","$8,000,000","$8,000,000",True
1,tt0057163,1963,hud,112,21102.0,1963.0,hud,paul newman,False,tt0057163,...,0.0,0.0,0.0,0.0,0.0,0.0,"$2,500,000","$10,000,000","$10,000,000",True
2,tt0070735,1973,the sting,129,,1973.0,the sting,robert redford,False,tt0070735,...,0.0,0.0,0.0,0.0,0.0,0.0,"$5,500,000","$159,616,327","$159,616,327",True
3,tt0087892,1984,a passage to india,164,,1984.0,a passage to india,judy davis,False,tt0087892,...,0.0,0.0,0.0,0.0,0.0,0.0,"$27,500,000","$27,187,653","$27,187,896",True
4,tt0108551,1993,whats love got to do with it,118,20131.0,1993.0,whats love got to do with it,laurence fishburne,False,tt0108551,...,0.0,0.0,0.0,0.0,0.0,0.0,"$15,000,000","$39,100,956","$39,100,956",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7942,tt0086907,1984,antonio gaudi,72,1557.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,True
7943,tt0117293,1996,paradise lost the child murders at robin hood ...,150,16296.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,True
7944,tt0847817,2007,helvetica,80,7786.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,True
7945,tt4881578,2020,zappa,129,2561.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,True


In [135]:
# Load test data set
df_test = con.execute("""SELECT imdb_review_test.tconst,
                            imdb_review_test.startYear,
                            imdb_review_test.primaryTitle,
 
                           imdb_review_test.runtimeMinutes,
                           imdb_review_test.numVotes,
                           
                           oscars.year_film,
                           oscars.film,
                           oscars.name,
                           oscars.winner,
                           
                           dummie_writer_all.*,
                           dummie_director_all.*,
                           
                           budget_movie_data.production_budget,
                           budget_movie_data.domestic_gross,
                           budget_movie_data.worldwide_gross
                           
                    FROM imdb_review_test
                    
                    -- Merge the oscar information
                    LEFT JOIN oscars
                    ON imdb_review_test.startYear = oscars.year_film 
                    AND imdb_review_test.primaryTitle = oscars.film
                    
                    -- Merge the writer information
                    LEFT JOIN dummie_writer_all
                    ON imdb_review_test.tconst = dummie_writer_all.movie  
                    
                    -- Merge the director information
                    LEFT JOIN dummie_director_all
                    ON imdb_review_test.tconst = dummie_director_all.movie  
                    
                    -- Merge the budget information
                    LEFT JOIN budget_movie_data
                    ON imdb_review_test.startYear = budget_movie_data.release_year 
                    AND imdb_review_test.primaryTitle = budget_movie_data.movie
                    
                    """).fetchdf()
df_test

Unnamed: 0,tconst,startYear,primaryTitle,runtimeMinutes,numVotes,year_film,film,name,winner,movie,...,director_nm9818807,director_nm9825418,director_nm9902054,director_nm9942830,director_nm9955258,director_nm9958352,director_nm9985316,production_budget,domestic_gross,worldwide_gross
0,tt0038650,1946,its a wonderful life,130,427625.0,1946.0,its a wonderful life,james stewart,False,tt0038650,...,0,0,0,0,0,0,0,"$3,180,000","$12,241,691","$16,420,592"
1,tt0021730,1931,the champ,86,3057.0,1931.0,the champ,wallace beery,True,tt0021730,...,0,0,0,0,0,0,0,,,
2,tt0028356,1936,these three,93,2627.0,1936.0,these three,bonita granville,False,tt0028356,...,0,0,0,0,0,0,0,,,
3,tt0027657,1936,the garden of allah,79,1519.0,1936.0,the garden of allah,eric g stacey,False,tt0027657,...,0,0,0,0,0,0,0,,,
4,tt0031619,1939,the man in the iron mask,113,1662.0,1939.0,the man in the iron mask,lud gluskin lucien moraweck,False,tt0031619,...,0,0,0,0,0,0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081,tt8991264,2019,hớaxed,125,1302.0,,,,,,...,0,0,0,0,0,0,0,,,
1082,tt9008642,2017,notuku potu,121,4423.0,,,,,,...,0,0,0,0,0,0,0,,,
1083,tt0135641,1996,sapoot,158,1107.0,,,,,,...,0,0,0,0,0,0,0,,,
1084,tt0256408,2001,startupcom,107,3386.0,,,,,,...,0,0,0,0,0,0,0,,,


In [136]:
# Load validation data set
df_validation = con.execute("""SELECT imdb_review_validation.tconst,
                            imdb_review_validation.startYear,
                            imdb_review_validation.primaryTitle,
 
                           imdb_review_validation.runtimeMinutes,
                           imdb_review_validation.numVotes,
                           
                           oscars.year_film,
                           oscars.film,
                           oscars.name,
                           oscars.winner,
                           
                           dummie_writer_all.*,
                           dummie_director_all.*,
                           
                           budget_movie_data.production_budget,
                           budget_movie_data.domestic_gross,
                           budget_movie_data.worldwide_gross
                           
                    FROM imdb_review_validation
                    
                    -- Merge the oscar information
                    LEFT JOIN oscars
                    ON imdb_review_validation.startYear = oscars.year_film 
                    AND imdb_review_validation.primaryTitle = oscars.film
                    
                    -- Merge the writer information
                    LEFT JOIN dummie_writer_all
                    ON imdb_review_validation.tconst = dummie_writer_all.movie  
                    
                    -- Merge the director information
                    LEFT JOIN dummie_director_all
                    ON imdb_review_validation.tconst = dummie_director_all.movie  
                    
                    -- Merge the budget information
                    LEFT JOIN budget_movie_data
                    ON imdb_review_validation.startYear = budget_movie_data.release_year 
                    AND imdb_review_validation.primaryTitle = budget_movie_data.movie
                    
                    """).fetchdf()
df_validation

Unnamed: 0,tconst,startYear,primaryTitle,runtimeMinutes,numVotes,year_film,film,name,winner,movie,...,director_nm9818807,director_nm9825418,director_nm9902054,director_nm9942830,director_nm9955258,director_nm9958352,director_nm9985316,production_budget,domestic_gross,worldwide_gross
0,tt0054331,1960,spartacus,197,130879.0,1960.0,spartacus,peter ustinov,True,tt0054331,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"$12,000,000","$30,000,000","$60,000,000"
1,tt0026725,1935,les miserables,108,3512.0,1935.0,les miserables,eric stacey,False,tt0026725,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
2,tt0031826,1939,the private lives of elizabeth and essex,106,5507.0,1939.0,the private lives of elizabeth and essex,anton grot,False,tt0031826,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
3,tt0033563,1941,dumbo,64,126856.0,1941.0,dumbo,frank churchill oliver wallace,True,tt0033563,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
4,tt0036367,1943,so proudly we hail,126,1700.0,1943.0,so proudly we hail,paulette goddard,False,tt0036367,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
950,tt5519566,2016,the first mớnday in may,90,1659.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
951,tt9074318,2019,son rise,49,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
952,tt15085794,2021,untold deal with the devil,77,1856.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
953,tt2079571,2011,roman polanski a film memoir,90,1204.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


## Decision Tree, use XGBoost instead

In [137]:
# from sklearn import preprocessing
# lbl = preprocessing.LabelEncoder()
# df_train['director_1'] = lbl.fit_transform(df_train['director_1'].astype(str))

# # Step 0: Prepare dataset and remove rows with NaN values (not possible with decision tree model)
# df_no_nan = df_train[['runtimeMinutes', 'numVotes','director_1',
#              'label']].dropna()

# # Step 1: Define explanatory and target variables
# X = df_no_nan[['runtimeMinutes', 'numVotes', 'director_1']]
# y = df_no_nan['label']

# # Step 2: Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.25, random_state=0)

# # Step 3: Normalize the data for numerical stability
# ss_train = StandardScaler()
# X_train = ss_train.fit_transform(X_train)

# ss_test = StandardScaler()
# X_test = ss_test.fit_transform(X_test)

# # Step 4: Fit a decision tree model to the training data
# clf = DecisionTreeClassifier()

# # Step 5: Make predictions on the testing data
# clf = clf.fit(X_train,y_train)

# # Step 6: Calculate the accuracy score by comparing the actual values and predicted values.
# predictions = clf.predict(X_test)

# cm = confusion_matrix(y_test, predictions)

# TN, FP, FN, TP = confusion_matrix(y_test, predictions).ravel()

# print('True Positive(TP)  = ', TP)
# print('False Positive(FP) = ', FP)
# print('True Negative(TN)  = ', TN)
# print('False Negative(FN) = ', FN)

# accuracy_score(y_test, predictions)


### Predictions decision tree

In [138]:
# from sklearn import preprocessing
# lbl = preprocessing.LabelEncoder()
# df_test['director_1'] = lbl.fit_transform(df_test['director_1'].astype(str))


# # fill all NaN with 0, THIS NEEDS TO BE CHANGED!! But the model does not accept NaN values
# test_df = df_test.fillna(0)

# ss_train = StandardScaler()
# test_df = ss_train.fit_transform(test_df[['runtimeMinutes', 'numVotes', 'director_1']])

# predictions_test = clf.predict(test_df)

# pd.DataFrame(predictions_test).to_csv('test_results_decisiontree.csv', index=False, header=False)

In [139]:
# from sklearn import preprocessing
# lbl = preprocessing.LabelEncoder()
# df_validation['director_1'] = lbl.fit_transform(df_validation['director_1'].astype(str))


# # fill all NaN with 0, THIS NEEDS TO BE CHANGED!! But the model does not accept NaN values
# validated_df = df_validation.fillna(0)

# ss_train = StandardScaler()
# validated_df = ss_train.fit_transform(validated_df[['runtimeMinutes', 'numVotes', 'director_1']])

# predictions_validation = clf.predict(validated_df)

# predictions_validation

# pd.DataFrame(predictions_validation).to_csv('validation_results_decisiontree.csv', index=False, header=False)

## XGBoost

In [131]:
# Step 1: Define explanatory (fit into model) and target variables
X = df_train.drop(['primaryTitle', 'tconst', 'film', 'name', 'label', 'movie', 'movie_2'], axis=1)
# X = df_train[['startYear', ]]
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Iterate over column names
for column in X:
    if X[column].dtype.kind == 'O':
        X[column]= label_encoder.fit_transform(X[column])
                          
y = df_train['label']

X

Unnamed: 0,startYear,runtimeMinutes,numVotes,year_film,winner,writer_nm0000005,writer_nm0000019,writer_nm0000027,writer_nm0000033,writer_nm0000036,...,director_nm9818807,director_nm9825418,director_nm9902054,director_nm9942830,director_nm9955258,director_nm9958352,director_nm9985316,production_budget,domestic_gross,worldwide_gross
0,44,123,73331.0,1962.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,91,751,772
1,45,112,21102.0,1963.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62,23,33
2,55,129,,1973.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,129,174,183
3,66,164,,1984.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87,370,376
4,75,118,20131.0,1993.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38,497,498
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7942,66,72,1557.0,,2,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,190,835,854
7943,78,150,16296.0,,2,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,190,835,854
7944,89,80,7786.0,,2,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,190,835,854
7945,102,129,2561.0,,2,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,190,835,854


In [132]:
# Step 2: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=7)

# Step 3: Create and fit data into the XGB model
#model = XGBClassifier(eval_metric='mlogloss')


model = XGBClassifier(silent=False, 
                      scale_pos_weight=1,
                      learning_rate=0.01,  
                      colsample_bytree = 0.4,
                      subsample = 0.8,
                      objective='binary:logistic', 
                      n_estimators=1000, 
                      reg_alpha = 0.3,
                      max_depth=4, 
                      gamma=10)

eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=True)

# Step 4: Make predictions on the testing data
y_pred = model.predict(X_test)

# Step 5: Calculate the accuracy score by comparing the actual values and predicted values.
accuracy = accuracy_score(y_test, y_pred)
accuracy



Parameters: { "silent" } are not used.

[0]	validation_0-logloss:0.69105
[1]	validation_0-logloss:0.68902
[2]	validation_0-logloss:0.68823
[3]	validation_0-logloss:0.68654
[4]	validation_0-logloss:0.68528
[5]	validation_0-logloss:0.68405
[6]	validation_0-logloss:0.68310
[7]	validation_0-logloss:0.68223
[8]	validation_0-logloss:0.68132
[9]	validation_0-logloss:0.67995
[10]	validation_0-logloss:0.67843
[11]	validation_0-logloss:0.67757
[12]	validation_0-logloss:0.67572
[13]	validation_0-logloss:0.67485
[14]	validation_0-logloss:0.67358
[15]	validation_0-logloss:0.67229
[16]	validation_0-logloss:0.67111
[17]	validation_0-logloss:0.66988
[18]	validation_0-logloss:0.66833
[19]	validation_0-logloss:0.66711
[20]	validation_0-logloss:0.66539
[21]	validation_0-logloss:0.66462
[22]	validation_0-logloss:0.66393
[23]	validation_0-logloss:0.66282
[24]	validation_0-logloss:0.66170
[25]	validation_0-logloss:0.66004
[26]	validation_0-logloss:0.65838
[27]	validation_0-logloss:0.65712
[28]	validation_0-

[233]	validation_0-logloss:0.53937
[234]	validation_0-logloss:0.53937
[235]	validation_0-logloss:0.53908
[236]	validation_0-logloss:0.53906
[237]	validation_0-logloss:0.53899
[238]	validation_0-logloss:0.53862
[239]	validation_0-logloss:0.53854
[240]	validation_0-logloss:0.53842
[241]	validation_0-logloss:0.53833
[242]	validation_0-logloss:0.53781
[243]	validation_0-logloss:0.53752
[244]	validation_0-logloss:0.53702
[245]	validation_0-logloss:0.53686
[246]	validation_0-logloss:0.53671
[247]	validation_0-logloss:0.53662
[248]	validation_0-logloss:0.53645
[249]	validation_0-logloss:0.53634
[250]	validation_0-logloss:0.53625
[251]	validation_0-logloss:0.53609
[252]	validation_0-logloss:0.53589
[253]	validation_0-logloss:0.53583
[254]	validation_0-logloss:0.53566
[255]	validation_0-logloss:0.53566
[256]	validation_0-logloss:0.53566
[257]	validation_0-logloss:0.53512
[258]	validation_0-logloss:0.53504
[259]	validation_0-logloss:0.53466
[260]	validation_0-logloss:0.53454
[261]	validation_0-l

[468]	validation_0-logloss:0.50973
[469]	validation_0-logloss:0.50964
[470]	validation_0-logloss:0.50964
[471]	validation_0-logloss:0.50946
[472]	validation_0-logloss:0.50944
[473]	validation_0-logloss:0.50941
[474]	validation_0-logloss:0.50933
[475]	validation_0-logloss:0.50933
[476]	validation_0-logloss:0.50927
[477]	validation_0-logloss:0.50919
[478]	validation_0-logloss:0.50918
[479]	validation_0-logloss:0.50915
[480]	validation_0-logloss:0.50908
[481]	validation_0-logloss:0.50896
[482]	validation_0-logloss:0.50896
[483]	validation_0-logloss:0.50896
[484]	validation_0-logloss:0.50891
[485]	validation_0-logloss:0.50886
[486]	validation_0-logloss:0.50883
[487]	validation_0-logloss:0.50877
[488]	validation_0-logloss:0.50876
[489]	validation_0-logloss:0.50860
[490]	validation_0-logloss:0.50847
[491]	validation_0-logloss:0.50842
[492]	validation_0-logloss:0.50840
[493]	validation_0-logloss:0.50833
[494]	validation_0-logloss:0.50832
[495]	validation_0-logloss:0.50827
[496]	validation_0-l

[703]	validation_0-logloss:0.50201
[704]	validation_0-logloss:0.50197
[705]	validation_0-logloss:0.50197
[706]	validation_0-logloss:0.50189
[707]	validation_0-logloss:0.50189
[708]	validation_0-logloss:0.50189
[709]	validation_0-logloss:0.50189
[710]	validation_0-logloss:0.50189
[711]	validation_0-logloss:0.50189
[712]	validation_0-logloss:0.50189
[713]	validation_0-logloss:0.50189
[714]	validation_0-logloss:0.50185
[715]	validation_0-logloss:0.50185
[716]	validation_0-logloss:0.50185
[717]	validation_0-logloss:0.50185
[718]	validation_0-logloss:0.50185
[719]	validation_0-logloss:0.50179
[720]	validation_0-logloss:0.50176
[721]	validation_0-logloss:0.50176
[722]	validation_0-logloss:0.50176
[723]	validation_0-logloss:0.50176
[724]	validation_0-logloss:0.50176
[725]	validation_0-logloss:0.50173
[726]	validation_0-logloss:0.50173
[727]	validation_0-logloss:0.50169
[728]	validation_0-logloss:0.50169
[729]	validation_0-logloss:0.50168
[730]	validation_0-logloss:0.50168
[731]	validation_0-l

0.7655356462066336

### Predictions XGBoost  # doesn't work we have to merge oscar data to validation and test set first 

In [144]:
# Define explanatory (fit into model) and target variables
X = df_validation.drop(['primaryTitle', 'tconst', 'film', 'name', 'movie', 'movie_2'], axis=1)

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Iterate over column names
for column in X:
    if X[column].dtype.kind == 'O':
        X[column]= label_encoder.fit_transform(X[column])
                          

# validation_set = df_validation[['runtimeMinutes', 'numVotes', 'director_1', 'director_1_dum', 'writer_1', 'writer_2', 'director_2']]
predictions_validation = model.predict(X)
pd.DataFrame(predictions_validation).to_csv('validation_results_xgboost.csv', index=False, header=False)

In [148]:
#Each line in these files must consist of either the string True or the string False

test_end = predictions_test.copy()                               
test_end = test_end.astype(bool)          # Transform integer to boolean
pd.DataFrame(test_end).to_csv('test_results_xgboost.csv', index=False, header=False)


validation_end = predictions_validation.copy()                               
validation_end = validation_end.astype(bool)          # Transform integer to boolean
pd.DataFrame(validation_end).to_csv('validation_results_xgboost.csv', index=False, header=False)



In [145]:
# Define explanatory (fit into model) and target variables
X = df_test.drop(['primaryTitle', 'tconst', 'film', 'name', 'movie', 'movie_2'], axis=1)

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Iterate over column names
for column in X:
    if X[column].dtype.kind == 'O':
        X[column]= label_encoder.fit_transform(X[column])
                          

# validation_set = df_validation[['runtimeMinutes', 'numVotes', 'director_1', 'director_1_dum', 'writer_1', 'writer_2', 'director_2']]
predictions_test = model.predict(X)
pd.DataFrame(predictions_test).to_csv('test_results_xgboost.csv', index=False, header=False)



In [126]:
con.close()