In [12]:
import os
import pandas as pd

In [13]:
files = "./data"

In [138]:
def preprocess_data(files):
    dfs = []
    directing_df = pd.DataFrame()
    writing_df = pd.DataFrame()
    for file in os.listdir(files):
        fn_no_ext = file.split(".")[0]
        if ".json" in file:
            globals()[fn_no_ext+"_df"] = pd.read_json(os.path.join(files, file))
            print("Created ", fn_no_ext+"_df", " dataframe.")
        elif ".csv" in file:
            if "validation" in file:
                validation_df = pd.read_csv(os.path.join(files, file), index_col=0, na_values=['\\N'])
                continue
            elif "test" in file:
                test_df = pd.read_csv(os.path.join(files, file), index_col=0, na_values=['\\N'])
                continue
            df = pd.read_csv(os.path.join(files, file), index_col=0, na_values=['\\N'])
            print("Appending: ", df.shape[0], " rows...")
            dfs.append(df)
            train_df = pd.concat(dfs, ignore_index=True)
        else: 
            pass
        
    directing_df = globals()['directing_df']
    directing_df.replace('\\N', pd.NA, inplace=True)
    writing_df = globals()['writing_df']    
    writing_df.replace('\\N', pd.NA, inplace=True)


    return train_df, validation_df, test_df, directing_df, writing_df
    


In [139]:
train, validation, test, directing_df, writing_df = preprocess_data(files)
print("Train shape: ", train.shape)
print("Validation shape: ", validation.shape)
print("Test shape: ", test.shape)

Created  directing_df  dataframe.
Appending:  963  rows...
Appending:  993  rows...
Appending:  951  rows...
Appending:  1049  rows...
Appending:  974  rows...
Appending:  1001  rows...
Appending:  1035  rows...
Appending:  993  rows...
Created  writing_df  dataframe.
Train shape:  (7959, 8)
Validation shape:  (955, 7)
Test shape:  (1086, 7)


## Data cleaning

In [140]:
# If needed

## EDA

There are N director for each movie

In [141]:
directors_df = pd.read_json(os.path.join(files, "directing.json"))
directors_df["movie"].value_counts()


movie
tt1687247    35
tt1935896    27
tt0401711    21
tt0099273    13
tt1333125    13
             ..
tt0137523     1
tt0137799     1
tt0138074     1
tt0138097     1
tt9911196     1
Name: count, Length: 10000, dtype: int64

There are M writters for each movie

In [143]:
writers_df = pd.read_json(os.path.join(files, "writing.json"))
writers_df["movie"].value_counts()
writers_df["writer"].value_counts()

writer
\N           297
nm0000636     25
nm0372942     25
nm0440604     18
nm0080327     16
            ... 
nm0813219      1
nm0207566      1
nm0104631      1
nm0513197      1
nm3547655      1
Name: count, Length: 15248, dtype: int64

## Simple Machine Learning Pipeline

In [144]:
# TODO add writters and directors as categorical features to the data, for instance movie tt0401711 will have 29 ones and 15247 zeros

In [145]:
# TODO do the same with directors

In [118]:
feature_cols = ["runtimeMinutes", "numVotes"]
X, y = train[feature_cols], train["label"]

In [123]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [125]:
clf = RandomForestClassifier(max_depth=2, random_state=42)

# Define the parameter grid to search
param_grid = {
    'max_depth': [2, 4, 6, 8],
    'n_estimators': [50, 100, 200, 300]
}


# Initialize GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')

# Fit the model with GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters and estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Print the best parameters
print("Best parameters:", best_params)

# Use the best estimator to make predictions
y_pred = best_estimator.predict(X_test)

# Measure accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Perform K-fold cross-validation and print the mean accuracy
cv_accuracy = cross_val_score(best_estimator, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validated Accuracy:", cv_accuracy.mean())

Best parameters: {'max_depth': 4, 'n_estimators': 200}
Accuracy: 0.6890703517587939
Cross-validated Accuracy: 0.6799079049107105


## Prepare CSV files for kaggle

Asuming that the format will be index + prediction

In [131]:
validation["label"] = best_estimator.predict(validation[feature_cols])
test["label"] = best_estimator.predict(test[feature_cols])

validation["label"].to_csv("val_result.csv")
test["label"].to_csv("test_result.csv")