In [2]:
import pandas as pd
import  duckdb
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
sns.set_style("whitegrid")
import os

%load_ext autoreload
from duckdb_flow.etl import preprocess_data, get_processed_files
from duckdb_flow.create_database import create_database

The autoreload module is not an IPython extension.


In [4]:
# Directory that the loading cell scans (via os.listdir) for input CSV/JSON files.
STAGING_PATH = "./data"

In [3]:
def is_valid(file: str) -> bool:
    """
    Check whether a file name denotes a CSV or JSON file; anything else is ignored.

    The decision is based on the name's suffix, so a name that merely
    contains ".csv" or ".json" somewhere in the middle (for example
    "train.csv.bak") is rejected. The comparison is case-sensitive.

    Args:
        file: File name (or path) to check.

    Returns:
        True if the name ends with ".csv" or ".json", otherwise False.
    """
    return file.endswith((".csv", ".json"))

In [5]:
# Load every valid CSV/JSON file from STAGING_PATH into DataFrames.
#   * JSON files  -> a module-level variable named "<stem>_df"
#                    (e.g. directing.json -> directing_df).
#   * CSV files   -> validation_df / test_df when the file name contains
#                    "validation" / "test"; every other CSV is treated as a
#                    training shard and ends up in train_df.
trigger = 0  # not read anywhere in this cell; kept in case other cells rely on it
dfs = []
new_files = []
# Defaults so both names exist even when the matching JSON file is absent.
directing_df = pd.DataFrame()
writing_df = pd.DataFrame()

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of train_df (and therefore any seeded split of it) reproducible.
for file in sorted(os.listdir(STAGING_PATH)):
    if not is_valid(file):
        continue
    new_files.append(file)
    path = os.path.join(STAGING_PATH, file)
    fn_no_ext = file.split(".")[0]
    if ".json" in file:
        # Publishes the frame as a notebook-level variable named after the file.
        globals()[fn_no_ext+"_df"] = pd.read_json(path)
        print("Created ", fn_no_ext+"_df", " dataframe.")
    elif ".csv" in file:
        # '\N' is the missing-value marker used in the CSV files.
        df = pd.read_csv(path, index_col=0, na_values=['\\N'])
        if "validation" in file:
            validation_df = df
        elif "test" in file:
            test_df = df
        else:
            print("Appending: ", df.shape[0], " rows...")
            dfs.append(df)

# Concatenate once after the loop instead of on every iteration; fall back to
# an empty frame so train_df is always defined.
train_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

Created  directing_df  dataframe.
Appending:  963  rows...
Appending:  993  rows...
Appending:  951  rows...
Appending:  1049  rows...
Appending:  974  rows...
Appending:  1001  rows...
Appending:  1035  rows...
Appending:  993  rows...
Created  writing_df  dataframe.


In [8]:
# Display the combined training frame; the rendered output elides the middle rows.
train_df

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,tt0010600,The Doll,Die Puppe,1919.0,,66.0,1898.0,True
1,tt0011841,Way Down East,Way Down East,1920.0,,145.0,5376.0,True
2,tt0012494,Déstiny,Der müde Tod,1921.0,,97.0,5842.0,True
3,tt0015163,The Navigator,The Navigator,1924.0,,59.0,9652.0,True
4,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925.0,,93.0,17887.0,True
...,...,...,...,...,...,...,...,...
7954,tt9625664,Trauma Center,,2019.0,,87.0,12951.0,False
7955,tt9741310,Slaxx,Slaxx,2020.0,,77.0,2464.0,False
7956,tt9742392,Kindred,Kindred,2020.0,,101.0,1719.0,False
7957,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,,2020.0,,111.0,4144.0,True


In [11]:
# Columns used as model inputs; the "label" column is the prediction target.
feature_cols = ["runtimeMinutes", "numVotes"]
X = train_df[feature_cols]
y = train_df["label"]

In [12]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Hyper-parameter candidates explored by the grid search (4 x 4 combinations).
param_grid = dict(
    max_depth=[2, 4, 6, 8],
    n_estimators=[50, 100, 200, 300],
)

# Base estimator; its max_depth is replaced by each grid candidate during the
# search, while the fixed random_state applies to every candidate.
clf = RandomForestClassifier(max_depth=2, random_state=42)

# 5-fold cross-validated search over the grid, scored by accuracy.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning configuration and the estimator fitted with it.
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Accuracy on the held-out split.
y_pred = best_estimator.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# 5-fold cross-validated accuracy of the selected estimator on the training split.
cv_accuracy = cross_val_score(
    best_estimator,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
print("Cross-validated Accuracy:", cv_accuracy.mean())

Best parameters: {'max_depth': 4, 'n_estimators': 200}
Accuracy: 0.6890703517587939
Cross-validated Accuracy: 0.6799079049107105


In [14]:
# Output file -> frame to score. Each frame gets a "label" column holding the
# predictions of the tuned estimator.
prediction_targets = {
    "val_result.csv": validation_df,
    "test_result.csv": test_df,
}

# Predict for both frames first, so nothing is written if either prediction fails.
for frame in prediction_targets.values():
    frame["label"] = best_estimator.predict(frame[feature_cols])

# Write only the predicted labels, one per line, without index or header.
for out_path, frame in prediction_targets.items():
    frame["label"].to_csv(out_path, index=False, header=False)