# Init


In [None]:
# Install necessary requirements
%pip install -r ../requirements.txt

# Change working directory to root
import os
if os.getcwd().endswith("notebooks"):
    %cd ..
    print(os.getcwd())

# Automatically reload changes in code
%load_ext autoreload
%autoreload 2

In [None]:
import logging
from typing import List

import pandas as pd

from association_finder.concept_drifts_finder import ConceptDriftsFinder
from association_finder.models import Transaction, ConceptDriftResult
# Attach the default stream handler to the root logger so that log records
# reaching it are written to the notebook output. No level is passed here, so
# the root logger keeps its default threshold.
logging.basicConfig()

# Read and parse file

In [None]:
from sklearn.model_selection import train_test_split
from typing import Dict, Tuple, Optional
from dataclasses import dataclass
from association_finder.preprocessing import preprocess_dataset, split_X_y
import numpy as np

# Seed NumPy's global generator; the split below is given no random_state of
# its own, so this seed is what makes it repeatable.
np.random.seed(0)

train_dataset_path = "datasets/netflix_data/netflix-rotten-tomatoes-metacritic-imdb.csv"

# Load the catalogue, using each title as the row index.
df = pd.read_csv(train_dataset_path, index_col="Title")

# Hold out 30% of the rows for validation.
df_train, df_val = train_test_split(df, test_size=0.3)

# Preprocess the training split; the returned parameters are reused later on
# the validation split.
df_train_prep, train_params = preprocess_dataset(df_train)

# Focusing on prominent non-textual columns.
# Left out: 'Summary', 'Production House', 'Actors', 'Tags', 'Boxoffice'.
good_columns = [
    "Genre",
    "Languages",
    "Series or Movie",
    "Hidden Gem Score",
    "Country Availability",
    "Runtime",
    "Director",
    "Writer",
    "View Rating",
    "IMDb Score",
    "Rotten Tomatoes Score",
    "Metacritic Score",
    "Awards Received",
    "Awards Nominated For",
    "Release Date",
    "Netflix Release Date",
    "IMDb Votes",
]

# TODO: What is the target column?? Is it Hidden Gem Score?
target_column = "Hidden Gem Score"

# Candidate columns for one-hot encoding ('Boxoffice' is left out).
one_hot_candidates = [
    "Genre",
    "Languages",
    "Series or Movie",
    "Country Availability",
    "Runtime",
    "Director",
    "Writer",
    "View Rating",
    "Release Date",
    "Netflix Release Date",
]

# Only keep the candidates that preprocessing did not drop.
one_hot_columns = [name for name in one_hot_candidates if name not in train_params.dropped_columns]


# Baseline

In [None]:
from sklearn.linear_model import LogisticRegression
from typing import Tuple

# Feature matrix and target for the training split.
X_train, y_train = split_X_y(df_train_prep, good_columns, train_params, one_hot_columns, target_column)

# Preprocess the validation split with the parameters obtained from training,
# then build its feature matrix and target the same way.
df_val_prep = preprocess_dataset(df_val, train_params)[0]
X_val, y_val = split_X_y(df_val_prep, good_columns, train_params, one_hot_columns, target_column)

# Baseline classifier trained on the plain features.
baseline_model = LogisticRegression(random_state=0, max_iter=10000)
clf = baseline_model.fit(X_train, y_train)


In [None]:
# Report the baseline model's score on each split.
train_accuracy = clf.score(X_train, y_train)
print(f"Train accuracy: {train_accuracy}")
validation_accuracy = clf.score(X_val, y_val)
print(f"Validation accuracy: {validation_accuracy}")

# Build model using rules

In [None]:
from association_finder.concept_engineering import ConceptEngineering

concept_engineering = ConceptEngineering()

# Select the feature columns that survived preprocessing, keeping the order in
# which they appear in `good_columns`. Building this list from a set difference
# produced an arbitrary column order that could change between kernel restarts
# (string hashing is randomised per process), which made runs hard to compare.
dropped_columns = set(train_params.dropped_columns)
columns_to_use = [column for column in good_columns if column not in dropped_columns]

# Fit the concept features on the training data only, then apply the fitted
# transformer to the validation features.
X_train_rules = concept_engineering.fit_transform(X_train, df_train_prep[columns_to_use], target_column, one_hot_columns)
X_val_rules = concept_engineering.transform(X_val)

# Same classifier settings as the baseline, so the two scores are comparable.
clf_rules = LogisticRegression(random_state=0, max_iter=10000).fit(X_train_rules, y_train)

print(f"Train accuracy: {clf_rules.score(X_train_rules, y_train)}")
print(f"Validation accuracy: {clf_rules.score(X_val_rules, y_val)}")

# Analyze concepts

In [None]:
# Remove the column cap and allow up to 100 rows before pandas truncates the
# rendered table, then display the concepts table as the cell output.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
concept_engineering.concepts_df