# Init


In [1]:
# Install necessary requirements
%pip install -r ../requirements.txt

# Change working directory to root
import os
if os.getcwd().endswith("notebooks"):
    %cd ..
    print(os.getcwd())

# Automatically reload changes in code
%load_ext autoreload
%autoreload 2

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
/Users/dorin/Desktop/Tabular Data Science/Association-Rules-for-Concept-Drifting
/Users/dorin/Desktop/Tabular Data Science/Association-Rules-for-Concept-Drifting


In [2]:
import logging
from typing import List

import pandas as pd

from association_finder.concept_drifts_finder import ConceptDriftsFinder
from association_finder.models import Transaction, ConceptDriftResult
logging.basicConfig()

# Read and parse file

In [11]:
from sklearn.model_selection import train_test_split
from typing import Dict, Tuple, Optional
from dataclasses import dataclass
import numpy as np

np.random.seed(0)

train_dataset_path = "datasets/netflix_data/netflix-rotten-tomatoes-metacritic-imdb.csv"
# df = pd.read_csv(train_dataset_path)
df = pd.read_csv(train_dataset_path, index_col='Title')

# df.drop('Boxoffice', inplace=True, axis=1)

df_train, df_val = train_test_split(df, test_size=0.3)

@dataclass
class TrainParams:
    
    na_columns_mean: Dict[int, float]
    numerical_columns_cut: Dict[str, List[float]]
    # dropped_columns: List[str] = None
    
        
def preprocess(df, train_params: Optional[TrainParams] = None):
    
    # Defining numeric and categorical columns
    numeric_columns = df.dtypes[(df.dtypes == "float64") | (df.dtypes == "int64")].index.tolist()
    very_numerical = [nc for nc in numeric_columns if df[nc].nunique() > 20]
    categorical_columns = [c for c in df.columns if c not in numeric_columns]
    ordinals = list(set(numeric_columns) - set(very_numerical))

    # Filling Null Values with the column's mean
    na_columns = df[very_numerical].isna().sum()
    na_columns = na_columns[na_columns > 0]

    na_columns_mean = {}
    
    # flag = True if train_params is None else False
    
    for nc in na_columns.index:
        if train_params is None:
            column_mean = df[nc].mean()

            # Save mean
            na_columns_mean[nc] = column_mean
        else:
            column_mean = train_params.na_columns_mean[nc]

        df[nc].fillna(column_mean, inplace=True)


    # Dropping and filling NA values for categorical columns:
    # drop if at least 70% are NA:
    # nul_cols = df[categorical_columns].isna().sum() / len(df)
    # drop_us = nul_cols[nul_cols > 0.7]
    # df = df.drop(drop_us.index, axis=1)
    
#     if flag:
#         dropped_columns= [d for d in drop_us.keys()]
#     else:
#         if train_params.dropped_columns is not None:
#             print(df)
#             print(train_params.dropped_columns)
#             print(df.columns.get_loc(train_params.dropped_columns[0]))
#             df = df.drop(df.columns.get_loc(train_params.dropped_columns[0]), axis=1)
#             # df = df.drop(df[train_params.dropped_columns[0]].index, axis=1)
    
    # Fill with a new 'na' category:
    categorical_columns = list(set(categorical_columns))# - set(drop_us.index))
    df[categorical_columns] = df[categorical_columns].fillna('na')

    # Fill Null values in ordinals with a new '-1' ordinal:
    df[ordinals] = df[ordinals].fillna(-1)
    
    df = df.copy()

    # Bin numerical data
    numerical_columns_cut = {}
    for c in very_numerical:
        if train_params is None:
            try:
                # df[c] = pd.qcut(df[c], 5, labels=["very low", "low", "medium", "high", "very high"])
                df[c], bins = pd.qcut(df[c], 5, labels=[1, 2, 3, 4, 5], retbins=True)
            except:
                # sometimes for highly skewed data, we cannot perform qcut as most quantiles are equal
                # df[c] = pd.cut(df[c], 5, labels=["very low", "low", "medium", "high", "very high"])
                df[c], bins = pd.cut(df[c], 5, labels=[1, 2, 3, 4, 5], retbins=True)

            # Make bin edges larger (infinity and -infinity)
            bins = np.concatenate(([-np.inf], bins[1:-1], [np.inf]))

            # Save bin
            numerical_columns_cut[c] = bins
            
        else:
            # Use existing train bins
            bins = train_params.numerical_columns_cut[c]
            df[c] = pd.cut(df[c], labels=[1, 2, 3, 4, 5], bins=bins)
        
    return df, TrainParams(na_columns_mean, numerical_columns_cut)#,[d for d in drop_us.keys()]

df_train_prep, train_params = preprocess(df_train)
# df_train_prep, train_params, dropped_columns = preprocess(df_train)
# print (dropped_columns)

# Focusing on prominent columns:
good_columns = ['Genre', 'Languages', 'Series or Movie', 'Hidden Gem Score','Country Availability','Runtime',
                'Director', 'Writer', 'View Rating', 'IMDb Score', 'Rotten Tomatoes Score','Metacritic Score',
               'Awards Received','Awards Nominated For','Release Date','Netflix Release Date','IMDb Votes']
                #if column not in dropped_columns]
                #'Summary','Production House','Actors','Tags','Boxoffice',
# TODO: Check where Amit took the good columns list from

target_column = 'Hidden Gem Score' # TODO: What is the target column?? Is it Hidden Gem Score?

one_hot_columns =  ['Genre','Languages','Series or Movie','Country Availability','Runtime','Director','Writer','View Rating',
                  'Release Date','Netflix Release Date']#'Boxoffice',

def split_X_y(df_prep: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    X_columns = list(set(good_columns) - {target_column})#-set(dropped_columns)
    X = df_prep[X_columns]

    for one_hot_column in one_hot_columns:
        X = pd.concat([X, pd.get_dummies(X[one_hot_column], prefix=one_hot_column)], axis=1)
        X = X.drop(columns=[one_hot_column])

    y = df_prep[target_column]
    
    return X, y

(15480, 28)
(15480, 27)


# Baseline

In [8]:
from sklearn.linear_model import LogisticRegression
from typing import Tuple

X_train, y_train = split_X_y(df_train_prep)
print(X_train.shape)
X_val, y_val = split_X_y(preprocess(df_val, train_params)[0])
print(X_val.shape)

clf = LogisticRegression(random_state=0).fit(X_train, y_train)

# TODO: Issue with different dropped columns in the train and in the validation.

(10836, 26240)
(4644, 13588)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
print(f"Train accuracy: {clf.score(X_train, y_train)}")
print(f"Validation accuracy: {clf.score(X_val, y_val)}")

Train accuracy: 0.9480435585086748


Feature names unseen at fit time:
- Country Availability_Argentina,Australia,Japan,United States,Brazil,Mexico,Colombia
- Country Availability_Argentina,Australia,Switzerland,Sweden,Belgium,France,Spain,Germany,Portugal,India,Czech Republic,Slovakia,Lithuania,Russia,Greece,Romania,South Africa,Singapore,South Korea,United States,Mexico,Canada,United Kingdom,Poland,Hong Kong,Japan,Iceland,Thailand,Hungary,Turkey,Malaysia,Brazil,Netherlands,Italy,Israel,Colombia
- Country Availability_Argentina,Brazil,United Kingdom,Australia,Switzerland,Sweden,Netherlands,Belgium,France,India,Iceland,Germany,Portugal,Lithuania,Russia,South Africa,Slovakia,Czech Republic,Hungary,Greece,Romania,Poland,Singapore,United States,Canada,Mexico,South Korea,Spain,Hong Kong,Japan,Israel,Italy,Thailand,Turkey,Malaysia,Colombia
- Country Availability_Argentina,Brazil,United Kingdom,Australia,Switzerland,Sweden,Netherlands,Belgium,France,Spain,Germany,India,Iceland,Portugal,Lithuania,Russia,Slovakia,Czech Republic,H

ValueError: X has 13588 features, but LogisticRegression is expecting 26240 features as input.

# Build model using rules

In [None]:
from association_finder.concept_engineering import ConceptEngineering

concept_engineering = ConceptEngineering()
X_train_rules = concept_engineering.fit_transform(X_train, df_train_prep[good_columns], target_column, one_hot_columns)
X_val_rules = concept_engineering.transform(X_val)

clf_rules = LogisticRegression(random_state=0).fit(X_train_rules, y_train)

print(f"Train accuracy: {clf_rules.score(X_train_rules, y_train)}")
print(f"Validation accuracy: {clf_rules.score(X_val_rules, y_val)}")

# Analyze concepts

In [None]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)
concept_engineering.concepts_df