# Init


In [None]:
# Install the project's requirements. The path is relative to the current
# working directory (one level above it).
# NOTE(review): after the directory change below, re-running this cell resolves
# ../requirements.txt against the new working directory - confirm that is fine.
%pip install -r ../requirements.txt

# If the kernel was started inside a "notebooks" directory, move one level up so
# that relative paths used later in the notebook (e.g. "datasets/...") resolve
# from the parent directory. The new working directory is printed for reference.
import os
if os.getcwd().endswith("notebooks"):
    %cd ..
    print(os.getcwd())

# Enable IPython's autoreload extension (mode 2) so edits to imported code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging
from typing import List

import pandas as pd

from association_finder.concept_drifts_finder import ConceptDriftsFinder
from association_finder.models import Transaction, ConceptDriftResult
# Configure the root logger with the logging module's default settings.
logging.basicConfig()

# Read and parse file

In [None]:
from sklearn.model_selection import train_test_split
from typing import Dict, Tuple, Optional
from dataclasses import dataclass
import numpy as np

# Seed NumPy's global RNG for any later code that draws from it.
np.random.seed(0)

# Load the training CSV, indexed by product id.
train_dataset_path = "datasets/big_mart_sales/Train-Set.csv"
df = pd.read_csv(train_dataset_path, index_col='ProductID')

# Hold out 30% of the rows for validation. The split is pinned with an explicit
# random_state (as the models below do) so it does not depend on how much of the
# global RNG stream was consumed before this line.
df_train, df_val = train_test_split(df, test_size=0.3, random_state=0)

# `preprocess` is expected to already be in scope (it is not defined in this
# part of the notebook). It is used here as returning the prepared frame, the
# fitted parameters to reuse on other splits, and the columns it dropped.
df_train_prep, train_params, dropped_columns = preprocess(df_train)
print(dropped_columns)

# Focusing on prominent columns (minus whatever preprocessing dropped):
good_columns = [column for column in ['Weight', 'FatContent', 'ProductVisibility', 'ProductType', 'MRP',
                'OutletID', 'EstablishmentYear', 'OutletSize', 'LocationType', 'OutletType', 'OutletSales']
                if column not in dropped_columns]
# TODO: Check where Amit took the good columns list from

# Column the models are trained to predict.
target_column = "OutletSales"

# Columns that get one-hot encoded before modelling (minus dropped ones).
one_hot_columns = [column for column in ['FatContent','ProductType','OutletID','OutletSize','LocationType','OutletType']
                   if column not in dropped_columns]

def split_X_y(
    df_prep: pd.DataFrame,
    feature_columns: Optional[List[str]] = None,
    target: Optional[str] = None,
    categorical_columns: Optional[List[str]] = None,
) -> Tuple[pd.DataFrame, pd.Series]:
    """Split a preprocessed frame into one-hot encoded features and the target.

    Args:
        df_prep: Frame containing the selected columns and the target column.
        feature_columns: Columns to select; the target is removed from this
            list if present. Defaults to the notebook-level ``good_columns``.
        target: Name of the target column. Defaults to the notebook-level
            ``target_column``.
        categorical_columns: Feature columns to replace with one-hot dummy
            columns. Defaults to the notebook-level ``one_hot_columns``.

    Returns:
        ``(X, y)``: ``X`` holds the non-categorical feature columns in their
        listed order, followed by the dummy columns of each categorical column
        (named ``<column>_<value>``); ``y`` is the target column as a Series.

    Raises:
        KeyError: If a requested column is missing from ``df_prep``.
    """
    if feature_columns is None:
        feature_columns = good_columns
    if target is None:
        target = target_column
    if categorical_columns is None:
        categorical_columns = one_hot_columns

    # Preserve the listed column order (de-duplicated). Going through a set
    # would make the feature order depend on string hashing, which can change
    # from one interpreter run to the next.
    X_columns = list(
        dict.fromkeys(column for column in feature_columns if column != target)
    )

    # Encode every categorical column in one call; the encoded source columns
    # are dropped and their dummies are appended after the remaining columns.
    X = pd.get_dummies(df_prep[X_columns], columns=list(categorical_columns))

    y = df_prep[target]

    return X, y

# Baseline

In [None]:
from sklearn.linear_model import LogisticRegression
from typing import Tuple

# Build the train features/target from the already preprocessed training frame,
# and the validation ones by preprocessing the validation rows with the
# parameters returned by the training preprocessing call.
X_train, y_train = split_X_y(df_train_prep)
X_val, y_val = split_X_y(preprocess(df_val, train_params)[0])

# Each split is one-hot encoded on its own, so a category that appears in only
# one of them yields different dummy columns. Align the validation features to
# the training columns: dummies missing from validation are filled with 0 and
# columns the model never saw are dropped. This is a no-op when they match.
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

# Baseline classifier trained on the raw (one-hot encoded) features.
clf = LogisticRegression(random_state=0, max_iter=10000).fit(X_train, y_train)

In [None]:
# Report the baseline classifier's mean accuracy on each split.
train_accuracy = clf.score(X_train, y_train)
print(f"Train accuracy: {train_accuracy}")

val_accuracy = clf.score(X_val, y_val)
print(f"Validation accuracy: {val_accuracy}")

# Expected:
# Train accuracy: 0.500167616493463
# Validation accuracy: 0.49706687524442705

# Build model using rules

In [None]:
from association_finder.concept_engineering import ConceptEngineering

# Fit the concept-engineering step on the training data and produce the
# rule-augmented training features from it.
# NOTE(review): the meaning of each fit_transform argument comes from the
# project's ConceptEngineering class - confirm against its definition.
concept_engineering = ConceptEngineering()
X_train_rules = concept_engineering.fit_transform(
    X_train,
    df_train_prep[good_columns],
    target_column,
    one_hot_columns,
)

# Apply the already fitted step to the validation features.
X_val_rules = concept_engineering.transform(X_val)

# Same classifier settings as the baseline, trained on the rule features.
clf_rules = LogisticRegression(random_state=0, max_iter=10000)
clf_rules = clf_rules.fit(X_train_rules, y_train)

rules_train_accuracy = clf_rules.score(X_train_rules, y_train)
print(f"Train accuracy: {rules_train_accuracy}")

rules_val_accuracy = clf_rules.score(X_val_rules, y_val)
print(f"Validation accuracy: {rules_val_accuracy}")

# TODO: Accuracy is the same before and after adding the rules - investigate why.

# Analyze concepts

In [None]:
# Show every column and up to 100 rows when rendering frames.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Last expression of the cell: the notebook renders the concepts frame.
concept_engineering.concepts_df

# TODO: No rules found!