# Init


### Install necessary requirements

In [None]:
# Install the packages listed in ../requirements.txt into the kernel's environment.
# NOTE(review): the relative path is resolved against the current working directory.
# This cell runs before the `%cd ..` in the next code cell, so re-running it after
# that directory change would presumably look one level too high — confirm.
%pip install -r ../requirements.txt

### Change working directory and enable autoreload

In [None]:
# Change working directory to root
import os
if os.getcwd().endswith("notebooks"):
    %cd ..
    print(os.getcwd())

# Automatically reload changes in code
%load_ext autoreload
%autoreload 2

### Imports, logging and pandas configuration

In [None]:
import logging
from typing import List
import pandas as pd
from association_finder.concept_drifts_finder import ConceptDriftsFinder
from association_finder.models import Transaction, ConceptDriftResult
from association_finder.concept_engineering import ConceptEngineering
from sklearn.model_selection import train_test_split
from association_finder.preprocessing import preprocess_dataset, split_X_y
from association_finder.one_vs_rest_classifier import OneVsRestClassifier, label_to_concept_transform_wrapper
from typing import Dict, Tuple, Optional
from dataclasses import dataclass
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logging: configure the root logger at INFO level
logging.basicConfig(level=logging.INFO)

# Pandas display: no limit on displayed columns, at most 100 displayed rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Read, split and preprocess data

In [None]:
# Seed NumPy's global RNG. train_test_split below is given no explicit random_state,
# so it draws from this global state; seeding in the same cell keeps the split
# identical on every run of the cell.
np.random.seed(0)

# Read file
train_dataset_path = "datasets/houseprices/train.csv"
df = pd.read_csv(train_dataset_path, index_col='Id')
target_column = "SalePrice"

# Drop rows with NaN values in the target column.
# dropna(subset=...) removes exactly the rows whose target is missing. The previous
# drop-by-index-label form would also remove valid rows sharing an 'Id' label with a
# NaN row, and it mutated the frame in place.
df = df.dropna(subset=[target_column])

# Split: 70% train / 30% validation
df_train, df_val = train_test_split(df, test_size=0.3)

# Preprocess the training split; train_params is reused later for the validation split
df_train_prep, train_params = preprocess_dataset(df_train)

# Focusing on prominent columns, skipping any that preprocessing reported as dropped
candidate_columns = [
    'OverallQual', 'YearBuilt', 'YearRemodAdd', 'OverallCond', 'BldgType', 'LotArea',
    'GrLivArea', 'FullBath', 'BedroomAbvGr', 'LotFrontage', 'TotalBsmtSF', 'SalePrice',
]
candidate_one_hot_columns = ['BldgType']

good_columns = [column for column in candidate_columns if column not in train_params.dropped_columns]
one_hot_columns = [column for column in candidate_one_hot_columns if column not in train_params.dropped_columns]


In [None]:
# Build the X / y pair for each split
split_args = (good_columns, train_params, one_hot_columns, target_column)

X_train, y_train = split_X_y(df_train_prep, *split_args)

# The validation split is preprocessed with the parameters returned for the training split
df_val_prep = preprocess_dataset(df_val, train_params)[0]
X_val, y_val = split_X_y(df_val_prep, *split_args)

# Find rules and analyze them

In [None]:
# Fit the association-rule finder on the training split
concept_engineering = ConceptEngineering()
rule_source_df = df_train_prep[good_columns]
concept_engineering.fit(X_train, rule_source_df, target_column, one_hot_columns)

# Show the concepts table held by the fitted object
concept_engineering.concepts_df

# Build models

### Baseline model

In [None]:
# Simple one vs rest classifier for baseline.
# Constructed with no arguments, i.e. without the label -> transformation mapping
# that the rule-based model later in this notebook is given.
# NOTE: the name `one_vs_rest_classifier` is rebound by the rule-based model further
# down, so the baseline object is not available after that cell has run.
one_vs_rest_classifier = OneVsRestClassifier()

In [None]:
# Fit on the training split and collect predictions for both splits
y_train_pred = one_vs_rest_classifier.fit_transform(X_train, y_train)
y_val_pred = one_vs_rest_classifier.transform(X_val)

# Report accuracy per split
train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Train accuracy: {train_accuracy}")
print(f"Validation accuracy: {val_accuracy}")

### Model using rules

In [None]:
# Rule-aware one vs rest classifier: every label gets its own transformation,
# built from the fitted concept_engineering object for that label
class_labels = y_train.unique()
label_to_transformation = dict(
    (label, label_to_concept_transform_wrapper(concept_engineering, target_column, label))
    for label in class_labels
)
one_vs_rest_classifier = OneVsRestClassifier(label_to_transformation)

In [None]:
# Fit the rule-aware classifier and collect predictions for both splits
y_train_pred = one_vs_rest_classifier.fit_transform(X_train, y_train)
y_val_pred = one_vs_rest_classifier.transform(X_val)

# Report accuracy per split
rules_train_accuracy = accuracy_score(y_train, y_train_pred)
rules_val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Train accuracy: {rules_train_accuracy}")
print(f"Validation accuracy: {rules_val_accuracy}")

### Error analysis

In [None]:
# Error analysis for the rule-based model fitted in the preceding cell.
#
# This cell previously read `clf_rules` and `X_train_rules`, which no cell in this
# notebook assigns, so it failed with NameError on a fresh kernel. It now uses
# `y_train_pred` (the rule-based model's predictions on the training split) and joins
# the input features from `X_train`.
# NOTE(review): if the transformed rule features should appear in the table as well,
# they need to be exposed by the classifier — confirm what is wanted here.
pred_df = pd.DataFrame(
    {'predict': np.asarray(y_train_pred), 'actual': np.asarray(y_train)},
    index=X_train.index,
)
pred_df = pd.merge(pred_df, X_train, left_index=True, right_index=True)

# Keep only the misclassified rows
errors_df = pred_df[pred_df['predict'] != pred_df['actual']]

# A bare expression in the middle of a cell is never rendered, so display explicitly
display(errors_df.head(50))

# Shape of the misclassified rows whose true label is 5, then 1.
# (Counts noted in the old comments came from an earlier model object; re-check them.)
print(errors_df[errors_df['actual'] == 5].shape)
print(errors_df[errors_df['actual'] == 1].shape)

In [None]:
# Error analysis for the baseline model.
#
# This cell previously read `clf`, which no cell in this notebook assigns, so it
# failed with NameError on a fresh kernel. By this point `one_vs_rest_classifier` and
# `y_train_pred` belong to the rule-based model, so a baseline classifier is fitted
# here under its own names.
baseline_classifier = OneVsRestClassifier()
baseline_train_pred = baseline_classifier.fit_transform(X_train, y_train)

pred_df = pd.DataFrame(
    {'predict': np.asarray(baseline_train_pred), 'actual': np.asarray(y_train)},
    index=X_train.index,
)
pred_df = pd.merge(pred_df, X_train, left_index=True, right_index=True)

# Keep only the misclassified rows
errors_df = pred_df[pred_df['predict'] != pred_df['actual']]

# A bare expression in the middle of a cell is never rendered, so display explicitly
display(errors_df.head(50))

# Misclassified rows grouped by their true label (5, then 1)
print(errors_df[errors_df['actual'] == 5].shape)
print(errors_df[errors_df['actual'] == 1].shape)

# Misclassified rows grouped by the label the model predicted (5, then 1)
print(errors_df[errors_df['predict'] == 5].shape)
print(errors_df[errors_df['predict'] == 1].shape)


### Model coefficients analysis

In [None]:
# Pair each value in row 3 of clf.coef_ with a column name from X_train, sort the
# (value, name) tuples in ascending order, and number them for display.
# NOTE(review): `clf` is not assigned in any cell visible in this notebook, so this
# fails on Restart & Run All; presumably left over from an earlier model — confirm
# how coefficients should be read from the current classifier.
# NOTE(review): the row index 3 is hard-coded; which class it corresponds to cannot
# be determined from this notebook.
list(enumerate(sorted(list(zip(clf.coef_[3], X_train.columns)))))

In [None]:
# Pair each value in row 3 of clf_rules.coef_ with its feature name, sort the
# (value, name) tuples in ascending order, and number them for display.
#
# Names are taken from X_train_rules rather than X_train: assuming clf_rules was
# fitted on X_train_rules, zipping against X_train.columns attaches the wrong names
# and silently truncates the list when the two frames differ in width.
# NOTE(review): `clf_rules` and `X_train_rules` are not assigned in any cell visible
# in this notebook, so this cell still depends on state defined elsewhere — confirm.
# NOTE(review): the row index 3 is hard-coded; which class it corresponds to cannot
# be determined from this notebook.
list(enumerate(sorted(zip(clf_rules.coef_[3], X_train_rules.columns))))