In [22]:
import pandas as pd

In [23]:
pd.set_option('display.max_columns', None)

In [24]:
train = pd.read_csv('train_3.csv')
test = pd.read_csv('test_3.csv')

In [25]:
# target = train['DiagPeriodL90D']
# train.drop(columns=target.name, axis=1, inplace=True) 

In [26]:
train.drop([
    'patient_gender',
    'patient_state',
    'Division'
],
axis=1,
inplace=True
)

test.drop([
    'patient_gender',
    'patient_state',
    'Division'
],
axis=1,
inplace=True
)

In [27]:
def group_by_state(df):
    # Get unique states
    unique_states = df['Region'].unique()

    # Create a dictionary to store DataFrames for each state
    state_dfs = {}
    target_dfs = {}

    # Iterate over unique states
    for state in unique_states:
        # Filter DataFrame for the current state
        state_df = df[df['Region'] == state].copy()
        state_df.drop(['Region', 'patient_id', 'DiagPeriodL90D'], axis=1, inplace=True)

        target_df = df[df['Region'] == state]['DiagPeriodL90D'].copy()
        
        # Store the filtered DataFrame in the dictionary with state as the key
        state_dfs[state] = state_df
        target_dfs[state] = target_df

    return state_dfs, target_dfs

state_dfs, target_dfs = group_by_state(train)

In [28]:
from lasso import apply_lasso

In [29]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
import numpy as np

def apply_lasso(df_slice, target, alpha=0.1,):
    # Splitting data into features (X) and target variable (y)
    y = target
    X = df_slice

    # Filter features based on correlation
    corr_matrix = X.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.8)]
    X_filtered = X.drop(columns=to_drop)

    # Standardize the Features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fit LASSO regression
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_scaled, y)


    # Extract important features
    lasso_coefficients = lasso_model.coef_
    important_features = X.columns[np.abs(lasso_coefficients) > 0]

    return list(important_features)

In [30]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

def apply_catboost(df_slice, target, param_grid, cv=5):
    # Splitting data into features (X) and target variable (y)
    y = target
    X = df_slice

    # Filter features based on correlation if needed
    # (Omitted here assuming it's already done if necessary)

    # Standardize the Features if needed
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize CatBoost classifier
    catboost_model = CatBoostClassifier()

    # Perform GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(catboost_model, param_grid=param_grid, cv=cv, scoring='accuracy')
    grid_search.fit(X_scaled, y)

    # Get the best model from the grid search
    best_catboost_model = grid_search.best_estimator_

    return best_catboost_model, grid_search.best_params_, grid_search.best_score_

# Define the parameter grid for CatBoost
param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.05, 0.1, ],
    'depth': [4, 6]
}

unique_states = train.Region.unique()

# Example usage for each slice
for state in unique_states:  # Assuming unique_states are the slices
    important_features = apply_lasso(state_dfs[state], target_dfs[state], alpha=0.1)
    best_catboost_model, best_params, best_score = apply_catboost(state_dfs[state][important_features], target_dfs[state], param_grid, cv=2)
    print("Best Parameters for", state, ":", best_params)
    print("Best Accuracy for", state, ":", best_score)


0:	learn: 0.6771140	total: 449us	remaining: 44.5ms
1:	learn: 0.6625728	total: 1.1ms	remaining: 54.1ms
2:	learn: 0.6493811	total: 1.51ms	remaining: 49ms
3:	learn: 0.6373902	total: 1.89ms	remaining: 45.5ms
4:	learn: 0.6282570	total: 2.25ms	remaining: 42.7ms
5:	learn: 0.6181855	total: 2.66ms	remaining: 41.7ms
6:	learn: 0.6090213	total: 3.04ms	remaining: 40.4ms
7:	learn: 0.6021647	total: 3.35ms	remaining: 38.5ms
8:	learn: 0.5944218	total: 3.79ms	remaining: 38.3ms
9:	learn: 0.5873573	total: 4.17ms	remaining: 37.5ms
10:	learn: 0.5808984	total: 4.65ms	remaining: 37.6ms
11:	learn: 0.5749969	total: 5.07ms	remaining: 37.2ms
12:	learn: 0.5695924	total: 5.56ms	remaining: 37.2ms
13:	learn: 0.5646459	total: 5.94ms	remaining: 36.5ms
14:	learn: 0.5610448	total: 6.23ms	remaining: 35.3ms
15:	learn: 0.5568169	total: 6.59ms	remaining: 34.6ms
16:	learn: 0.5529356	total: 6.97ms	remaining: 34ms
17:	learn: 0.5493718	total: 7.35ms	remaining: 33.5ms
18:	learn: 0.5460953	total: 7.72ms	remaining: 32.9ms
19:	learn

In [65]:
# Perform Lasso regression for each state
for state in unique_states:
    catboost_model = apply_catboost(state_dfs[state][important_features[state]], target_dfs[state], iterations=100, learning_rate=0.1, depth=6)

0:	learn: 0.6548217	total: 157ms	remaining: 15.5s
1:	learn: 0.6191613	total: 159ms	remaining: 7.81s
2:	learn: 0.5936366	total: 161ms	remaining: 5.22s
3:	learn: 0.5704062	total: 163ms	remaining: 3.92s
4:	learn: 0.5519129	total: 165ms	remaining: 3.14s
5:	learn: 0.5391203	total: 167ms	remaining: 2.62s
6:	learn: 0.5285723	total: 170ms	remaining: 2.25s
7:	learn: 0.5176552	total: 172ms	remaining: 1.97s
8:	learn: 0.5099712	total: 173ms	remaining: 1.75s
9:	learn: 0.5014953	total: 175ms	remaining: 1.58s
10:	learn: 0.4944204	total: 177ms	remaining: 1.43s
11:	learn: 0.4891905	total: 179ms	remaining: 1.31s
12:	learn: 0.4853827	total: 181ms	remaining: 1.21s
13:	learn: 0.4824635	total: 183ms	remaining: 1.13s
14:	learn: 0.4794007	total: 185ms	remaining: 1.05s
15:	learn: 0.4753775	total: 187ms	remaining: 984ms
16:	learn: 0.4731457	total: 189ms	remaining: 925ms
17:	learn: 0.4696394	total: 191ms	remaining: 872ms
18:	learn: 0.4667560	total: 193ms	remaining: 824ms
19:	learn: 0.4652018	total: 195ms	remaini