In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingRegressor, HistGradientBoostingRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, confusion_matrix, classification_report, cohen_kappa_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import minmax_scale
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator,TransformerMixin
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, plot_importance
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the competition CSVs on the Kaggle input mount.
child_mind_train_path = '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv'
child_mind_test_path = '/kaggle/input/child-mind-institute-problematic-internet-use/test.csv'

# Read both splits into DataFrames.
train = pd.read_csv(child_mind_train_path)
test = pd.read_csv(child_mind_test_path)

# large_na_features = train.count().sort_values(ascending=True)[0:11].keys()
# train.count().sort_values(ascending=True)[0:11]
# train.sii.isna().sum()

### Separate out the intermediate features that are not in the test set, so that they can be used for fitting and prediction later

In [2]:
# The 20 individual PCIAT questionnaire items: PCIAT-PCIAT_01 ... PCIAT-PCIAT_20.
intermediates = pd.Index([f'PCIAT-PCIAT_{item:02d}' for item in range(1, 21)])

# Every column treated as train-only: the PCIAT season, the 20 items and their total.
features_not_in_test = pd.Index(['PCIAT-Season', *intermediates, 'PCIAT-PCIAT_Total'])

In [3]:
# pd.concat([train[features_not_in_test], train['sii']], axis=1).dropna(subset='sii').drop('PCIAT-Season', axis=1).corr()
# # train[*features_not_in_test, 'sii']
# # train['sii'].dropna().count()
# # train['PCIAT-PCIAT_Total'].dropna().count()
# plt.figure(figsize=(14,5))
# sns.heatmap(pd.concat([train[features_not_in_test], train['sii']], axis=1).dropna(subset='sii').drop('PCIAT-Season', axis=1).corr(), annot=True)

In [4]:
def make_mi_scores(X, y, discrete_features):
    """Rank the columns of X by their mutual information with y.

    ``discrete_features`` is forwarded unchanged to ``mutual_info_regression``.
    Returns a Series named "MI Scores", indexed by X's column names and sorted
    from the highest score to the lowest.

    NOTE: a later cell defines a two-argument ``make_mi_scores``; once that
    cell has run, it replaces this version.
    """
    raw_scores = mutual_info_regression(X=X, y=y, discrete_features=discrete_features)
    ranked = pd.Series(raw_scores, index=X.columns, name="MI Scores")
    return ranked.sort_values(ascending=False)
    
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of MI scores on the current pyplot axes.

    The scores are sorted ascending before plotting, and each bar is labelled
    with the corresponding index entry of ``scores``.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

def score_dataset(X, y, model=XGBRegressor()):
    """Return the 5-fold cross-validated RMSE of ``model`` on (X, y).

    Parameters:
    X     -- feature DataFrame; "category"/"object" columns are label-encoded
             on an internal copy before scoring
    y     -- regression target
    model -- estimator to evaluate (defaults to an XGBRegressor)

    Returns the square root of the mean MSE across the 5 folds. Any fold that
    fails raises instead of being scored as NaN (``error_score='raise'``).

    NOTE: a later cell redefines ``score_dataset`` with an RMSLE metric; once
    that cell has run, it replaces this version.
    """
    # Work on a copy: factorizing in place would silently overwrite the
    # caller's categorical columns with integer codes.
    X = X.copy()
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    neg_mse = cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_error", error_score='raise')
    # The scorer returns negated MSE; flip the sign before taking the root.
    return np.sqrt(-1 * neg_mse.mean())

def normalize(df):
    """Return a min-max scaled copy of ``df``; the input frame is not modified.

    Each column is rescaled independently to (value - min) / (max - min).
    """
    scaled = df.copy()
    for column_name in df.columns:
        column = df[column_name]
        lowest = column.min()
        highest = column.max()
        scaled[column_name] = (column - lowest) / (highest - lowest)
    return scaled

def feature_engineering(df):
    """Add 16 derived product/ratio columns to ``df`` and return it.

    The frame is modified in place; the returned object is ``df`` itself.
    Every new column combines two existing columns with either ``*`` or ``/``.
    """
    # (new column, left operand, operator, right operand), in insertion order.
    recipes = [
        ('BMI_Age', 'Physical-BMI', '*', 'Basic_Demos-Age'),
        ('Internet_Hours_Age', 'PreInt_EduHx-computerinternet_hoursday', '*', 'Basic_Demos-Age'),
        ('BMI_Internet_Hours', 'Physical-BMI', '*', 'PreInt_EduHx-computerinternet_hoursday'),
        ('BFP_BMI', 'BIA-BIA_Fat', '/', 'BIA-BIA_BMI'),
        ('FFMI_BFP', 'BIA-BIA_FFMI', '/', 'BIA-BIA_Fat'),
        ('FMI_BFP', 'BIA-BIA_FMI', '/', 'BIA-BIA_Fat'),
        ('LST_TBW', 'BIA-BIA_LST', '/', 'BIA-BIA_TBW'),
        ('BFP_BMR', 'BIA-BIA_Fat', '*', 'BIA-BIA_BMR'),
        ('BFP_DEE', 'BIA-BIA_Fat', '*', 'BIA-BIA_DEE'),
        ('BMR_Weight', 'BIA-BIA_BMR', '/', 'Physical-Weight'),
        ('DEE_Weight', 'BIA-BIA_DEE', '/', 'Physical-Weight'),
        ('SMM_Height', 'BIA-BIA_SMM', '/', 'Physical-Height'),
        ('Muscle_to_Fat', 'BIA-BIA_SMM', '/', 'BIA-BIA_FMI'),
        ('Hydration_Status', 'BIA-BIA_TBW', '/', 'Physical-Weight'),
        ('ICW_TBW', 'BIA-BIA_ICW', '/', 'BIA-BIA_TBW'),
        ('BMI_PHR', 'Physical-BMI', '*', 'Physical-HeartRate'),
    ]
    for new_column, left, operator_symbol, right in recipes:
        if operator_symbol == '*':
            df[new_column] = df[left] * df[right]
        else:
            df[new_column] = df[left] / df[right]
    return df

def drop_uninformative(df, mi_scores):
    """Keep only the columns of ``df`` whose MI score is strictly positive.

    ``mi_scores`` is a Series of scores labelled by column name.
    """
    informative = mi_scores > 0.0
    return df.loc[:, informative]

def plot_variance(pca, width=8, dpi=100):
    """Plot per-component and cumulative explained variance of a fitted PCA.

    Parameters:
    pca   -- fitted PCA object exposing ``n_components_`` and
             ``explained_variance_ratio_``
    width -- figure width passed to ``fig.set(figwidth=...)``
    dpi   -- figure resolution passed to ``fig.set(dpi=...)``

    Returns the array of two Axes: [0] bar chart of explained variance per
    component, [1] line chart of cumulative explained variance.
    """
    # Create figure
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    # Explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )
    # Cumulative Variance (prepend 0 so the curve starts at the origin)
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )
    # Set up figure. Use the caller's arguments: the values 8 and 100 used to
    # be hard-coded here, so `width` and `dpi` had no effect.
    fig.set(figwidth=width, dpi=dpi)
    return axs

def get_sii(total):
    """Map PCIAT totals to sii severity classes.

    Each value is rounded to an integer with ``round`` and then bucketed:
    at most 30 -> 0, 31-49 -> 1, 50-79 -> 2, 80 and above -> 3.
    Returns a list with one class per input value.
    """
    levels = []
    for raw_total in total:
        score = round(raw_total)
        if score <= 30:
            levels.append(0)
        elif score <= 49:
            levels.append(1)
        elif score <= 79:
            levels.append(2)
        elif score > 79:
            levels.append(3)
    return levels

### Separate the data into the features X and the target y, and collect the categorical and numerical column names

In [5]:
# Drop the identifier and PCIAT season, then keep only rows where every
# PCIAT item is present.
X = train.drop(columns=['id', 'PCIAT-Season']).dropna(subset=intermediates)
# Target: the PCIAT total score.
y = X['PCIAT-PCIAT_Total']
# Remove the PCIAT items, the total and sii so they cannot be used as inputs.
X = X.drop(columns=[*intermediates, 'PCIAT-PCIAT_Total', 'sii'])

# Categorical columns: every object-dtype column (no cardinality filter is applied)
categorical_cols = [name for name, dtype in X.dtypes.items() if dtype == "object"]

# Numerical columns: int64 or float64 dtype
numerical_cols = [name for name, dtype in X.dtypes.items() if dtype in ['int64', 'float64']]

cols = categorical_cols + numerical_cols

### Base feature set: after dropping `features_not_in_test`, the training data has 60 columns. We also drop `id` and `sii` when fitting the model, which leaves 58 features as the base feature set.

In [6]:
# Shape of train once every column listed in features_not_in_test is removed ...
print(train.drop(columns=features_not_in_test).shape)
# ... and the shape when 'id' and 'sii' are removed as well.
print(train.drop(columns=[*features_not_in_test, 'id', 'sii']).shape)

(3960, 60)
(3960, 58)


### This is getting the test_set ready to be used for making predictions

In [7]:
# Test rows without the identifier column; `test` itself keeps 'id' for the submission file.
test_set = test.drop(columns=['id'])

### This is the pipeline for doing preprocessing on the data and fitting and transforming the data

In [8]:
# Preprocessing for numerical data: fill missing values with the column mean
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# Preprocessing for categorical data: fill missing values with the most
# frequent category, then one-hot encode. handle_unknown='ignore' makes a
# category that was not seen during fit encode as all zeros at predict time
# instead of raising an error.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Define model
model = XGBRegressor(n_estimators=100, random_state=0)

# Bundle preprocessing and modeling code in a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

### Loop over the intermediate features. For each one, fit the model on X (all columns except id, PCIAT-Season, sii, PCIAT-PCIAT_Total and the intermediates) with y set to the current intermediate feature, then use the fitted pipeline to predict that feature for the test set and store it in the `total` dataframe, which collects all the test-set predictions
### For example, here, the first i in the loop would be 'PCIAT-PCIAT_01', the model would be fitted to predict 'PCIAT-PCIAT_01' with the 58 features of the data, then we would use the model to predict 'PCIAT-PCIAT_01' for the test_set and save the predictions into the total dataframe. Then we continue this for the rest of the intermediate features

In [9]:
# The design matrix is identical for every PCIAT item, so build it once
# instead of on every iteration: keep rows where all items are present, then
# drop the items, the total and sii from the inputs.
complete_rows = train.drop(columns=['id', 'PCIAT-Season']).dropna(subset=intermediates)
item_features = complete_rows.drop(columns=[*intermediates, 'PCIAT-PCIAT_Total', 'sii'])

# Fit one model per PCIAT item and predict that item for the test set.
# Dedicated names are used so the notebook-level X and y are not overwritten
# (the loop used to leave y pointing at the last item, PCIAT-PCIAT_20).
item_predictions = {}
for i in intermediates:
    pipeline.fit(item_features, complete_rows[i])
    item_predictions[i] = pipeline.predict(test_set)

# One column per item, one row per test-set row.
total = pd.DataFrame(item_predictions)

total

Unnamed: 0,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20
0,4.245764,3.74269,3.686419,-0.0152,3.81383,0.036365,0.165225,3.738711,0.194318,0.234359,3.787778,0.054549,3.77493,3.462678,3.532596,3.694241,3.7876,3.504677,1.736118,3.597461
1,0.033577,0.163467,0.139519,0.025604,0.045861,0.033358,-0.102155,-0.011276,0.099402,0.153275,-0.051612,0.003638,0.138485,0.016176,0.251423,0.152441,0.009978,0.095146,0.013429,0.007339
2,5.006371,1.985799,2.110252,1.050378,2.101458,0.953854,0.884181,2.110112,0.91543,0.995694,1.055312,0.05355,1.038247,0.967226,1.452485,0.209468,2.119663,2.065896,0.991374,1.056379
3,3.589075,1.844492,3.009153,0.371589,4.06524,0.862527,0.136333,2.821492,1.619271,1.693961,2.850233,0.064979,2.436216,0.188625,0.651104,2.65543,3.624011,2.620429,3.497388,1.017665
4,1.629138,2.147586,2.944834,1.319912,1.771556,0.814691,0.760154,2.02734,1.244899,0.567111,4.056649,0.641177,0.540485,2.539373,2.708498,-0.129092,2.772465,0.683465,2.298763,0.364319
5,2.70891,3.072343,2.878683,-0.011127,1.978982,0.956772,-0.031428,1.94971,1.875327,1.040185,0.110264,0.969224,2.817169,2.865147,1.917022,0.98761,2.914338,1.064545,1.95826,0.934576
6,0.998212,3.89764,0.913962,0.009993,1.746632,0.920576,0.004044,1.00366,-0.037926,0.177468,0.261453,0.046954,0.173524,0.0747,-0.061157,3.958378,0.984719,3.614203,0.977507,0.005757
7,2.640818,1.632511,2.670443,0.66674,2.548574,1.040436,-0.166978,-9.5e-05,1.273198,0.283108,2.684792,0.100133,0.196995,1.053704,1.410083,1.171989,1.524239,1.034634,0.593925,0.821544
8,2.944787,3.257908,2.920293,1.347505,3.780228,2.254618,1.549703,2.162191,2.708497,1.976204,3.131001,0.382988,1.665565,2.43359,2.420449,2.189654,2.607102,1.634581,2.456585,1.686893
9,1.128428,2.793718,2.026349,0.756189,1.092664,0.892843,2.422997,1.080304,2.388988,1.217036,3.565168,0.10323,0.6491,1.638415,1.765369,0.580589,2.064221,0.451583,0.999119,0.44294


In [10]:
# Row-wise sum of the predicted item scores: one predicted PCIAT total per test row.
total.sum(axis=1)

0     50.775116
1      1.217076
2     29.123129
3     39.619217
4     31.703323
5     32.956509
6     19.670296
7     23.180792
8     45.510345
9     28.059254
10    33.579575
11    18.763161
12    31.482126
13    36.013393
14    42.787880
15    53.537098
16     3.967383
17    23.410807
18    28.049698
19    39.298557
dtype: float32

### The `total` dataframe is used to calculate the row-wise sum across all PCIAT items, in other words the PCIAT-PCIAT_Total. The get_sii function then takes those totals and converts them into a list of the corresponding sii values, as defined in data-dictionary.csv

In [11]:
def get_sii(total):
    """Map PCIAT totals to sii severity classes.

    Each value is rounded to an integer with ``round`` and then bucketed:
    at most 30 -> 0, 31-49 -> 1, 50-79 -> 2, 80 and above -> 3.
    Returns a list with one class per input value.
    """
    levels = []
    for raw_total in total:
        score = round(raw_total)
        if score <= 30:
            levels.append(0)
        elif score <= 49:
            levels.append(1)
        elif score <= 79:
            levels.append(2)
        elif score > 79:
            levels.append(3)
    return levels

### This is converting the total dataframe into the prediction of the test set

In [12]:
# Sum the predicted item scores per test row and bucket each total into an sii class.
test_set_predictions = get_sii(total.sum(axis=1))
test_set_predictions

[2, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 2, 0, 0, 0, 1]

### This final step is making the output in the correct formatting and saving it to submission.csv

In [13]:
# Save test predictions to file
# One row per test id paired with its predicted sii class; the DataFrame
# index is not written to the CSV.
output = pd.DataFrame({'id': test.id.values,
                       'sii': test_set_predictions})
output.to_csv('submission.csv', index=False)

In [14]:
# num_train_X = train_X[numerical_cols]
# num_train_X_scaled = (num_train_X - num_train_X.mean(axis=0)) / num_train_X.std(axis=0)
# num_train_X_scaled
# pca = PCA()
# X_pca = pca.fit_transform(num_train_X_scaled)

# # Convert to dataframe
# component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
# X_pca = pd.DataFrame(X_pca, columns=component_names)

# X_pca.head()

In [15]:
# X.hist(bins=10, figsize=(20, 16))
# plt.tight_layout()
# plt.show()
# X.isnull().sum().plot(kind='bar',figsize=(20,8))
# plt.xlabel('variables')
# plt.ylabel('null count')
# plt.title('null per variables')
# plt.show()

In [16]:
# X[numerical_cols].corr()
# # pd.concat([X[numerical_cols], y], axis=1).corr()['sii'].sort_values(ascending=False)[1:].plot(kind='bar',figsize=(20,8))
# # plt.xlabel('features')
# # plt.ylabel('sii')
# # plt.title('features to sii correlation')
# # plt.show()
# high_corr_features = pd.concat([X[numerical_cols], y], axis=1).corr()['sii'].sort_values(ascending=False).abs()[1:11].index

In [17]:
# sns.heatmap(X[numerical_cols].corr())

In [18]:
def apply_pca(X, standardize=True):
    """Fit a PCA on ``X`` and return ``(pca, X_pca, loadings)``.

    Parameters:
    X           -- numeric feature DataFrame
    standardize -- when true, each column is centred on its mean and divided
                   by its standard deviation before fitting

    Returns:
    pca      -- the fitted PCA object
    X_pca    -- DataFrame of component scores with columns PC1, PC2, ...;
                it gets a fresh default index rather than the index of X
    loadings -- DataFrame with one row per original feature and one column
                per principal component
    """
    features = (X - X.mean(axis=0)) / X.std(axis=0) if standardize else X
    pca = PCA()
    scores = pca.fit_transform(features)
    names = [f"PC{k}" for k in range(1, scores.shape[1] + 1)]
    X_pca = pd.DataFrame(scores, columns=names)
    # components_ holds one row per component; transpose so rows are features.
    loadings = pd.DataFrame(pca.components_.T, columns=names, index=features.columns)
    return pca, X_pca, loadings


def plot_variance(pca, width=8, dpi=100):
    """Plot per-component and cumulative explained variance of a fitted PCA.

    Parameters:
    pca   -- fitted PCA object exposing ``n_components_`` and
             ``explained_variance_ratio_``
    width -- figure width passed to ``fig.set(figwidth=...)``
    dpi   -- figure resolution passed to ``fig.set(dpi=...)``

    Returns the array of two Axes: [0] bar chart of explained variance per
    component, [1] line chart of cumulative explained variance.
    """
    # Create figure
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    # Explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )
    # Cumulative Variance (prepend 0 so the curve starts at the origin)
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )
    # Set up figure. Use the caller's arguments: the values 8 and 100 used to
    # be hard-coded here, so `width` and `dpi` had no effect.
    fig.set(figwidth=width, dpi=dpi)
    return axs


def make_mi_scores(X, y):
    """Rank the columns of X by their mutual information with y.

    Object/category columns are label-encoded on a copy of X, and every
    integer-typed column is then flagged as discrete for
    ``mutual_info_regression`` (run with ``random_state=0``).
    Returns a Series named "MI Scores", indexed by column name and sorted
    from the highest score to the lowest.
    """
    encoded = X.copy()
    for col in encoded.select_dtypes(["object", "category"]):
        encoded[col], _ = encoded[col].factorize()
    # After factorizing, all discrete features have integer dtypes.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes]
    scores = mutual_info_regression(encoded, y, discrete_features=is_discrete, random_state=0)
    ranked = pd.Series(scores, name="MI Scores", index=encoded.columns)
    return ranked.sort_values(ascending=False)


def score_dataset(X, y, model=XGBRegressor()):
    """Return the 5-fold cross-validated RMSLE of ``model`` on (X, y).

    Parameters:
    X     -- feature DataFrame; "category"/"object" columns are label-encoded
             on an internal copy before scoring
    y     -- regression target
    model -- estimator to evaluate (defaults to an XGBRegressor)

    Returns the square root of the mean squared log error averaged over the
    5 folds.
    """
    # Work on a copy: factorizing in place would silently overwrite the
    # caller's categorical columns with integer codes.
    X = X.copy()
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric is RMSLE (Root Mean Squared Log Error)
    neg_msle = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    # The scorer returns negated MSLE; flip the sign before taking the root.
    return np.sqrt(-1 * neg_msle.mean())

In [19]:
# my_imputer = SimpleImputer(strategy='mean', fill_value=0)
# num_X = train_X[numerical_cols]
# im_X = pd.DataFrame(my_imputer.fit_transform(num_X))
# im_X.columns = num_X.columns
# im_X.index = num_X.index
# # num_test = test_set[numerical_cols]
# # im_test = pd.DataFrame(my_imputer.fit_transform(num_test))
# # im_test.columns = num_test.columns
# # im_X.round()
# # mi_scores = make_mi_scores(im_X, y, discrete_features='auto')
# # mi_scores[::]  # show a few features with their MI scores
# # plt.figure(dpi=100, figsize=(8, 15))
# # plot_mi_scores(mi_scores)
# # sns.relplot(y='Physical-Height', x='sii', data=train)

# im_X_scaled = (im_X - im_X.mean(axis=0)) / im_X.std(axis=0)
# pca = PCA()
# X_pca = pca.fit_transform(im_X_scaled)

# # Convert to dataframe
# component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
# X_pca = pd.DataFrame(X_pca, columns=component_names)

# X_pca.head()

# loadings = pd.DataFrame(
#     pca.components_.T,  # transpose the matrix of loadings
#     columns=component_names,  # so the columns are the principal components
#     index=im_X_scaled.columns,  # and the rows are the original features
# )
# sns.heatmap(loadings)
# plot_variance(pca)
# # loadings
# # abs_loadings = loadings.abs()

# # # Find the feature with the highest loading for each principal component
# # top_features = abs_loadings.idxmax()
# # top_features_values = abs_loadings.max()

# # # Combine into a DataFrame for easier analysis
# # top_features_df = pd.DataFrame({
# #     "Top Feature": top_features,
# #     "Max Loading": top_features_values,
# # })

# # top_features_df.sort_values(by='Max Loading', ascending=False).head(10)

In [20]:
# # Define threshold for high loading
# threshold = 0.3

# # Initialize a new DataFrame for the generated features
# new_features = pd.DataFrame(index=loadings.index)

# # Iterate through principal components
# for pc in loadings.columns:
#     # Create a binary feature for high loading
#     new_features[f"{pc}_high_loading"] = (loadings[pc].abs() > threshold).astype(int) * loadings[pc]

# # Optionally, aggregate rows with high loadings (e.g., mean of high-loading values)
# high_loading_means = loadings[loadings.abs() > threshold].mean(axis=1)
# new_features["high_loading_mean"] = high_loading_means

# # print("Original Loadings:")
# # print(loadings)

# print("\nGenerated Features:")
# new_features

# def features(df, loadings=new_features):
#     for i in loadings:
#         df[i] = 0
#         for index, j in enumerate(loadings[i]):
#             df[i] += df[loadings.index[index]] * j
#     return df

In [21]:
# class FeatureCreator(BaseEstimator, TransformerMixin):
#     def __init__(self, column_transformer):
#         self.column_transformer = column_transformer
    
#     def fit(self, X, y=None):
#         self.column_transformer.fit(X, y)
#         # Get the output column names
#         self.feature_names = self.get_feature_names_out(X)
#         return self
    
#     def transform(self, X, loadings=new_features):
#         X_transformed = self.column_transformer.transform(X)
#         X = pd.DataFrame(X_transformed, columns=self.feature_names, index=X.index)

#         # for i in loadings:
#         #     X[i] = 0
#         #     for index, j in enumerate(loadings[i]):
#         #         X[i] += X[loadings.index[index]] * j
#         return feature_engineering(X)
    
#     def get_feature_names_out(self, X):
#         """Extract feature names from the column transformer."""
#         output_features = []
#         for name, transformer, columns in self.column_transformer.transformers_:
#             if transformer == "drop":
#                 continue
#             elif transformer == "passthrough":
#                 output_features.extend(columns)
#             else:
#                 # Handle transformers like OneHotEncoder
#                 if hasattr(transformer, "get_feature_names_out"):
#                     feature_names = transformer.get_feature_names_out(columns)
#                     output_features.extend(feature_names)
#                 else:
#                     output_features.extend(columns)
#         return output_features

In [22]:
# X_ = im_X[numerical_cols].copy()
# y_ = y
# X_ = X_.loc[:, high_corr_features]

# # `apply_pca`, defined above, reproduces the code from the tutorial
# pca, X_pca, loadings = apply_pca(X_)
# X_pca.index = X_.index
# X_pca

In [23]:
def get_score(n_estimators):
    """Return the average MAE over 3 CV folds of an XGBoost pipeline.

    Keyword argument:
    n_estimators -- the number of trees for the XGBRegressor

    Uses the notebook-level ``preprocessor``, ``X`` and ``y`` as they exist
    when the function is called. The per-fold MAE values are printed before
    their mean is returned.
    """
    # Build a fresh model from the argument. Previously n_estimators was
    # ignored and the global 100-tree `model` was scored on every call.
    my_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', XGBRegressor(n_estimators=n_estimators, random_state=0))
    ])
    # The scorer returns negated MAE; flip the sign to get positive errors.
    scores = -1 * cross_val_score(my_pipeline, X, y,
                              cv=3,
                              scoring='neg_mean_absolute_error')
    print(scores)
    return scores.mean()

In [24]:
# loadings = pd.DataFrame(
#     pca.components_.T,  # transpose the matrix of loadings
#     columns=component_names,  # so the columns are the principal components
#     index=im_X_scaled.columns,  # and the rows are the original features
# )
# sns.heatmap(loadings)
# loadings

In [25]:
# cv = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
# for train_idx, val_idx in cv.split(X, y):
#     # print(train_idx)
#     # print(val_idx)
#     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
#     pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('model', model)
#                      ])
#     pipeline.fit(train_X, train_y)
#     preds = pipeline.predict(val_X)
#     # print(preds)
#     print('kappa:', cohen_kappa_score(preds.round(), val_y, weights='quadratic'))

In [26]:
# # Preprocessing for numerical data
# numerical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='mean')),
#     # ('scaler', StandardScaler())
# ])

# # Preprocessing for categorical data
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder())
# ])

# # Bundle preprocessing for numerical and categorical data
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ]
# )

# # Define model
# model = XGBRegressor(n_estimators=1000, random_state=0)
# voting_regressor = VotingRegressor(estimators=[
#     ('lightgbm', LGBMRegressor()),
#     ('xgboost', XGBRegressor()),
#     ('catboost', CatBoostRegressor())
# ])
# # Bundle preprocessing and modeling code in a pipeline
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', voting_regressor)
# ])

# # scores = -1 * cross_val_score(pipeline, X[cols], y, cv=5, scoring='neg_mean_absolute_error')

# # print('MAE scores:\n', scores.mean())

# val_X_preprocessed = pipeline.named_steps['preprocessor'].fit_transform(val_X)
# # test_set_preprocessed = pipeline.named_steps['preprocessor'].fit_transform(test_set)

# # Preprocessing of training data, fit model 
# pipeline.fit(
#     train_X, train_y,
#     # X, y,
#     # im_X, y,
#     # model__eval_set=[(val_X_preprocessed, val_y)],
# )
# # plt.figure(figsize=(100, 10))
# # plot_importance(model, max_num_features=10)
# # Preprocessing of validation data, get predictions
# preds = pipeline.predict(val_X)
# score = pipeline.score(val_X, val_y)

# # print('kappa:', cohen_kappa_score(preds.round().clip(0, 3), val_y, weights='quadratic'))
# print(cohen_kappa_score(get_sii(preds), get_sii(val_y), weights='quadratic'))
# score

In [27]:
# pipeline.predict(test_set)

In [28]:
# cohen_kappa_score(preds.round().clip(0, 3), val_y, weights='quadratic')

In [29]:
# importances = model.feature_importances_
# indices = np.argsort(importances)[::-1]

# plt.figure(figsize=(20, 5))
# plt.title("Feature Importances")
# plt.bar(range(val_X_preprocessed.shape[1]), importances[indices], color="r", align="center")
# plt.xticks(range(val_X_preprocessed.shape[1]), [val_X.columns[i] for i in indices], rotation=90)
# plt.xlim([-1, val_X_preprocessed.shape[1]])
# plt.show()

In [30]:
# preds.round().astype(int)

In [31]:
# # Save test predictions to file
# output = pd.DataFrame({'id': test.id.values,
#                        'sii': preds.round().astype(int)})
# output.to_csv('submission.csv', index=False)