In [52]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from supervised import AutoML
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score

In [87]:
# Load the raw training and test tables. The paths are relative to the
# notebook's working directory.
train_data = pd.read_csv('../data_new/train.csv')
test = pd.read_csv('../data_new/test.csv')



In [55]:
# The label block is the trailing 29 columns of the raw training table;
# slice it once and read the column names off that slice.
target = train_data.iloc[:, -29:]
target_cols = target.columns

In [56]:
# Keep only the training rows in which no label column equals 9.
# NOTE(review): 9 is presumably a "missing/unknown" label code - confirm
# against the data dictionary.
# Vectorized equivalent of the row-wise `9 not in row.values` check: flag the
# rows containing a 9 anywhere in the label block, then invert.
mask = ~train_data[target_cols].eq(9).any(axis=1)
train = train_data[mask]

In [57]:
# Dimensions of the training frame after rows containing a 9 label were removed.
train.shape

(1356, 365)

In [58]:
# Columns discarded before encoding.
_DROP_COLS = ['Patient_ID', 'Chip_Code', 'Chip_Image_Name',
              'Food_Type_0', 'French_Residence_Department']

# Categorical columns expanded into one-hot indicator columns.
_ONE_HOT_COLS = ['French_Region', 'Chip_Type',
                 'Blood_Month_sample', 'Skin_Symptoms',
                 'Treatment_of_rhinitis', 'Treatment_of_athsma',
                 'General_cofactors', 'Treatment_of_atopic_dematitis',
                 'Age_of_onsets', 'Rural_or_urban_area']


def _encode_features(features):
    """Apply the shared feature pipeline to a frame that holds no label columns.

    Steps: normalise ``Treatment_of_rhinitis`` to the text before its first
    '.', drop ``_DROP_COLS``, then one-hot encode ``_ONE_HOT_COLS``.

    The input frame is never modified; a new frame is returned.
    """
    # Cast to str and keep the part before the first '.', so e.g. '1.0'
    # becomes '1'. Missing values become the string 'nan' and therefore get
    # their own indicator column.
    rhinitis = features['Treatment_of_rhinitis'].astype(str).str.split(pat='.', expand=True)[0]

    # `assign` returns a new frame, so the caller's data stays untouched.
    encoded = features.assign(Treatment_of_rhinitis=rhinitis)
    encoded = encoded.drop(_DROP_COLS, axis=1)

    return pd.get_dummies(encoded, columns=_ONE_HOT_COLS)


def prepare_train(train, n_targets=29):
    """Split a raw training frame into encoded features and labels.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw training rows whose last ``n_targets`` columns are the labels.
    n_targets : int, default 29
        Number of trailing label columns. Must be positive.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The encoded feature frame and the label frame, in that order.

    Raises
    ------
    ValueError
        If ``n_targets`` is not positive.

    Notes
    -----
    One-hot encoding is computed on this frame alone, so the resulting columns
    can differ from those produced by ``prepare_test``; align the two frames
    before modelling.
    """
    if n_targets <= 0:
        raise ValueError("n_targets must be a positive integer")

    target = train.iloc[:, -n_targets:]
    train_df = train.drop(columns=target.columns)

    return _encode_features(train_df), target


def prepare_test(train):
    """Encode a raw test frame with the same pipeline as ``prepare_train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw test rows (no label columns). The parameter keeps its historical
        name for backward compatibility. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The encoded feature frame.
    """
    return _encode_features(train)

In [59]:
# Build from the filtered raw frame rather than from `train` itself. `train` is
# rebound to the encoded features here, so feeding it back in on a re-run would
# silently treat the last 29 encoded feature columns as the labels.
train, target = prepare_train(train_data[mask])

In [60]:
# Encode the test features.
# NOTE(review): this rebinds `test`, so the raw test table is no longer
# available under that name in later cells, and re-running this cell feeds the
# already-encoded frame back into prepare_test.
test = prepare_test(test)

In [61]:
# Names of the columns at positions 3..320 of the encoded training frame; the
# following cells drop them from both the train and the test features.
# NOTE(review): 3:321 are positional magic numbers - confirm which feature
# group they cover and that they still hold if the input schema changes.
extra_cols = train.iloc[:,3:321].columns

In [62]:
# Training features without the excluded column block.
clean_train = train.drop(columns=extra_cols)

In [63]:
# Test features without the same excluded column block.
clean_test = test.drop(columns=extra_cols)

In [64]:
# Dimensions of the reduced training feature frame.
clean_train.shape

(1356, 123)

In [65]:
# Remove the trustii_id column from the test features.
clean_test = clean_test.drop(columns='trustii_id')

In [66]:
# Dimensions of the reduced test feature frame. The recorded output has fewer
# columns than clean_train, because train and test were one-hot encoded
# separately.
clean_test.shape

(586, 91)

In [67]:
# NOTE(review): repeats the clean_train.shape check from a few cells earlier;
# nothing has changed clean_train in between.
clean_train.shape

(1356, 123)

In [70]:
# Align train and test on the feature columns they share. The two frames were
# one-hot encoded separately, so their column sets differ.
#
# Duplicate column labels are removed first (keeping the first occurrence),
# because selecting by label from a frame with duplicate labels does not yield
# one column per name.
df1 = clean_train.loc[:, ~clean_train.columns.duplicated()]
df2 = clean_test.loc[:, ~clean_test.columns.duplicated()]

# Keep the shared columns in training-frame order. A set would give an
# arbitrary order that can change between kernel sessions, and with it the
# feature order seen by the model.
common_cols = [col for col in df1.columns if col in df2.columns]

X_updated = df1[common_cols]
test_updated = df2[common_cols]

In [69]:
# Drop duplicated column labels from clean_train, keeping the first occurrence.
# NOTE(review): this cell has execution count 69 but sits below the cell that
# derives X_updated from clean_train (count 70). Run top to bottom, this
# de-duplication therefore happens after X_updated has already been built.
clean_train = clean_train.loc[:,~clean_train.columns.duplicated()].copy()

In [71]:
# NOTE(review): imports belong in the first cell. OneVsRestClassifier and
# GridSearchCV are already imported there; train_test_split, time and KFold
# are the only new names.
from sklearn.model_selection import train_test_split
import time
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV, KFold
# Hold out 20% of the aligned training rows, with a fixed seed for repeatability.
# NOTE(review): none of the cells visible in this notebook read x_train,
# x_test, y_train or y_test; the search and the final model fit on the full
# X_updated.
x_train, x_test, y_train, y_test = train_test_split(X_updated, target, test_size=0.2, random_state=17)

In [None]:
# Hyperparameter search for a one-vs-rest XGBoost model, with one binary
# classifier per label column.
#
# Cost: 3 * 3 * 2 * 2 * 2 = 72 parameter combinations x 5 folds = 360
# one-vs-rest fits, and every one of those trains a separate booster per label
# column. This cell has no execution count, i.e. it was not run in the saved
# notebook.
clf = xgb.XGBClassifier()

# Create the One-vs-Rest classifier
ovr_clf = OneVsRestClassifier(clf)

# Define the hyperparameters and their possible values.
# The 'estimator__' prefix routes each value to the wrapped XGBClassifier.
param_grid = {
    'estimator__n_estimators': [100, 200, 300],  # Number of trees
    'estimator__max_depth': [3, 5, 7],  # Maximum depth of each tree
    'estimator__learning_rate': [0.1, 0.01],  # Learning rate
    'estimator__subsample': [0.8, 1.0],  # Subsample ratio of the training instances
    'estimator__colsample_bytree': [0.8, 1.0]  # Subsample ratio of columns when constructing each tree
}

# Define the number of folds for cross-validation
n_folds = 5

# Create a shuffled, seeded cross-validation splitter with n_folds folds
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Create a GridSearchCV object with the OvR classifier, hyperparameters, and cross-validation.
# NOTE(review): with a multilabel target, 'accuracy' is subset accuracy (a row
# counts only if every label matches). The file imports f1_score and
# make_scorer but never uses them - confirm which metric is intended.
grid_search = GridSearchCV(ovr_clf, param_grid, cv=kfold, scoring='accuracy')

# Perform hyperparameter tuning and time it.
# The search runs on the full aligned training set, not on the x_train split.
start_time = time.time()
grid_search.fit(X_updated, target)
end_time = time.time()

# Print the best parameters and the corresponding score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Print the execution time
execution_time = end_time - start_time
print("Execution time:", execution_time, "seconds")

In [72]:
# Base binary classifier with fixed hyperparameters.
# NOTE(review): every value lies inside the grid searched above, so these are
# presumably the selected parameters - the notebook does not record that, confirm.
xgb_classifier = xgb.XGBClassifier(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
    subsample=0.8
)

In [73]:
# Wrap the base classifier so that one copy is trained per label column.
model = OneVsRestClassifier(xgb_classifier)

In [80]:
# Train on the full aligned training set; no rows are held out at this point.
model.fit(X_updated, target)



In [81]:
# Predict all labels for the test rows, using the same column set and order as
# the training features.
pred = model.predict(test_updated)

In [84]:
def merge_pred(test, predictions, name, target_cols):
    """Append predictions to the test rows and write them to a CSV submission.

    Parameters
    ----------
    test : pandas.DataFrame
        The rows the predictions were made for, in prediction order.
    predictions : array-like of shape (len(test), len(target_cols))
        Predicted label values, one row per test row.
    name : str
        Prefix of the output file; the result is written to
        ``"<name>_submission.csv"`` without the index.
    target_cols : sequence of str
        Column names for the predicted labels.

    Raises
    ------
    ValueError
        If the number of prediction rows differs from the number of test rows.
    """
    pred_df = pd.DataFrame(predictions, columns=target_cols)

    if len(pred_df) != len(test):
        raise ValueError(
            "predictions has {} rows but test has {}".format(len(pred_df), len(test))
        )

    # concat(axis=1) aligns on index labels. Predictions are positional, so
    # give them the test index; otherwise a filtered or re-indexed test frame
    # would produce misaligned rows padded with NaN.
    pred_df.index = test.index

    concatenated_df = pd.concat([test, pred_df], axis=1)
    concatenated_df.to_csv("{}_submission.csv".format(name), index=False)

In [90]:
# `test` was rebound to its one-hot encoded version by prepare_test, while the
# recorded run merged the predictions with the raw test table (the load cell
# was re-executed first). Reload the raw file explicitly so that a top-to-bottom
# run writes the same submission.
raw_test = pd.read_csv('../data_new/test.csv')
merge_pred(raw_test, pred, '30_6', target_cols)

In [88]:
# Dimensions of `test`.
# NOTE(review): the recorded output (586, 337) appears to be the raw CSV,
# reloaded out of order (the load cell has execution count 87). Run top to
# bottom, `test` is the encoded frame at this point.
test.shape

(586, 337)