<a name = "Libraries"></a>
## 1. Import relevant libraries

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
# Show every column when a DataFrame is displayed (no truncation)
pd.set_option('display.max_columns', None)
import warnings
from sklearn.model_selection import KFold,StratifiedKFold,RepeatedStratifiedKFold,train_test_split
import lightgbm as lgb 
import xgboost as xgb

# Random seed made available to later cells for reproducibility
seed = 111
from scipy import stats as st

# Silence all warnings for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings from the ML libraries.
warnings.filterwarnings('ignore')

<a name = "Load"></a>
## 2. Load files

In [None]:
# Load the competition CSVs into DataFrames (train, test, submission template)
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/landslide/Train.csv',
        '../input/landslide/Test.csv',
        '../input/landslide/SampleSubmission.csv',
    )
)

# Keep the test identifiers aside; they are needed to build the submission
sample = test['Sample_ID']

In [None]:
# Sample_ID is an identifier, not a predictor: remove it from both frames
train = train.drop(columns=['Sample_ID'])
test = test.drop(columns=['Sample_ID'])

<a name = "Preview"></a>
## 3. Feature engineering

In [None]:
# Row-wise summary statistics for every feature family in the training set.
# Each entry is (column-name suffix, DataFrame reduction method). The suffixes
# and their order match the test-set cell, so both frames get the same columns.
row_stats = [
    ('',  'mean'),
    ('1', 'std'),
    ('4', 'median'),
    ('5', 'var'),
    ('6', 'sum'),
    ('7', 'max'),
    ('8', 'min'),
]

for family in ['elevation', 'geology', 'lsfactor', 'placurv', 'procurv', 'sdoif', 'slope', 'twi', 'aspect']:
    for suffix, stat in row_stats:
        # NOTE(review): the matching columns are re-selected before every
        # statistic, so columns created earlier in this loop (e.g. the mean
        # column named exactly `family`) are included in the later statistics.
        # This is kept deliberately so train and test features stay identical;
        # if it is changed, change the test-set cell in the same way.
        members = [col for col in train.columns if family in col]
        train[family + suffix] = getattr(train[members], stat)(axis = 1)

train.head()

In [None]:
# Row-wise summary statistics for every feature family in the test set.
# Each entry is (column-name suffix, DataFrame reduction method); suffixes and
# order are the same as in the training-set cell.
row_stats = [
    ('',  'mean'),
    ('1', 'std'),
    ('4', 'median'),
    ('5', 'var'),
    ('6', 'sum'),
    ('7', 'max'),
    ('8', 'min'),
]

for family in ['elevation', 'geology', 'lsfactor', 'placurv', 'procurv', 'sdoif', 'slope', 'twi', 'aspect']:
    for suffix, stat in row_stats:
        # NOTE(review): columns are re-selected before each statistic, so the
        # aggregates added earlier in this loop are part of the later ones.
        # Keep this in sync with the training-set cell.
        members = [col for col in test.columns if family in col]
        test[family + suffix] = getattr(test[members], stat)(axis = 1)

test.head()

### The dataset is imbalanced, so we oversample the minority class with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN

# Separate the target from the predictors
y = train.Label
X = train.drop('Label', axis=1)

# Oversample the minority class with synthetic points.
# SMOTE is stochastic, so it is seeded with the notebook-wide `seed`
# to make the resampled data identical on every run.
X_resampled, y_resampled = SMOTE(random_state=seed).fit_resample(X, y)

<a name = "Model"></a>
## 9. Model training: 10-fold StratifiedKFold with XGBClassifier

In [None]:
%%time
tess = test

# Stratified Validation
folds = StratifiedKFold(n_splits = 10)

# Dataframe to store feature importance
feature_importance_df = pd.DataFrame()

# Lists to store predictions and losses
season_predictions_xgb = []
losses = []
for i,( train_index, test_index) in enumerate(folds.split(X_resampled, y_resampled)):
  X_train, X_test, y_train, y_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index], y_resampled[train_index], y_resampled[test_index]

  # Instantiate model
  model1 = xgb.XGBClassifier(
                             n_estimators = 5000,
                             colsample_bytree=0.7,
                             subsample =0.6,
                             seed=11,
                             random_state = 11,
                           
                            )

  # Train model
  model1.fit(X_train, y_train,
            eval_set=[(X_test, y_test)],
            early_stopping_rounds=400,
            verbose = 1000,
)

  # Make predictions
  preds = model1.predict_proba(tess)
  y_pred = model1.predict_proba(X_test)

  # Append predictions and losses
  season_predictions_xgb.append(preds)
  loss = f1_score(y_test, model1.predict(X_test))

  # Append feature importance per fold
  #fold_importance_df = pd.DataFrame({'feature': X_train.columns.tolist(), 'importance': model.feature_importances_})
  #feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

  # Print loss
  print(f'{i+1}:  {loss}\n')
  losses.append(loss)

print(f'Mean Loss: {np.mean(losses)}')

## Average the predicted probabilities across the 10 folds

In [None]:
# Fold-averaged class probabilities from the XGBoost models (one row per test sample)
xgb_preds = np.asarray(season_predictions_xgb).mean(axis=0)

In [None]:
# Decision threshold applied to the positive-class probability
XGB_THRESHOLD = 0.3

# Reuse the fold-averaged probabilities instead of recomputing the mean
preds = (xgb_preds[:, 1] >= XGB_THRESHOLD).astype(int)
sub_file = pd.DataFrame({'Sample_ID': sample, 'Label': preds})
# Check the distribution of your predictions
sns.countplot(x = sub_file.Label)
plt.title('Predicted Variable Distribution');

## 10. Model training: 10-fold StratifiedKFold with LGBMClassifier

In [None]:
%%time
tess = test

# Stratified Validation
folds = StratifiedKFold(n_splits = 10)

# Dataframe to store feature importance
feature_importance_df = pd.DataFrame()

# Lists to store predictions and losses
season_predictions_lgb = []
losses = []
for i,( train_index, test_index) in enumerate(folds.split(X_resampled, y_resampled)):
    X_train, X_test, y_train, y_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index], y_resampled[train_index], y_resampled[test_index]

  # Instantiate model
    model2 = lgb.LGBMClassifier(
    boosting_type= "gbdt",
    colsample_bytree= 0.9,
    learning_rate = 0.05,
    n_estimators = 3000,
    objective ='binary',
    random_state = 2022,)

  # Train model
    model2.fit(X_train, y_train,
            eval_set=[(X_test, y_test)],
            early_stopping_rounds=400,
            verbose = 1000,
)

  # Make predictions
    preds = model2.predict_proba(tess)
    y_pred = model2.predict_proba(X_test)

  # Append predictions and losses
    season_predictions_lgb.append(preds)
    loss = f1_score(y_test, model2.predict(X_test))

  # Append feature importance per fold
    #fold_importance_df = pd.DataFrame({'feature': X_train.columns.tolist(), 'importance': model.feature_importances_})
    #feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

  # Print loss
    print(f'{i+1}:  {loss}\n')
    losses.append(loss)

print(f'Mean Loss: {np.mean(losses)}')

## Average the predicted probabilities across the 10 folds

In [None]:
# Fold-averaged class probabilities from the LightGBM models (one row per test sample)
lgb_preds = np.asarray(season_predictions_lgb).mean(axis=0)

In [None]:
# Decision threshold applied to the positive-class probability
LGB_THRESHOLD = 0.28

# Reuse the fold-averaged probabilities instead of recomputing the mean
preds = (lgb_preds[:, 1] >= LGB_THRESHOLD).astype(int)
sub_file = pd.DataFrame({'Sample_ID': sample, 'Label': preds})
# Check the distribution of your predictions
sns.countplot(x = sub_file.Label)
plt.title('Predicted Variable Distribution');

### 11. Ensemble the XGBoost and LightGBM predictions and create the submission file

In [None]:

# Equal-weight blend of the two models' fold-averaged probabilities
last_pred = lgb_preds*0.5 + xgb_preds*0.5

# Label a sample positive when its blended positive-class probability reaches 0.3
pred = np.where(last_pred[:, 1] >= 0.3, 1, 0)
sub_file = pd.DataFrame({'Sample_ID': sample, 'Label': pred})

# Check the distribution of your predictions
sns.countplot(x = sub_file.Label)
plt.title('Predicted Variable Distribution');

# Create a csv file and upload to zindi 
sub_file.to_csv('Baseline.csv', index = False)
sub_file.head()

<a name = "Tips"></a>
## 12. Tips to improve model performance
 - Use cross-validation techniques
 - Feature engineering
 - Handle the class imbalance of the target variable
 - Try different modelling techniques - Stacking classifier, Voting classifiers, ensembling...
 - Data transformations
 - Feature Selection techniques such as RFE, Tree-based feature importance...
 - Domain Knowledge, do research on how the provided features affect landslides, soil topology...

# An upvote would be very much appreciated 😊


