In [None]:
import pandas as pd
from pathlib import Path

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler # Used for under sampling. explained further in notebook.
import collections
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import optuna


In [None]:
# Read the competition tables from the ../input directory (relative to the
# notebook's working directory): the labelled training set, the unlabelled test
# set, and the sample submission that is filled in at the end of the notebook.
df_train_og = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
df_test_og  = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')
submission  = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')


In [None]:
# (rows, columns) of the raw training table.
df_train_og.shape


In [None]:
# First rows of the raw training table, to eyeball column names and value ranges.
df_train_og.head()


In [None]:
# Number of distinct values per column.
# NOTE(review): presumably this is what motivates dropping Soil_Type7 and
# Soil_Type15 further down (single-valued columns) — confirm against the output.
df_train_og.nunique()


In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to narrower dtypes to save memory.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range contains the column's minimum and maximum. The range check is
    inclusive, so a column spanning exactly -128..127 becomes int8.

    Float columns are cast to float32 when their minimum and maximum lie
    strictly inside the float32 range, otherwise to float64. The float32 cast
    can lose precision. Columns of any other dtype are left untouched.

    The frame is modified in place and the same object is returned.

    Args:
        df: pandas DataFrame whose columns should be downcast.
        verbose: when true, print the resulting memory footprint (in Mb) and
            the percentage saved relative to the footprint on entry.

    Returns:
        ``df`` itself, with its numeric columns downcast.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)  # narrowest first
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Pick the first (narrowest) integer type that can hold both
            # extremes. If min/max are NaN (empty column) nothing matches and
            # the column keeps its dtype.
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            float32_bounds = np.finfo(np.float32)
            if c_min > float32_bounds.min and c_max < float32_bounds.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out-of-range, infinite or NaN extremes: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))

    return df


In [None]:
# reduce_mem_usage downcasts each frame's columns and hands back that same
# frame, so the *_og names would only be aliases afterwards; drop them.
df_train, df_test = (reduce_mem_usage(frame) for frame in (df_train_og, df_test_og))
del df_train_og, df_test_og


In [None]:
# Count the training rows per Cover_Type class, draw the counts as a bar chart
# and print the exact numbers.
cat_count = collections.Counter(df_train['Cover_Type'])
cat, cat_freq = cat_count.keys(), cat_count.values()
plt.bar(cat, cat_freq)

print(cat_count)


In [None]:
# Remove every row labelled Cover_Type 4 or 5 from the training data; the model
# trained below can therefore never predict those two classes.
# NOTE(review): presumably they are too rare to learn from — confirm against the
# class counts printed above.
df_train = df_train[~df_train['Cover_Type'].isin([4, 5])]


In [None]:
# Balance the classes by randomly under-sampling every class except the
# minority one. The sampler is seeded so that the resampled set, and every
# score derived from it, is the same on each Restart & Run All.
rus = RandomUnderSampler(sampling_strategy="not minority", random_state=42)

# Features: everything except the row id, the target, and two soil-type columns.
X = df_train.drop(columns=['Id', 'Cover_Type', 'Soil_Type7', 'Soil_Type15'])
y = df_train['Cover_Type']
X_res, y_res = rus.fit_resample(X, y)


In [None]:
# Same class-count chart as before, now on the under-sampled labels, to check
# that the classes are balanced.
cat_count = collections.Counter(y_res)
cat, cat_freq = cat_count.keys(), cat_count.values()
plt.bar(cat, cat_freq)

print(cat_count)


In [None]:
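# Superseded: Soil_Type7 and Soil_Type15 are already dropped when X is built
# before resampling, so X_res / y_res are used directly from here on.
# X = X_res.drop(columns = ['Soil_Type7' , 'Soil_Type15'])
# y = y_res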


In [None]:
from sklearn.feature_selection import SelectKBest,f_classif

# Score every feature against the target with f_classif. k="all" keeps all
# columns: the selector is used here only to obtain the per-feature scores.
selector = SelectKBest(f_classif, k="all")
fitter = selector.fit(X_res, y_res)

# Pair each score with its column name (both are in X_res column order).
scores_df = pd.DataFrame(fitter.scores_)
columns_df = pd.DataFrame(X_res.columns)
featurescores = pd.concat([scores_df, columns_df], axis=1)
featurescores.columns = ['score', 'column name']

plt.figure(figsize=(20, 5))
plt.bar(featurescores['column name'], featurescores['score'], width=0.4)
plt.xticks(rotation='vertical')
plt.ylabel('f_classif score')
plt.title('Univariate feature scores')
# Render the figure; the previous bare plt.plot() drew nothing and only
# produced an empty-list output.
plt.show()


In [None]:
# Order the features from highest to lowest score so the top-k can be taken
# from the head of the frame.
featurescores = featurescores.sort_values(by = 'score' , ascending=False)


In [None]:
# Earlier hand-picked shortlist, kept for reference:
# useful_features = ['Elevation' ,'Wilderness_Area4' , 'Soil_Type10' , 'Wilderness_Area3' , 'Horizontal_Distance_To_Roadways' , 
#                'Wilderness_Area1' , 'Soil_Type39', 'Horizontal_Distance_To_Fire_Points' ,'Soil_Type38','Soil_Type40']

# Current choice: the column names of the first 20 rows of featurescores, which
# is sorted by score in descending order at this point.
useful_features = featurescores['column name'].iloc[:20]


In [None]:
# Show the selected feature names.
useful_features


In [None]:
# Keep only the selected feature columns, in the order given by useful_features.
X_res = X_res[useful_features.tolist()]


In [None]:
# Hold out 20% of the resampled data for scoring the hyper-parameter trials.
# The split is seeded so every run tunes against the same hold-out rows.
x_train, x_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42
)


In [None]:
def objective_xgb(trial):
    """Optuna objective: hold-out accuracy of a StandardScaler -> XGBClassifier pipeline.

    Hyper-parameters are sampled from ``trial``. The pipeline is fitted on the
    module-level ``x_train`` / ``y_train`` and scored on ``x_test`` / ``y_test``.

    Args:
        trial: the Optuna trial object used to sample hyper-parameters.

    Returns:
        Accuracy of the fitted pipeline on the hold-out split (to be maximised).
    """
    # NOTE(review): recent XGBoost releases reject class labels that are not
    # 0..n_classes-1, and y_train presumably still carries the original
    # Cover_Type codes — confirm against the installed XGBoost version.
    xgb_params = {
        # Fixed settings. They are not sampled from the trial, so they do not
        # appear in study.best_params and must be re-supplied by whoever
        # rebuilds the model from the best trial.
        'learning_rate': 0.01,
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        # Searched settings. `step` is passed by keyword, matching the
        # suggest_float calls below; newer Optuna versions no longer accept it
        # positionally.
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000, step=100),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 100),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 100),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0, step=0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 10),
        'gamma': trial.suggest_float('gamma', 0, 20),
    }

    pipe = Pipeline(steps=[
        ('step1', StandardScaler()),
        ('step2', XGBClassifier(**xgb_params)),
    ])

    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)
    return accuracy_score(y_test, y_pred)


In [None]:
# Create a study that maximises the objective's return value (hold-out
# accuracy) and run 50 trials of objective_xgb.
study_xgb= optuna.create_study(direction = 'maximize')
study_xgb.optimize(objective_xgb, n_trials=50)


In [None]:
# Parameter values of the study's best trial.
best_params_xgb = study_xgb.best_params


In [None]:
# study.best_params only contains the values that were *suggested* during the
# search. The settings that objective_xgb hard-codes (learning rate, tree
# method, booster) are not in it. Re-apply them here so the final model is the
# configuration that was actually evaluated, rather than one trained at
# XGBoost's default learning rate with an n_estimators tuned for 0.01.
final_xgb_params = {
    'learning_rate': 0.01,
    'tree_method': 'gpu_hist',
    'booster': 'gbtree',
    **best_params_xgb,
}

pipe = Pipeline(steps=[
    ('step1', StandardScaler()),
    ('step2', XGBClassifier(**final_xgb_params)),
])


In [None]:
# Fit the final pipeline on the training split only; the 20% hold-out split is
# not used for fitting.
pipe.fit(x_train,y_train)


In [None]:
# Reduce the test table to the selected feature columns (same names and order
# as the training features), then predict a Cover_Type for every test row.
df_test = df_test[useful_features.tolist()]
Final_pred = pipe.predict(df_test)


In [None]:
# Put the predictions into the sample-submission frame and write it to
# Submission.csv without the DataFrame index column.
submission['Cover_Type'] = Final_pred
submission.to_csv('Submission.csv' , index=False)
