In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler


In [2]:
train=pd.read_csv("Kaggle_Training_Dataset_v2.csv")
test=pd.read_csv("Kaggle_Test_Dataset_v2.csv")
train.shape,test.shape

((1687861, 23), (242076, 23))

In [13]:

def preprocess(df):
    #train['lead_time']=train['lead_time'].fillna(train['lead_time'].median())
    drop_cols=['forecast_6_month','forecast_9_month','perf_12_month_avg','sales_3_month','sales_6_month','sales_9_month','sku','rev_stop']
    df.drop(columns=drop_cols,inplace=True)
    cat_cols=['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk', 'stop_auto_buy']
    num_cols=['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3_month', 'sales_1_month', 'min_bank', 'pieces_past_due', 'perf_6_month_avg', 'local_bo_qty']
    num_pipeline=Pipeline(
        steps=[
            ('imputer',SimpleImputer(strategy="median")),
            ('scaler',StandardScaler())
        ])
    potential_issue=['No', 'Yes']
    deck_risk=['No', 'Yes']
    oe_constraint=['No', 'Yes']
    ppap_risk=['No', 'Yes']
    stop_auto_buy=['Yes', 'No']
    rev_stop= ['No', 'Yes']
    cat_pipeline = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy="most_frequent")),
            ('labelencoder',OrdinalEncoder(categories=[potential_issue,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop]))
        ]
    )
    preprocessor=ColumnTransformer([
        ('num_pipeline',num_pipeline,num_cols),
        ('cat_pipeline',cat_pipeline,cat_cols)
    ])
    new_df=preprocessor.fit_transform(df)
    return pd.DataFrame(new_df,columns=df.columns)


    

In [14]:
train.dropna(subset=['went_on_backorder'],inplace=True)
target='went_on_backorder'
new_train=preprocess(train.drop(columns=target))
y=train[target]

ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
maps={"No":0,"Yes":1}
y.unique()

In [None]:
y=y.map(maps)
y.unique()

In [None]:
X_train,X_test,y_train,y_test=train_test_split(new_train,y,test_size=0.2,random_state=1)

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
over = SMOTE(sampling_strategy=0.1,random_state=1)
under = RandomUnderSampler(sampling_strategy=0.5,random_state=1)
x_smoted,y_smoted=over.fit_resample(X_train,y_train)
x_combined,y_combined=under.fit_resample(x_smoted,y_smoted)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [11]:
rf=RandomForestClassifier()

In [12]:
rf

In [17]:
np.linspace(1,1000,)

array([   1.,  112.,  223.,  334.,  445.,  556.,  667.,  778.,  889.,
       1000.])

In [23]:
params={
    'n_estimators':[10,20,30,60,100,150,200],
    'criterion':['gini','entopy','log_loss'],
    'max_depth':[10,20,30,40,50],
    'min_samples_split':[2,3,4,5,6],
    'min_samples_leaf':[1,2],
    'ccp_alpha':[0.01,0.1,0.5,0.7,1],
    'max_samples':[100,500,1000,1500,5000,6000,3000]
    
}

In [24]:
grid_search = RandomizedSearchCV(rf, params, cv=5,
 scoring='roc_auc',verbose=True)


In [25]:
grid_search.fit(x_combined,y_combined)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [27]:
grid_search.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_samples': 6000,
 'max_depth': 30,
 'criterion': 'gini',
 'ccp_alpha': 0.01}

In [29]:
preds=grid_search.best_estimator_.predict(X_test)

In [30]:
from sklearn.model_selection import cross_val_score

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_train,preds))