In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv


In [2]:
# Load the competition's test and train tables from the read-only input directory.
test = pd.read_csv('/kaggle/input/playground-series-s5e6/test.csv')
train = pd.read_csv("/kaggle/input/playground-series-s5e6/train.csv")

In [3]:
train

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [4]:
# Columns that are one-hot encoded during preprocessing.
categorical = ['Soil Type', 'Crop Type']
# Numeric feature columns. The dataset itself spells the temperature column
# "Temparature" (see the displayed train frame), so that exact spelling is
# required for the name to match a real column.
numerical = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']

In [5]:
from sklearn.preprocessing import OneHotEncoder
    
def preprocess_df(df, categorical, numerical, mode = 'train', categories = None):
    """Forward-fill, one-hot encode and (for training data) label-encode a frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is not modified; a new frame is returned.
    categorical : list of str
        Columns that are replaced by one-hot indicator columns named
        "<column>_<category>" (e.g. "Soil Type_Sandy"), appended at the end
        of the frame in sorted category order.
    numerical : list of str
        Accepted for interface compatibility; currently unused. All other
        columns pass through unchanged apart from the forward fill.
    mode : str, default 'train'
        When 'train', the 'Fertilizer Name' column is mapped to integer class
        ids 0-6 (names outside the mapping become NaN). Any other value skips
        the target handling.
    categories : dict, optional
        Maps a categorical column name to the list of category values to
        encode. When given for a column, exactly those indicator columns are
        produced in that order, and values outside the list yield an all-zero
        row. Pass the categories seen in training to give another split the
        same column layout. When omitted, the categories are inferred from
        `df` itself.

    Returns
    -------
    pandas.DataFrame
        The processed copy of `df` with float (0.0 / 1.0) indicator columns.
    """
    # ffill() without inplace returns a new frame, so the caller's df is untouched.
    df = df.ffill()

    for column in categorical:
        values = df[column]
        known = None if categories is None else categories.get(column)
        if known is not None:
            # Fixing the category set up front pins the output columns; values
            # that are not listed become missing and therefore encode as zeros.
            values = values.astype(pd.CategoricalDtype(categories=list(known)))

        # Column names look like "Soil Type_Clayey", "Soil Type_Sandy".
        # Missing values that survive the forward fill encode as all-zero rows.
        one_hot_df = pd.get_dummies(values, prefix=column, dtype=float)

        df = pd.concat([df.drop(columns=[column]), one_hot_df], axis=1)

    if mode == 'train':
        fertilizer_names = {
            '28-28': 0,
            '17-17-17': 1,
            '10-26-26': 2,
            'DAP': 3,
            '20-20': 4,
            '14-35-14': 5,
            'Urea': 6
            }
        df['Fertilizer Name'] = df['Fertilizer Name'].map(fertilizer_names)

    return df

In [6]:
# Encode the training frame; 'train' mode also converts the target to class ids.
train = preprocess_df(train, categorical, numerical, mode='train')



In [7]:
train

Unnamed: 0,id,Temparature,Humidity,Moisture,Nitrogen,Potassium,Phosphorous,Fertilizer Name,Soil Type_Black,Soil Type_Clayey,...,Crop Type_Cotton,Crop Type_Ground Nuts,Crop Type_Maize,Crop Type_Millets,Crop Type_Oil seeds,Crop Type_Paddy,Crop Type_Pulses,Crop Type_Sugarcane,Crop Type_Tobacco,Crop Type_Wheat
0,0,37,70,36,36,4,5,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,27,69,65,30,6,18,0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,29,63,32,24,12,16,1,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,35,62,54,39,12,4,2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,35,58,43,37,2,16,3,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,8,16,6,0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
749996,749996,37,64,58,38,8,20,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
749997,749997,35,68,59,6,11,29,2,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
749998,749998,31,68,29,9,11,12,4,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Separate the encoded target from the feature matrix (the row id is not a feature).
y_train = train['Fertilizer Name']
x_train = train.drop(['id', 'Fertilizer Name'], axis=1)

In [9]:
import xgboost as xgb
# Gradient-boosted classifier: 100 depth-2 trees with a learning rate of 1.
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=2,
    learning_rate=1,
)

model.fit(x_train, y_train)

In [10]:
# Encode the test frame; 'test' mode skips the target mapping (there is no target column).
test = preprocess_df(test, categorical=categorical, numerical=numerical, mode='test')



In [11]:
# Select exactly the features the model was trained on, in training order.
# The test frame is one-hot encoded independently of train, so an indicator
# column that is absent from the test split is added as all zeros and any
# column the model never saw is left out. Selecting by x_train.columns also
# keeps this cell re-runnable: the 'preds' column written below (and 'id')
# can never leak into the feature matrix.
x_test = test.reindex(columns=x_train.columns, fill_value=0.0)
test['preds'] = model.predict(x_test)

In [12]:
# Class id -> fertilizer name; the list position of each name is its class id.
fertilizer_numbers = dict(enumerate([
    '28-28',
    '17-17-17',
    '10-26-26',
    'DAP',
    '20-20',
    '14-35-14',
    'Urea',
]))
# Turn the predicted class ids back into fertilizer names.
# NOTE: this single-label column is overwritten later by the top-3 prediction strings.
test['preds'] = test['preds'].map(fertilizer_numbers)

In [13]:
# Class-probability scores for the test features; ranked per row below to pick the top 3.
proba = model.predict_proba(x_test)

In [14]:
# argsort is ascending, so walk each row backwards from the end and keep three
# entries: the class indices with the highest probability, best first.
top3 = np.argsort(proba, axis=1)[:, :-4:-1]

In [15]:
# Translate every row of class indices into the matching fertilizer names.
mapped_top3 = np.array(
    [list(map(fertilizer_numbers.__getitem__, class_ids)) for class_ids in top3]
)

In [16]:
# One submission string per test row: the three names separated by single spaces.
# str.join avoids the trailing space that appending "name + ' '" in a loop leaves
# at the end of every prediction.
predictions = [' '.join(row) for row in mapped_top3]

In [17]:
# Replace the single-label predictions with the space-separated top-3 strings.
test['preds'] = predictions

In [18]:
test

Unnamed: 0,id,Temparature,Humidity,Moisture,Nitrogen,Potassium,Phosphorous,Soil Type_Black,Soil Type_Clayey,Soil Type_Loamy,...,Crop Type_Ground Nuts,Crop Type_Maize,Crop Type_Millets,Crop Type_Oil seeds,Crop Type_Paddy,Crop Type_Pulses,Crop Type_Sugarcane,Crop Type_Tobacco,Crop Type_Wheat,preds
0,750000,31,70,52,34,11,24,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,14-35-14 28-28 DAP
1,750001,27,62,45,30,14,15,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,17-17-17 20-20 10-26-26
2,750002,28,72,28,14,15,4,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10-26-26 20-20 14-35-14
3,750003,37,53,57,18,17,36,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14-35-14 17-17-17 10-26-26
4,750004,31,55,32,13,19,14,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,20-20 28-28 10-26-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,999995,26,66,30,14,7,18,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,14-35-14 17-17-17 20-20
249996,999996,33,62,55,28,14,7,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,14-35-14 20-20 10-26-26
249997,999997,36,53,64,28,11,27,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,14-35-14 DAP Urea
249998,999998,36,67,26,33,0,10,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,DAP 10-26-26 28-28


In [19]:
# Two-column submission frame: the row id and its predicted fertilizer names.
submission = test[['id', 'preds']].rename(columns={'preds': 'Fertilizer Name'})

In [20]:
# Write the submission to the working directory; index=False keeps the row index out of the file.
submission.to_csv('submission.csv', index = False)