## Importing the Required Libraries

In [4]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows',1000)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold,ShuffleSplit,GridSearchCV,StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from math import sqrt
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm


In [12]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
train.head()

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage
0,F00000001,188,1,0,1,0,0.0,0,1,0
1,F00000003,209,1,0,1,0,0.0,0,2,1
2,F00000004,257,1,0,1,0,0.0,0,2,1
3,F00000005,257,1,1,1,0,0.0,0,2,1
4,F00000006,342,1,0,1,0,0.0,0,2,1


In [13]:
# imputing all the null values with -1
train.fillna(-1, inplace=True)
test.fillna(-1,inplace=True)

In [14]:
train1 = train
test1 = test

In [15]:
# extracting the number from id column
train1['ID'] = train.apply(lambda x : int(x['ID'][1:]), axis = 1)
test1['ID'] = test.apply(lambda x : int(x['ID'][1:]), axis =1)
train1.head()

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage
0,1,188,1,0,1,0,0.0,0,1,0
1,3,209,1,0,1,0,0.0,0,2,1
2,4,257,1,0,1,0,0.0,0,2,1
3,5,257,1,1,1,0,0.0,0,2,1
4,6,342,1,0,1,0,0.0,0,2,1


In [16]:
#Appending both the datasets to create a combined dataset
result = train1.append(test1, sort=False)
result = result.reset_index(drop=True)
# sort by ID to find out patterns 
result= result.sort_values(by=['ID'])

In [17]:
result

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage
0,1,188,1,0,1,0,0.0,0,1,0.0
88858,2,188,1,1,1,0,-1.0,0,2,
1,3,209,1,0,1,0,0.0,0,2,1.0
2,4,257,1,0,1,0,0.0,0,2,1.0
3,5,257,1,1,1,0,0.0,0,2,1.0
...,...,...,...,...,...,...,...,...,...,...
148165,155941,3702,1,0,2,10,-1.0,48,1,
88856,155942,3702,1,0,2,10,25.0,18,3,0.0
148166,155943,3702,1,0,2,10,28.0,17,2,
148167,155944,3895,1,0,2,5,52.0,7,1,


In [18]:
# make the loop
# the features that we are targeting are - 
'''
1 - Group
2 - Group_Change 
3 - Group_First
4 - Group_Last
5 - Batch_Last
'''

# Initialization
result['Group'] = 0 
result['Group'][0] = 1
result['Group_Change'] = 0
result['Group_First'] = 0
result['Group_Last'] = 0
result['Batch_Last'] = 0
result['Group_First'][0] = 1
result['Soil_Change'] = 0
result

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage,Group,Group_Change,Group_First,Group_Last,Batch_Last,Soil_Change
0,1,188,1,0,1,0,0.0,0,1,0.0,1,0,1,0,0,0
88858,2,188,1,1,1,0,-1.0,0,2,,0,0,0,0,0,0
1,3,209,1,0,1,0,0.0,0,2,1.0,0,0,0,0,0,0
2,4,257,1,0,1,0,0.0,0,2,1.0,0,0,0,0,0,0
3,5,257,1,1,1,0,0.0,0,2,1.0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148165,155941,3702,1,0,2,10,-1.0,48,1,,0,0,0,0,0,0
88856,155942,3702,1,0,2,10,25.0,18,3,0.0,0,0,0,0,0,0
148166,155943,3702,1,0,2,10,28.0,17,2,,0,0,0,0,0,0
148167,155944,3895,1,0,2,5,52.0,7,1,,0,0,0,0,0,0


In [21]:
col_len=result.shape[0]
check_for_period_of_28days_over= abs(result['Estimated_Insects_Count'][i] - result['Estimated_Insects_Count'][i-1]) > 1
#if the above condition is satisfied that means that the previous group has to be named and a new group begins from current i
# so accordingly labels are given

In '#2' line the condition checks whether the soil type is same,if it is same then they both will be in the same group ie having same group number or else different group number which is written in the second else statement.

In [22]:

#Start loop
for i in range(2,col_len):
    if (check_for_period_of_28days_over):
        result['Group'][i] = result['Group'][i-1] + 1
        result['Group_First'][i] = 1
        result['Group_Last'][i-1] = 1
        result['Batch_Last'][i-1] = 1
        result['Group_Num'] = 1
    else:
        if (result['Number_Doses_Week'][i] >= result['Number_Doses_Week'][i-1]):
            if (result['Soil_Type'][i] == result['Soil_Type'][i-1]):  #2
                result['Group'][i] = result['Group'][i-1]   #3
                result['Group_Num'][i] = result['Group_Num'][i-1] + 1 #4
            else:
                result['Group'][i] = result['Group'][i-1] + 1 #5
                result['Group_Num'] = 1 #6
        else:
            result['Group'][i] = result['Group'][i-1] + 1 #7
            result['Group_Change'][i] = 1  #8
            result['Group_First'][i] = 1   #9
            result['Group_Last'][i-1] = 1  #10
            result['Group'][i] = result['Group'][i-1] + 1 #11
    if (i%1000==0):
        print(i,'th iteration completed')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the doc

1000 th iteration completed
2000 th iteration completed
3000 th iteration completed
4000 th iteration completed
5000 th iteration completed
6000 th iteration completed
7000 th iteration completed
8000 th iteration completed
9000 th iteration completed
10000 th iteration completed
11000 th iteration completed
12000 th iteration completed
13000 th iteration completed
14000 th iteration completed
15000 th iteration completed
16000 th iteration completed
17000 th iteration completed
18000 th iteration completed
19000 th iteration completed
20000 th iteration completed
21000 th iteration completed
22000 th iteration completed
23000 th iteration completed
24000 th iteration completed
25000 th iteration completed
26000 th iteration completed
27000 th iteration completed
28000 th iteration completed
29000 th iteration completed
30000 th iteration completed
31000 th iteration completed
32000 th iteration completed
33000 th iteration completed
34000 th iteration completed
35000 th iteration comp

In [23]:
for i in range(1,(result.shape[0]-1)):
    if(result['Group_Last'][i] == 1 and result['Soil_Type'][i] != result['Soil_Type'][i+1]):
        result['Soil_Change'][i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [24]:
#the above loop is to give values to the soil change column which is including the condition that the group change occurs
# simultaneously alongwith soil type change,if group change doesn't occur,then we dont give a fuck about the soil change
#this is completely my understanding of what feature engineering has been done.
result

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage,Group,Group_Change,Group_First,Group_Last,Batch_Last,Soil_Change,Group_Num
0,1,188,1,0,1,0,0.0,0,1,0.0,1,0,1,0,0,0,1
88858,2,188,1,1,1,0,-1.0,0,2,,17941,1,1,1,1,0,1
1,3,209,1,0,1,0,0.0,0,2,1.0,0,0,0,1,1,0,1
2,4,257,1,0,1,0,0.0,0,2,1.0,0,0,1,0,0,0,1
3,5,257,1,1,1,0,0.0,0,2,1.0,1,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148165,155941,3702,1,0,2,10,-1.0,48,1,,32398,1,1,0,0,0,1
88856,155942,3702,1,0,2,10,25.0,18,3,0.0,17940,1,1,1,1,0,1
148166,155943,3702,1,0,2,10,28.0,17,2,,32398,0,0,1,1,0,2
148167,155944,3895,1,0,2,5,52.0,7,1,,32399,1,1,0,0,0,1


In [25]:
#getting our test and train sets back
testRedifined = result.loc[result['Crop_Damage'].isnull()]
trainRedifined = result.dropna()
testRedifined = testRedifined.drop(columns=['ID','Crop_Damage']).reset_index(drop=True)
trainRedifined = trainRedifined.drop(columns=['ID']).reset_index(drop=True)

In [26]:
trainRedifined

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage,Group,Group_Change,Group_First,Group_Last,Batch_Last,Soil_Change,Group_Num
0,188,1,0,1,0,0.0,0,1,0.0,1,0,1,0,0,0,1
1,209,1,0,1,0,0.0,0,2,1.0,0,0,0,1,1,0,1
2,257,1,0,1,0,0.0,0,2,1.0,0,0,1,0,0,0,1
3,257,1,1,1,0,0.0,0,2,1.0,1,0,0,1,1,1,1
4,342,1,0,1,0,0.0,0,2,1.0,2,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88853,3337,1,0,2,10,12.0,44,3,0.0,17939,1,1,1,1,0,1
88854,3516,1,0,2,10,20.0,38,1,0.0,17939,0,1,0,0,0,1
88855,3516,1,0,2,15,40.0,8,2,0.0,17939,0,0,1,1,0,1
88856,3702,1,0,2,10,25.0,18,3,0.0,17940,1,1,1,1,0,1


In [27]:
from pycaret.classification import * 

In [29]:
exp = setup(data = trainRedifined, target = 'Crop_Damage', session_id=1,
                  normalize = True,
                categorical_features = ['Crop_Type','Soil_Type','Pesticide_Use_Category','Season'],
                numeric_features=['Estimated_Insects_Count','Number_Doses_Week','Number_Weeks_Quit','Number_Weeks_Used'],
                categorical_imputation='mode',
                numeric_imputation='mean',
            remove_outliers=True,
            outliers_threshold=0.1,
            normalize_method='robust',
            feature_selection=True,
            feature_selection_threshold=0.9,
            remove_multicollinearity=True,
            
                )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,1
1,Target Type,Multiclass
2,Label Encoded,
3,Original Data,"(88858, 16)"
4,Missing Values,False
5,Numeric Features,5
6,Categorical Features,10
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [None]:
compare_models()

Catboost was taking a long time so left it out because our purpose was satisfied with lightgbm.

#### See the power of Pycaret woh ladke ne Data scaling ke baad sab kuch jo kiya hain na woh main 5 line main kiya and with everything K fold cv and also robust scaling and with multiple comparisons and with lightgbm and xgboost.

In [31]:
lgbm=create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9146,0.0,0.4876,0.9004,0.9046,0.5057,0.5182
1,0.9234,0.0,0.5223,0.9125,0.913,0.5397,0.5616
2,0.9175,0.0,0.4966,0.9044,0.9067,0.5106,0.5281
3,0.9173,0.0,0.5129,0.9052,0.9091,0.5296,0.5402
4,0.9216,0.0,0.523,0.9124,0.9123,0.5418,0.5574
5,0.9255,0.0,0.5161,0.9138,0.9173,0.5756,0.5872
6,0.9194,0.0,0.4912,0.9064,0.9079,0.5158,0.537
7,0.9171,0.0,0.4952,0.9032,0.9064,0.5084,0.5264
8,0.9214,0.0,0.5013,0.9088,0.9106,0.5298,0.5505
9,0.9189,0.0,0.5107,0.9064,0.9099,0.5312,0.5446


In [32]:
tuned_lgbm=tune_model(lgbm,optimize='Accuracy')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9148,0.0,0.4767,0.8972,0.9036,0.4993,0.5144
1,0.923,0.0,0.5097,0.9119,0.9123,0.5373,0.5591
2,0.9171,0.0,0.4776,0.9003,0.9058,0.5083,0.5257
3,0.9184,0.0,0.5038,0.9058,0.9095,0.5337,0.5448
4,0.9214,0.0,0.5045,0.9146,0.9116,0.5413,0.5565
5,0.9275,0.0,0.5273,0.918,0.9195,0.5873,0.5988
6,0.9187,0.0,0.4851,0.9043,0.9071,0.5123,0.5329
7,0.92,0.0,0.5126,0.91,0.9102,0.5312,0.5471
8,0.9219,0.0,0.4913,0.9082,0.9108,0.5339,0.5541
9,0.9196,0.0,0.5077,0.9072,0.9103,0.535,0.5485


# Now we are going to ensemble lightgbm with xgboost.

In [33]:
bagged_lightgbm = ensemble_model(tuned_lgbm)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9143,0.0,0.481,0.8973,0.9039,0.5035,0.5159
1,0.923,0.0,0.4973,0.913,0.9118,0.5363,0.5584
2,0.9173,0.0,0.4826,0.9021,0.9059,0.507,0.5255
3,0.9202,0.0,0.499,0.9071,0.9108,0.5404,0.553
4,0.9219,0.0,0.5022,0.915,0.9116,0.5395,0.557
5,0.9266,0.0,0.517,0.916,0.9179,0.5778,0.5911
6,0.9159,0.0,0.4785,0.9007,0.9037,0.4921,0.5134
7,0.9202,0.0,0.5038,0.9093,0.9098,0.5284,0.5459
8,0.9216,0.0,0.4844,0.9051,0.9099,0.5292,0.5505
9,0.9203,0.0,0.5041,0.9084,0.9109,0.5397,0.553


In [34]:
boosted_svm = ensemble_model(tuned_lgbm,method='Boosting')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8864,0.0,0.3333,0.7857,0.833,0.0,0.0
1,0.8864,0.0,0.3333,0.7857,0.833,0.0,0.0
2,0.8864,0.0,0.3333,0.7857,0.833,0.0,0.0
3,0.8862,0.0,0.3333,0.7854,0.8327,0.0,0.0
4,0.8862,0.0,0.3333,0.7854,0.8327,0.0,0.0
5,0.8862,0.0,0.3333,0.7854,0.8327,0.0,0.0
6,0.8862,0.0,0.3333,0.7854,0.8327,0.0,0.0
7,0.8862,0.0,0.3333,0.7854,0.8327,0.0,0.0
8,0.8862,0.0,0.3333,0.7854,0.8327,0.0,0.0
9,0.8862,0.0,0.3333,0.7854,0.8327,0.0,0.0


In [36]:
unseen_predictions = predict_model(tuned_lgbm, data=testRedifined)
unseen_predictions.head()

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Group,Group_Change,Group_First,Group_Last,Batch_Last,Soil_Change,Group_Num,Label
0,188,1,1,1,0,-1.0,0,2,17941,1,1,1,1,0,1,0
1,410,1,1,1,0,0.0,0,2,17941,0,1,1,1,1,1,0
2,626,1,0,1,0,0.0,0,2,17942,0,1,1,1,0,1,0
3,731,1,0,1,0,0.0,0,2,17942,0,1,1,1,0,1,0
4,789,0,0,1,0,0.0,0,1,17942,0,1,1,1,1,1,1


In [37]:
#label is your predicted value
unseen_predictions.to_csv('90%submission_file.csv')