In [None]:
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Here I will be loading the data from the CSV files into DataFrames

In [None]:
# Placeholder locations of the two input files; replace them with real paths.
TRAIN_CSV_PATH = "... path to train.csv folder..."
TEST_CSV_PATH = "...path to test.csv folder..."

# `crops` supplies the features and the label used for training/evaluation,
# `crops_unknown` holds the rows that are predicted for the submission.
crops = pd.read_csv(TRAIN_CSV_PATH)
crops_unknown = pd.read_csv(TEST_CSV_PATH)

# Data pre-processing steps:
        1) Define the features and the labels and name them x and y.
        2) Fill the missing data. Here I am using the next valid data point after each gap (backward fill) to fill the missing data.
        3) Then the features are transformed (square root) to decrease the skewness of the features.
        4) Then the data is scaled so that all the features have the same importance.
        5) Then the data points are divided into the training set and evaluation set.
        

In [None]:
# Features are columns 1..8 of the training frame, the label is column 9.
x = crops.iloc[:, 1:9]
y = crops.iloc[:, 9]

# Fill missing values from the next valid row. `DataFrame.bfill()` replaces the
# deprecated `fillna(method="bfill")`. The follow-up `ffill()` only matters for
# gaps at the very end of a column, which a backward fill cannot reach and
# which would otherwise survive as NaN.
x = x.bfill(axis=0).ffill(axis=0)

# Square-root transform to reduce the skewness of the features.
# NOTE(review): assumes every feature is non-negative - confirm.
x = np.sqrt(x)

# Split BEFORE fitting the scaler so that the mean/variance are estimated from
# the training rows only; fitting on all rows leaks evaluation-set statistics
# into training. A fixed `random_state` makes the split reproducible.
x_train, x_test, y_train, y_test = split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise: fit on the training split, then apply the same transformation
# to the evaluation split. Both results are NumPy arrays.
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

# Managing the imbalanced nature of the data
        Here the training data is undersampled according to cluster centroids. 
        PROS:
            1) This makes the data more balanced.
            2) This makes the various clusters more distinct and hence the classifier has an easier job.
        CONS:
            1) This will remove too many data points.

In [None]:
from imblearn.under_sampling import ClusterCentroids

# Undersample the training split with cluster centroids.
# `fit_sample` was removed from imbalanced-learn; `fit_resample` is the
# supported name for the same operation. A fixed `random_state` makes the
# underlying clustering (and therefore the resampled set) reproducible.
cc = ClusterCentroids(random_state=42)

x_cc, y_cc = cc.fit_resample(x_train, y_train)

# Here the K-Nearest-Neighbor model will be defined and trained
        
GridSearchCV is going to be used for hyperparameter tuning

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values for the grid search; every key is a KNeighborsClassifier
# constructor argument and every value is the list of settings to try.
parameters = dict(
    n_neighbors=[5],
    leaf_size=[1, 3, 5, 7, 9, 10],
    algorithm=["auto", "kd_tree"],
    n_jobs=[-1],
)

In [None]:
# Base estimator with default settings; the grid search overrides the
# arguments listed in `parameters`.
knn = KNeighborsClassifier()
model = GridSearchCV(estimator=knn, param_grid=parameters)

# Train on the undersampled training data.
model.fit(x_cc, y_cc)

In [None]:
# Predict the labels of the evaluation split with the fitted grid search.
y_pred = model.predict(X=x_test)

# Evaluation of the model using various metrics

The accuracy will be lower, but the confusion matrix shows that the model was somewhat able to predict the minority class.

In [None]:
# Confusion matrix of true vs. predicted labels on the evaluation split.
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion)

In [None]:
# Score of the fitted grid search on the evaluation split.
evaluation_score = model.score(x_test, y_test)
print(evaluation_score)

In [None]:
from sklearn.metrics import classification_report

# Text report for the evaluation split, formatted with three decimal places.
report_text = classification_report(y_test, y_pred, digits=3)
print(report_text)

# Saving the model

In [None]:
import joblib

# Persist the fitted grid-search object so it can be reloaded later.
fileName = "KNN_with_cluster_undersampling.sav"
joblib.dump(value=model, filename=fileName)

# Loading the model and predicting the results

In [None]:
# Restore the model saved earlier. joblib files are pickle-based, so only
# load files that come from a trusted source.
loaded_model = joblib.load(filename="KNN_with_cluster_undersampling.sav")

In [None]:
# Column 0 is kept aside for the submission file; columns 1..8 are the
# model inputs (the same positions that are used for the training features).
crops_id = crops_unknown.iloc[:, 0]
x_unknown = crops_unknown.iloc[:, 1:9]

In [None]:
# Fill missing values from the next valid row. `DataFrame.bfill()` replaces the
# deprecated `fillna(method="bfill")`; the follow-up `ffill()` covers gaps at
# the very end of a column, which a backward fill cannot reach.
x_unknown = x_unknown.bfill(axis=0).ffill(axis=0)

# Square-root transform, as applied to the training features.
# NOTE(review): assumes every feature is non-negative - confirm.
x_unknown = np.sqrt(x_unknown)

# Reuse `sc_x`, the scaler fitted while preparing the training data, instead
# of fitting a new one here. A scaler fitted on the unknown rows would use a
# different mean/variance, so the model would receive inputs on a different
# scale from the one it was trained on.
x_unknown = sc_x.transform(x_unknown)

In [None]:
# Predict the label of every preprocessed unknown row with the reloaded model.
y_unknown = loaded_model.predict(X=x_unknown)

In [None]:
# Pair each ID with its predicted label and write the result without the
# DataFrame index.
submission_columns = {'ID': crops_id, 'Crop_Damage': y_unknown}
submission = pd.DataFrame(data=submission_columns)
submission.to_csv('Submission.csv', index=False)