In [None]:
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Here I will be loading the data from the CSV files into dataframes

In [None]:
# Labeled training data (features + target) and the unlabeled data to predict on.
# The path strings are placeholders; point them at the actual CSV locations.
crops = pd.read_csv("...path to train.csv...")
crops_unknown = pd.read_csv("...path to test.csv...")

# Data pre-processing steps:
        1) Define the features and the labels and name them x and y.
        2) Fill the missing data. Here I am using backward fill, i.e. each missing value takes the next valid value in the same column.
        3) Then the features are square-root transformed to decrease their skewness.
        4) Then the data is scaled so that all the features are on the same scale.
        5) Then the data points are divided into the training set and the evaluation set.
        

In [None]:
# 1) Features are columns 1-8 and the label is column 9 (column 0 is left out).
x = crops.iloc[:, 1:9]
y = crops.iloc[:, 9]

# 2) Backward-fill gaps (each missing value takes the next valid value in its
#    column), then take the square root to reduce skew.
#    DataFrame.bfill() replaces fillna(method="bfill"), which recent pandas deprecates.
x = x.bfill(axis=0)
x = np.sqrt(x)

# 3) Hold out 20% of the rows for evaluation, stratified on the label so both
#    parts keep the same class proportions. The fixed seed makes the split
#    reproducible across kernel restarts.
x_train, x_test, y_train, y_test = split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

# 4) Fit the scaler on the training rows only and apply the same statistics to
#    the evaluation rows, so the evaluation data does not leak into the scaling.
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

# Managing the imbalanced nature of the data
Here I am not going to resample the data.

Instead of that, I am going to create sample weights, which will give particular weights to various data points of different classes.

In [None]:
def CreateBalancedSampleWeights(y_train, largest_class_weight_coef):
    """Build one weight per sample that counteracts class imbalance.

    Each class gets the weight ``n_samples / (n_classes * class_count)``, so
    rarer classes weigh more. The weight of the most frequent class is then
    multiplied by ``largest_class_weight_coef``.

    Classes and counts are taken from the labels actually present, so labels do
    not have to be contiguous integers starting at 0.

    Parameters
    ----------
    y_train : array-like of shape (n_samples,)
        Class label of every training sample (pandas Series, list or ndarray).
    largest_class_weight_coef : float
        Extra multiplier applied to the weight of the most frequent class
        (the first one in sorted label order if several classes tie).

    Returns
    -------
    list of float
        Weights in the same order as ``y_train``; empty if ``y_train`` is empty.
    """
    labels = np.asarray(y_train).ravel()
    if labels.size == 0:
        return []

    # `inverse` maps every sample to the position of its class in `classes`,
    # and `class_samples` is aligned with `classes` (unlike np.bincount, which
    # also emits zero-count slots for integer values that never occur).
    classes, inverse, class_samples = np.unique(
        labels, return_inverse=True, return_counts=True
    )
    weights = labels.size / (len(classes) * class_samples.astype(float))

    # Scale the dominant class, wherever it falls in sorted label order.
    weights[np.argmax(class_samples)] *= largest_class_weight_coef

    return weights[inverse].tolist()

# Training

In [None]:
# Per-row weights that counter the class imbalance in the training labels.
train_sample_weight = CreateBalancedSampleWeights(y_train, largest_class_weight_coef=0.2)

# NOTE(review): `gpu_id` is reported as deprecated in favour of `device` in
# XGBoost 2.x - confirm against the installed version before changing it.
xgb_clf = XGBClassifier(
    n_estimators=3000,
    max_depth=20,
    gpu_id=0,
    booster="gbtree",
    gamma=1,
)

# `eval_metric` is intentionally not passed to fit(): no eval_set is supplied,
# so the metric was never evaluated, and newer XGBoost releases no longer
# accept that argument in fit(). Accuracy is computed on the hold-out split
# after training instead.
xgb_clf.fit(x_train, y_train, sample_weight=train_sample_weight)


# Prediction and Evaluation

In [None]:
# Predict class labels for the held-out evaluation split.
y_pred = xgb_clf.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score as acc_score

# Share of evaluation rows whose predicted label equals the true label.
accuracy = acc_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 on the evaluation split, three decimals.
print(
    metrics.classification_report(y_true=y_test, y_pred=y_pred, digits=3)
)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes.
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion)

# Saving the model

In [None]:
import joblib

# Persist the trained classifier. The estimator in this notebook is `xgb_clf`;
# the previously referenced name `model` is never defined and raised NameError.
fileName = "XGB_with_sampleWeights.sav"
joblib.dump(xgb_clf, fileName)

# Predicting the results for the unlabeled data with the trained model

In [None]:
# Column 0 carries the row ID written to the submission; columns 1-8 are the
# features, taken from the same positions as in the training frame.
x_unknown = crops_unknown.iloc[:, 1:9]
crops_id = crops_unknown.iloc[:, 0]

# Same cleaning as for the training features: backward fill, then square root.
# DataFrame.bfill() replaces fillna(method="bfill"), which recent pandas deprecates.
x_unknown = x_unknown.bfill(axis=0)
x_unknown = np.sqrt(x_unknown)

# Reuse the scaler that was fitted during pre-processing. Fitting a new scaler
# here would standardize with different means/variances than the ones the
# model was trained on.
x_unknown = sc_x.transform(x_unknown)

y_unknown = xgb_clf.predict(x_unknown)

submission = pd.DataFrame({'ID': crops_id, 'Crop_Damage': y_unknown})
submission.to_csv('Submission.csv', index=False)