In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

In [2]:
df_train = pd.read_csv("../../Data/train.csv")
df_train.head()

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage
0,F00000001,188,1,0,1,0,0.0,0,1,0
1,F00000003,209,1,0,1,0,0.0,0,2,1
2,F00000004,257,1,0,1,0,0.0,0,2,1
3,F00000005,257,1,1,1,0,0.0,0,2,1
4,F00000006,342,1,0,1,0,0.0,0,2,1


In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88858 entries, 0 to 88857
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       88858 non-null  object 
 1   Estimated_Insects_Count  88858 non-null  int64  
 2   Crop_Type                88858 non-null  int64  
 3   Soil_Type                88858 non-null  int64  
 4   Pesticide_Use_Category   88858 non-null  int64  
 5   Number_Doses_Week        88858 non-null  int64  
 6   Number_Weeks_Used        79858 non-null  float64
 7   Number_Weeks_Quit        88858 non-null  int64  
 8   Season                   88858 non-null  int64  
 9   Crop_Damage              88858 non-null  int64  
dtypes: float64(1), int64(8), object(1)
memory usage: 6.8+ MB


## Checking Null Values

In [4]:
df_train.isnull().sum()

ID                            0
Estimated_Insects_Count       0
Crop_Type                     0
Soil_Type                     0
Pesticide_Use_Category        0
Number_Doses_Week             0
Number_Weeks_Used          9000
Number_Weeks_Quit             0
Season                        0
Crop_Damage                   0
dtype: int64

Here we found column Number_Weeks_Used with 9000 null values thus we removed it.



In [5]:
df_train.dropna(inplace=True)

In [6]:
df_train.isnull().sum()

ID                         0
Estimated_Insects_Count    0
Crop_Type                  0
Soil_Type                  0
Pesticide_Use_Category     0
Number_Doses_Week          0
Number_Weeks_Used          0
Number_Weeks_Quit          0
Season                     0
Crop_Damage                0
dtype: int64

Here ID column cntains non numerical value for each column and is unique to each row, thus it is also removed

In [7]:
df_train['ID'].value_counts()

ID
F00000001    1
F00103998    1
F00104008    1
F00104007    1
F00104006    1
            ..
F00051982    1
F00051978    1
F00051976    1
F00051974    1
F00155945    1
Name: count, Length: 79858, dtype: int64

In [8]:
X = df_train.loc[:, [column for column in df_train.columns if (column != 'ID') & (column != 'Crop_Damage')]]

In [12]:
df_train.drop(columns='ID', inplace=True)

In [13]:
df_train.corr()['Crop_Damage']

Estimated_Insects_Count    0.203796
Crop_Type                 -0.017097
Soil_Type                 -0.024569
Pesticide_Use_Category     0.176552
Number_Doses_Week         -0.030967
Number_Weeks_Used          0.232192
Number_Weeks_Quit         -0.133116
Season                     0.000634
Crop_Damage                1.000000
Name: Crop_Damage, dtype: float64

- Estimated_Insects_Count has a positive correlation (0.203796) with Crop_Damage. This suggests that as the when number of insects affecting crops increases the ratio of crop damage also increases.

- Crop_Type and Soil_Type have negative correlations (-0.017097 and -0.024569 respectively) with 'Crop_Damage'. This indicates a very weak negative relationship, suggesting that the type of crop and soil might not have a significant impact on crop damage.

- Pesticide_Use_Category has a positive correlation (0.176552) with Crop_Damage, implying that higher pesticide usage categories are associated with higher crop damage.

- Number_Doses_Week has a negative correlation (-0.030967) with Crop_Damage, suggesting that an increase in the number of pesticide doses per week might be weakly associated with lower crop damage.

- Number_Weeks_Used has a positive correlation (0.232192) with Crop_Damage, indicating that as the number of weeks a pesticide is used increases, so does the crop damage.

- Number_Weeks_Quit has a negative correlation (-0.133116) with Crop_Damage, suggesting that a longer period since quitting pesticide usage might be associated with lower crop damage.

- Season has a very close to zero correlation (0.000634) with Crop_Damage, indicating that the season might not have a significant impact on crop damage.

## Preparing Test Data

In [3]:
df_test = pd.read_csv("../../Data/test.csv")
df_test

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
0,F00000002,188,1,1,1,0,,0,2
1,F00000007,410,1,1,1,0,0.0,0,2
2,F00000011,626,1,0,1,0,0.0,0,2
3,F00000013,731,1,0,1,0,0.0,0,2
4,F00000014,789,0,0,1,0,0.0,0,1
...,...,...,...,...,...,...,...,...,...
59305,F00155937,3337,1,0,2,20,34.0,12,1
59306,F00155940,3516,1,0,2,20,32.0,10,2
59307,F00155941,3702,1,0,2,10,,48,1
59308,F00155943,3702,1,0,2,10,28.0,17,2


In [17]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59310 entries, 0 to 59309
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       59310 non-null  object 
 1   Estimated_Insects_Count  59310 non-null  int64  
 2   Crop_Type                59310 non-null  int64  
 3   Soil_Type                59310 non-null  int64  
 4   Pesticide_Use_Category   59310 non-null  int64  
 5   Number_Doses_Week        59310 non-null  int64  
 6   Number_Weeks_Used        53417 non-null  float64
 7   Number_Weeks_Quit        59310 non-null  int64  
 8   Season                   59310 non-null  int64  
dtypes: float64(1), int64(7), object(1)
memory usage: 4.1+ MB


In [18]:
df_test.isnull().sum()

ID                            0
Estimated_Insects_Count       0
Crop_Type                     0
Soil_Type                     0
Pesticide_Use_Category        0
Number_Doses_Week             0
Number_Weeks_Used          5893
Number_Weeks_Quit             0
Season                        0
dtype: int64

In [19]:
df_test.dropna(inplace=True)

In [20]:
df_test.isnull().sum()

ID                         0
Estimated_Insects_Count    0
Crop_Type                  0
Soil_Type                  0
Pesticide_Use_Category     0
Number_Doses_Week          0
Number_Weeks_Used          0
Number_Weeks_Quit          0
Season                     0
dtype: int64

In [24]:
df_test.drop(columns='ID', inplace=True)

In [25]:
X.shape, df_test.shape

((79858, 8), (53417, 8))

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, df_train['Crop_Damage'], test_size=0.2, random_state=42)

In [15]:
model = LinearRegression()
model.fit(X_train, y_train)

In [26]:
predictions = model.predict(df_test)

In [27]:
predictions

array([-0.10752878, -0.05311064, -0.03995621, ...,  0.38572551,
        0.36868846,  0.47906418])

## Hyperparameter Tuning using Ridge

In [34]:
parameters = {'alpha':[1,2,5,10,20,30,40,50,60.70,80,90]}
ridgeCV=GridSearchCV(Ridge(), parameters, scoring='neg_mean_squared_error', cv=5)
ridgeCV.fit(X, df_train['Crop_Damage'])

In [35]:
ridgeCV.best_params_

{'alpha': 90}

In [36]:
ridgeCV.best_score_

-0.18552184055520363

In [44]:
ridge_prediction=ridgeCV.predict(df_test)
ridge_prediction

array([-0.09896706, -0.04372596, -0.03044095, ...,  0.3893655 ,
        0.37196519,  0.48203287])

## Hyperparameter Tuning using Lasso

In [47]:
parameters = {'alpha':[1,2,5,10,20,30,40,50,60.70,80,90]}
LassoCV=GridSearchCV(Lasso(), parameters, scoring='neg_mean_squared_error', cv=5)
LassoCV.fit(X_train, y_train)

In [48]:
LassoCV.best_params_

{'alpha': 1}

In [49]:
LassoCV.best_score_

-0.1954296779193763

In [50]:
lasso_prediction= LassoCV.predict(df_test)
lasso_prediction

array([0.08270088, 0.10591971, 0.11720664, ..., 0.41657905, 0.43657304,
       0.4573195 ])