In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings("ignore") 

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [4]:
df_train.drop('id', axis = 1, inplace = True)
df_test.drop('id', axis = 1, inplace = True)

# SGDRegressor + StandardScaler + Pipeline

In [5]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [6]:
X_train = df_train.drop('yield', axis = 1)
X_test = df_test.copy()
y_train = df_train['yield']

In [7]:
scaler = StandardScaler()

In [8]:
model = SGDRegressor()

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [10]:
from sklearn.pipeline import Pipeline

In [11]:
pipe = Pipeline([('scaler', scaler), ('model', model)])

In [12]:
params = {
    'model__penalty': ['l2', 'l1', 'elasticnet', None],
    'model__alpha': [0.001, 0.01, 0.1,],
    'model__max_iter': [10, 100, 1000],
    'model__eta0': [0.001, 0.01, 0.1,]
}

In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
final_model = GridSearchCV(pipe, param_grid=params, cv=3, n_jobs=-1)

In [None]:
final_model.fit(X_train, y_train)

In [None]:
final_model.best_params_

In [None]:
final_model.best_score_

In [None]:
y_pred = final_model.predict(X_val)

In [None]:
mean_absolute_error(y_val,y_pred)

In [None]:
np.sqrt(mean_squared_error(y_pred, y_val))

In [None]:
finaly_y = final_model.predict(X_test)
df_sub = pd.read_csv('sample_submission.csv')
df_sub['yield'] = finaly_y
df_sub.to_csv('sub.csv',  index = False, index_label=False)

In [None]:
df_sub

# RandomForestRegressor

In [14]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [6]:
X_train = df_train.drop('yield', axis = 1)
X_test = df_test.copy()
y_train = df_train['yield']

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [8]:
model = RandomForestRegressor()

In [9]:
params = {
    'n_estimators': [100, 150],
    'criterion': ["squared_error","absolute_error"],
    'max_depth': [2, 4, None]
}

In [10]:
final_model = GridSearchCV(model, param_grid=params, cv=3, verbose=2)

In [11]:
final_model.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END criterion=squared_error, max_depth=2, n_estimators=100; total time=   1.4s
[CV] END criterion=squared_error, max_depth=2, n_estimators=100; total time=   1.1s
[CV] END criterion=squared_error, max_depth=2, n_estimators=100; total time=   1.2s
[CV] END criterion=squared_error, max_depth=2, n_estimators=150; total time=   1.8s
[CV] END criterion=squared_error, max_depth=2, n_estimators=150; total time=   1.8s
[CV] END criterion=squared_error, max_depth=2, n_estimators=150; total time=   1.8s
[CV] END criterion=squared_error, max_depth=4, n_estimators=100; total time=   2.2s
[CV] END criterion=squared_error, max_depth=4, n_estimators=100; total time=   2.1s
[CV] END criterion=squared_error, max_depth=4, n_estimators=100; total time=   2.1s
[CV] END criterion=squared_error, max_depth=4, n_estimators=150; total time=   3.1s
[CV] END criterion=squared_error, max_depth=4, n_estimators=150; total time=   3.1s
[CV] END criter

0,1,2
,estimator,RandomForestRegressor()
,param_grid,"{'criterion': ['squared_error', 'absolute_error'], 'max_depth': [2, 4, ...], 'n_estimators': [100, 150]}"
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,150
,criterion,'absolute_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
y_pred = final_model.predict(X_val)

In [15]:
mean_absolute_error(y_val,y_pred)

257.7831711332445

In [16]:
np.sqrt(mean_squared_error(y_pred, y_val))

np.float64(374.7373801082046)

In [18]:
finaly_y = final_model.predict(X_test)
df_sub = pd.read_csv('sample_submission.csv')
df_sub['yield'] = finaly_y
df_sub.to_csv('sub1.csv',  index = False, index_label=False)

# XGBRegressor + GridSearchCV

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [6]:
X_train = df_train.drop('yield', axis = 1)
X_test = df_test.copy()
y_train = df_train['yield']

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [9]:
model = XGBRegressor()

In [10]:
params = {
    'n_estimators': [50, 100, 150, 200],
}

In [11]:
final_model = GridSearchCV(model, param_grid=params, cv=3)

In [12]:
final_model.fit(X_train, y_train)

0,1,2
,estimator,"XGBRegressor(...ree=None, ...)"
,param_grid,"{'n_estimators': [50, 100, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [13]:
y_pred = final_model.predict(X_val)

In [14]:
np.sqrt(mean_squared_error(y_pred, y_val))

np.float64(373.27019201715285)

In [15]:
mean_absolute_error(y_val,y_pred)

257.4069602624479

In [16]:
finaly_y = final_model.predict(X_test)
df_sub = pd.read_csv('sample_submission.csv')
df_sub['yield'] = finaly_y
df_sub.to_csv('sub2.csv',  index = False, index_label=False)