# Linear model

In [2]:
!pip3 freeze > requirements.txt

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [48]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [49]:
X = df_train.drop('yield', axis=1)
y = df_train['yield']

In [50]:
df_train.drop('id', axis=1, inplace=True)
df_test.drop('id', axis=1, inplace=True)

In [51]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [52]:
model = LinearRegression()

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
model.fit(X_train, y_train)

In [55]:
y_pred = model.predict(X_test)

In [56]:
np.sqrt(mean_squared_error(y_pred, y_test))

np.float64(392.4608278638083)

# Linear model + Scaler

In [57]:
df = pd.read_csv('train.csv')

In [58]:
X = df.drop('yield', axis=1)
y = df['yield']

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression

In [60]:
model = LinearRegression()

In [61]:
scale = StandardScaler()

In [62]:
X = scale.fit_transform(X)

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
model.fit(X_train, y_train)

In [65]:
y_pred = model.predict(X_test)

In [66]:
np.sqrt(mean_squared_error(y_pred, y_test))

np.float64(392.46082786380884)

# SGDRegression

In [90]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error

In [91]:
df = pd.read_csv('train.csv')

In [100]:
X = df.drop('yield', axis=1)
y = df['yield']

In [101]:
model = SGDRegressor()

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [103]:
model.fit(X_train, y_train)

In [104]:
y_pred = model.predict(X_test)

In [105]:
np.sqrt(mean_squared_error(y_pred, y_test))

np.float64(1.442708484805232e+16)

# SGDRegression + Scaler

In [106]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [107]:
df = pd.read_csv('train.csv')

In [108]:
X = df.drop('yield', axis=1)
y = df['yield']

In [109]:
model = SGDRegressor()

In [111]:
scale = StandardScaler()

In [112]:
X = scale.fit_transform(X)

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [115]:
model.fit(X_train, y_train)

In [116]:
y_pred = model.predict(X_test)

In [117]:
np.sqrt(mean_squared_error(y_pred, y_test))

np.float64(399.51584736449286)

# SGDRegression + Scaler + Pipeline

In [262]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [263]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [264]:
X = df_train.drop('yield', axis=1)
y = df_train['yield']

In [265]:
scale = StandardScaler()

In [266]:
pipe = Pipeline([('scale', scale), ('model', model)])

In [267]:
pipe

In [268]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [269]:
pipe.fit(X_train, y_train)

In [270]:
y_pred = pipe.predict(df_test)

In [271]:
# Load the submission template; its `yield` column is overwritten with the
# model's predictions further down.
df_sub = pd.read_csv('sample_submission.csv')

In [272]:
# Preview the template (the rows shown all carry the same placeholder yield).
df_sub

Unnamed: 0,id,yield
0,15000,6010.71
1,15001,6010.71
2,15002,6010.71
3,15003,6010.71
4,15004,6010.71
...,...,...
9995,24995,6010.71
9996,24996,6010.71
9997,24997,6010.71
9998,24998,6010.71


In [273]:
df_sub['yield'] = y_pred

In [274]:
df_sub.to_csv('sub.csv', index=False, index_label=False)

In [185]:
# df = pd.read_csv('train.csv')
# df.drop('id', axis=1, inplace=True)

In [186]:
X = df.drop('yield', axis=1)
y = df['yield']

In [187]:
model = SGDRegressor()

In [188]:
scale = StandardScaler()

In [189]:
pipe = Pipeline([('scale', scale), ('model', model)])

In [190]:
pipe

In [191]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [192]:
pipe.fit(X_train, y_train)

In [193]:
y_pred = pipe.predict(X_test)

In [194]:
np.sqrt(mean_squared_error(y_pred, y_test))

np.float64(388.7910352139022)

# GridSearchCV + Scaler

In [146]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [147]:
df = pd.read_csv('train.csv')

In [148]:
X = df.drop('yield', axis=1)
y = df['yield']

In [149]:
model = SGDRegressor()

In [150]:
scale = StandardScaler()

In [151]:
X = scale.fit_transform(X)

In [152]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [153]:
params = {
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'alpha': [0.001, 0.01, 0.1, 1],
    'eta0': [0.001, 0.01, 0.1, 1],
    'max_iter': [100, 500, 1000],
    'early_stopping': [True, False]
}

In [154]:
final_model = GridSearchCV(model, param_grid=params, verbose=2, cv=3)

In [155]:
final_model.fit(X_train, y_train)

Fitting 3 folds for each of 384 candidates, totalling 1152 fits
[CV] END alpha=0.001, early_stopping=True, eta0=0.001, max_iter=100, penalty=l1; total time=   0.0s
[CV] END alpha=0.001, early_stopping=True, eta0=0.001, max_iter=100, penalty=l1; total time=   0.0s
[CV] END alpha=0.001, early_stopping=True, eta0=0.001, max_iter=100, penalty=l1; total time=   0.0s
[CV] END alpha=0.001, early_stopping=True, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=0.001, early_stopping=True, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=0.001, early_stopping=True, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=0.001, early_stopping=True, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s
[CV] END alpha=0.001, early_stopping=True, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s
[CV] END alpha=0.001, early_stopping=True, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s
[CV] END alp



[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s




[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=100, penalty=None; total time=   0.0s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=100, penalty=None; total time=   0.0s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=100, penalty=None; total time=   0.0s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=500, penalty=l1; total time=   0.1s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=500, penalty=l1; total time=   0.1s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=500, penalty=l1; total time=   0.1s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=500, penalty=l2; total time=   0.1s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=500, penalty=l2; total time=   0.1s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=500, penalty=l2; total time=   0.1s
[CV] END alpha=0.001, early_stopping=False, eta0=0.001, max_iter=500, penalty=elastic



[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=100, penalty=l1; total time=   0.0s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=100, penalty=l1; total time=   0.0s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s




[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=100, penalty=None; total time=   0.0s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=100, penalty=None; total time=   0.0s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=100, penalty=None; total time=   0.0s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=500, penalty=l1; total time=   0.1s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=500, penalty=l1; total time=   0.1s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=500, penalty=l1; total time=   0.1s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=500, penalty=l2; total time=   0.1s
[CV] END alpha=0.01, early_stopping=False, eta0=0.001, max_iter=500, penalty=l



[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=100, penalty=l1; total time=   0.1s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s




[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=100, penalty=None; total time=   0.0s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=100, penalty=None; total time=   0.0s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=100, penalty=None; total time=   0.0s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=500, penalty=l1; total time=   0.1s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=500, penalty=l1; total time=   0.1s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=500, penalty=l1; total time=   0.1s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=500, penalty=l2; total time=   0.0s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=500, penalty=l2; total time=   0.0s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=500, penalty=l2; total time=   0.0s
[CV] END alpha=0.1, early_stopping=False, eta0=0.001, max_iter=500, penalty=elasticnet; total time=   0



[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=100, penalty=l1; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=100, penalty=l1; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=100, penalty=l2; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=100, penalty=elasticnet; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=100, penalty=None; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=100, penalty=None; total time=   0.0s
[C



[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=500, penalty=l1; total time=   0.1s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=500, penalty=l1; total time=   0.1s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=500, penalty=l1; total time=   0.1s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=500, penalty=l2; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=500, penalty=l2; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=500, penalty=l2; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=500, penalty=elasticnet; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=500, penalty=elasticnet; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=500, penalty=elasticnet; total time=   0.0s
[CV] END alpha=1, early_stopping=False, eta0=0.001, max_iter=500, penalty=None; total time=   0.1s
[CV]

In [156]:
# Parameter combination that achieved the best mean cross-validated score.
final_model.best_params_

{'alpha': 0.01,
 'early_stopping': False,
 'eta0': 0.001,
 'max_iter': 1000,
 'penalty': 'l1'}

In [157]:
y_pred = final_model.predict(X_test)

In [158]:
np.sqrt(mean_squared_error(y_pred, y_test))

np.float64(388.1244850028929)

# GridSearchCV + Pipeline

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [22]:
df = pd.read_csv('train.csv')

In [23]:
X = df.drop('yield', axis=1)
y = df['yield']

In [24]:
model = SGDRegressor()

In [25]:
scale = StandardScaler()

In [26]:
pipe = Pipeline([('scale', scale), ('model', model)])

In [27]:
pipe

In [28]:
params = [
    {
        'model__penalty': ['l1', 'l2', None],
        'model__eta0': [0.001, 0.01, 0.1, 1],
        'model__alpha': [0.001, 0.01, 0.1, 1],
        'model__max_iter': [100, 500, 1000]
    },
    {
        'model__penalty': ['elasticnet'],
        'model__eta0': [0.001, 0.01, 0.1, 1],
        'model__alpha': [0.001, 0.01, 0.1, 1],
        'model__max_iter': [100, 500, 1000],
        'model__l1_ratio': [0.15, 0.5, 0.85]
    }
]

In [29]:
final_model = GridSearchCV(pipe, param_grid=params, verbose=2, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [31]:
final_model.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits




In [32]:
y_pred = final_model.predict(X_test)

In [33]:
np.sqrt(mean_squared_error(y_pred, y_test))

np.float64(396.5902168156471)

In [35]:
# Mean of the training target. It matches the constant 6010.71 shown in the sample
# submission preview, so that file is presumably a predict-the-mean baseline;
# it also gives a sense of scale for the RMSE values above.
y.mean()

np.float64(6010.710065311333)