# DATA MODELING

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

In [3]:
url = "https://raw.githubusercontent.com/SIDDHARTHA2301/DATA/master/d1.csv"
df = pd.read_csv(url)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Group,Contour,Depth,Gp,Block,pH,N,Dens,P,Ca,Mg,K,Na,Conduc
0,1,1,Top,0-10,T0,1,5.4,0.188,0.92,215,16.35,7.65,0.72,1.14,1.09
1,2,1,Top,0-10,T0,2,5.65,0.165,1.04,208,12.25,5.15,0.71,0.94,1.35
2,3,1,Top,0-10,T0,3,5.14,0.26,0.95,300,13.02,5.68,0.68,0.6,1.41
3,4,1,Top,0-10,T0,4,5.14,0.169,1.1,248,11.92,7.88,1.09,1.01,1.64
4,5,2,Top,10-30,T1,1,5.14,0.164,1.12,174,14.17,8.12,0.7,2.17,1.85


In [5]:
df.set_index('Unnamed: 0',inplace = True)

In [6]:
data = df[['P','Ca','Mg','K','Na','pH']]

In [7]:
data.head()

Unnamed: 0_level_0,P,Ca,Mg,K,Na,pH
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,215,16.35,7.65,0.72,1.14,5.4
2,208,12.25,5.15,0.71,0.94,5.65
3,300,13.02,5.68,0.68,0.6,5.14
4,248,11.92,7.88,1.09,1.01,5.14
5,174,14.17,8.12,0.7,2.17,5.14


In [8]:
data.corr()['pH']

P     0.591030
Ca    0.808629
Mg   -0.395782
K     0.579573
Na   -0.693261
pH    1.000000
Name: pH, dtype: float64

In [9]:
msk = np.random.rand(len(df)) < 0.8
train = data[msk]
test = data[~msk]

In [10]:
train.shape

(40, 6)

# MULTIPLE LINEAR REGRESSION 

In [14]:
from sklearn import linear_model
regr = linear_model.LinearRegression()
x = np.asanyarray(train[['P','Ca','Mg','K','Na']])
y = np.asanyarray(train[['pH']])
regr.fit (x, y)
# The coefficients
print ('Coefficients: ', regr.coef_)

Coefficients:  [[ 0.00022803  0.1560726  -0.02185034 -0.13791058 -0.0164205 ]]


In [15]:
y_hat= regr.predict(test[['P','Ca','Mg','K','Na']])
x_test = np.asanyarray(test[['P','Ca','Mg','K','Na']])
y_test = np.asanyarray(test[['pH']])
print("Residual sum of squares: %.2f"
      % np.mean((y_hat - y_test) ** 2))

# Explained variance score: 1 is perfect prediction
print('Variance score of Testing Set: %.2f' % regr.score(x_test, y_test))


Residual sum of squares: 0.09
Variance score of Testing Set: 0.69


# POLYNOMIAL REGRESSION WITH DEGREE - 2

In [16]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
train_x_poly = poly.fit_transform(x)

clf = linear_model.LinearRegression()
train_y_ = clf.fit(train_x_poly, y)
# The coefficients
print ('Coefficients: ', clf.coef_)
print ('Intercept: ',clf.intercept_)

Coefficients:  [[ 0.00000000e+00  1.26626283e-02 -8.13113702e-01 -7.14064848e-01
   1.29306746e+01 -3.38981774e-01 -2.77865519e-05  4.05475754e-04
  -8.26424988e-04  6.08936745e-03  2.98327315e-04  1.45761742e-02
   5.69318579e-02 -2.41050788e-01  6.07689953e-02  4.99627993e-02
  -8.31363618e-01 -9.74284930e-03 -2.89768793e+00 -5.34595508e-01
   1.32428799e-02]]
Intercept:  [7.01060802]


In [17]:
from sklearn.metrics import r2_score

test_x_poly = poly.fit_transform(x_test)
test_y_ = clf.predict(test_x_poly)

print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_ - y_test)))
print("Residual sum of squares (MSE): %.2f" % np.mean((test_y_ - y_test) ** 2))
print("R2-score: %.2f" % r2_score(test_y_ , y_test) )

Mean absolute error: 0.23
Residual sum of squares (MSE): 0.10
R2-score: 0.71


In [107]:
train_y_ = clf.predict(train_x_poly)

print("Mean absolute error: %.2f" % np.mean(np.absolute(train_y_ - y)))
print("Residual sum of squares (MSE): %.2f" % np.mean((train_y_ - y) ** 2))
print("R2-score: %.2f" % r2_score(test_y_ , y) )

Mean absolute error: 0.08
Residual sum of squares (MSE): 0.01
R2-score: 0.96


# POLYNOMIAL REGRESSION WITH DEGREE - 3

In [20]:
from sklearn.preprocessing import PolynomialFeatures

poly3 = PolynomialFeatures(degree=3)
train_x_poly3 = poly3.fit_transform(x)

clf3 = linear_model.LinearRegression()
train_y_3 = clf3.fit(train_x_poly3, y)

train_y_3 = clf3.predict(train_x_poly3)


#train_y_3 = clf.predict(train_x_poly)

print("Mean absolute error: %.2f" % np.mean(np.absolute(train_y_3 - y)))
print("Residual sum of squares (MSE): %.2f" % np.mean((train_y_3 - y) ** 2))
print("R2-score: %.2f" % r2_score(train_y_3 , y) )

Mean absolute error: 0.00
Residual sum of squares (MSE): 0.00
R2-score: 1.00


In [21]:
from sklearn.metrics import r2_score

test_x_poly3 = poly3.fit_transform(x_test)
test_y_3 = clf3.predict(test_x_poly3)

print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_3 - y_test)))
print("Residual sum of squares (MSE): %.2f" % np.mean((test_y_3 - y_test) ** 2))
print("R2-score: %.2f" % r2_score(test_y_3 , y_test) )

Mean absolute error: 1.26
Residual sum of squares (MSE): 3.72
R2-score: -0.46


# CONCLUSION 

# 1. Multiple linear regression - 70
# 2. polynomial regression (2) - 96
# 3. polynomial regression (3) -  overfitting

###  Polynomial regression with degree is best fit