## Loading the dataset and preparing it

In [70]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

First, we load the rice production data into a DataFrame.

In [71]:
# Read the rice crop table from CSV into a DataFrame and preview the first rows.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs
# where the file exists at exactly this location.
rice = pd.read_csv("/Users/macbook/Documents/BTP/Notebook/Rice.csv")
rice.head()

Unnamed: 0,State_Name,ind_district,Crop_Year,Season,Crop,Area,Production,Value
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0,3.147059
1,Andaman and Nicobar Islands,NICOBARS,2001,Kharif,Rice,83.0,300.0,3.614458
2,Andaman and Nicobar Islands,NICOBARS,2002,Kharif,Rice,189.2,510.84,2.7
3,Andaman and Nicobar Islands,NICOBARS,2003,Kharif,Rice,52.0,90.17,1.734038
4,Andaman and Nicobar Islands,NICOBARS,2004,Kharif,Rice,52.94,72.57,1.370797


In [72]:
# Restrict the rice table to rows whose State_Name is exactly "Haryana",
# then preview the first rows of the subset.
rice_haryana = rice.loc[rice["State_Name"].eq("Haryana")]
rice_haryana.head()

Unnamed: 0,State_Name,ind_district,Crop_Year,Season,Crop,Area,Production,Value
4210,Haryana,AMBALA,1997,Kharif,Rice,65000.0,182000.0,2.8
4211,Haryana,AMBALA,1998,Kharif,Rice,71365.0,186000.0,2.60632
4212,Haryana,AMBALA,1999,Kharif,Rice,72185.0,206000.0,2.853778
4213,Haryana,AMBALA,2000,Kharif,Rice,71840.0,217000.0,3.020601
4214,Haryana,AMBALA,2001,Kharif,Rice,74881.0,233000.0,3.111604


In [73]:
# Read the rainfall table from CSV into a DataFrame and preview the first rows.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs
# where the file exists at exactly this location.
rainfall = pd.read_csv("/Users/macbook/Documents/BTP/Notebook/rainfall.csv")
rainfall.head()

Unnamed: 0,State,District,Year,Value
0,Andhra Pradesh,Adilabad,1994,1199.447
1,Andhra Pradesh,Adilabad,1995,1255.561
2,Andhra Pradesh,Adilabad,1996,1081.171
3,Andhra Pradesh,Adilabad,1997,905.718
4,Andhra Pradesh,Adilabad,1998,1128.95


In [74]:
# Keep only the rainfall records whose State is "Haryana", then print a preview
# followed by summary statistics of the numeric columns.
rain_haryana = rainfall.loc[rainfall["State"].eq("Haryana")]
print(rain_haryana.head(), rain_haryana.describe(), sep="\n")

        State District  Year    Value
1179  Haryana   Ambala  1994  620.808
1180  Haryana   Ambala  1995  832.320
1181  Haryana   Ambala  1996  784.208
1182  Haryana   Ambala  1997  784.650
1183  Haryana   Ambala  1998  649.086
              Year       Value
count   171.000000  171.000000
mean   1998.000000  531.810170
std       2.589572  165.230096
min    1994.000000  166.299000
25%    1996.000000  420.334500
50%    1998.000000  531.747000
75%    2000.000000  646.928500
max    2002.000000  895.146000


In [75]:
# Read the pre-built Haryana modelling table (target Y plus predictors X1..X4,
# one row per district and crop year) and preview it.
# NOTE(review): judging from the displayed rows, X1/X2 look like Y lagged by one
# and two years, and X3/X4 like rainfall for the same and the previous year --
# confirm against however haryana.csv was generated.
# NOTE(review): the path is absolute and machine-specific.
X_hr = pd.read_csv("/Users/macbook/Documents/BTP/Notebook/haryana.csv")
X_hr.head()

Unnamed: 0,ind_district,Crop_Year,Y,X1,X2,X3,X4
0,AMBALA,1997,182000.0,,,784.65,784.208
1,AMBALA,1998,186000.0,182000.0,,649.086,784.65
2,AMBALA,1999,206000.0,186000.0,182000.0,396.134,649.086
3,AMBALA,2000,217000.0,206000.0,186000.0,593.737,396.134
4,AMBALA,2001,233000.0,217000.0,206000.0,469.118,593.737


In [76]:
# Keep only the rows in which the target and all four predictors are finite;
# a row with NaN (or +/-inf) in any of these columns is dropped.
X_finite = X_hr[np.isfinite(X_hr[["X1", "X2", "X3", "X4", "Y"]]).all(axis=1)]
X_finite.head()

Unnamed: 0,ind_district,Crop_Year,Y,X1,X2,X3,X4
2,AMBALA,1999,206000.0,186000.0,182000.0,396.134,649.086
3,AMBALA,2000,217000.0,206000.0,186000.0,593.737,396.134
4,AMBALA,2001,233000.0,217000.0,206000.0,469.118,593.737
5,AMBALA,2002,183000.0,233000.0,217000.0,476.752,469.118
8,AMBALA,2005,254000.0,233000.0,227000.0,1058.4,1202.9


In [77]:
# Xn is a second name bound to the same DataFrame object as X_finite (no copy
# is made); describe() then summarizes its numeric columns.
Xn = X_finite
Xn.describe()

Unnamed: 0,Crop_Year,Y,X1,X2,X3,X4
count,127.0,127.0,127.0,127.0,127.0,127.0
mean,2004.204724,182771.653543,181007.874016,174322.834646,489.894449,502.577819
std,3.733883,147264.161147,147414.085609,142965.856835,211.25555,190.321342
min,1999.0,3000.0,2000.0,2000.0,126.3,126.3
25%,2001.0,49000.0,52500.0,55500.0,342.916,362.2115
50%,2005.0,166000.0,164000.0,160000.0,458.6,486.38
75%,2007.0,251500.0,240500.0,227000.0,573.4915,605.3015
max,2010.0,610000.0,610000.0,610000.0,1433.9,1227.9


In [78]:
# Feature matrix: the four predictor columns. Target vector: column Y.
X = Xn.loc[:, ["X1", "X2", "X3", "X4"]]
y = Xn.loc[:, "Y"]
X.head()

Unnamed: 0,X1,X2,X3,X4
2,186000.0,182000.0,396.134,649.086
3,206000.0,186000.0,593.737,396.134
4,217000.0,206000.0,469.118,593.737
5,233000.0,217000.0,476.752,469.118
8,233000.0,227000.0,1058.4,1202.9


In [79]:
# Hold out 20% of the rows as a test set; random_state=1 fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [80]:
# Fit a linear regression on the training split. fit() returns the estimator
# itself, so ending the cell with `alg` displays the same repr as before.
alg = LinearRegression().fit(X_train, y_train)
alg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [81]:
# Show the estimator's constructor settings as a dict.
alg.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': 1, 'normalize': False}

In [82]:
# Evaluate the fitted model on the held-out split; for LinearRegression,
# score() returns the coefficient of determination (R^2).
alg.score(X_test, y_test)

0.960019327673997

In [83]:
# Predict the target for every held-out row and print the raw predictions
# (same units as Y).
y_predict = alg.predict(X_test)
print(y_predict)

[ 190794.71627631  217357.86281974  430244.11225409   34235.77445793
  173184.31231535  360274.88117369  167266.09021242  165546.55770286
  152646.11504628  118061.29497398   29735.9087492   277750.05318838
  230629.83462277  105900.85813017  229157.1298069   166327.39577958
  433826.46108864  193360.2502063   227889.65362064  215576.84026052
  380778.16929633  175204.90646006  148388.15085371  216826.50794372
   19538.54418833   54035.97031606]


In [84]:
# Print the actual held-out targets; the index shown is the original row label
# of each sample, not its position in the test set.
print(y_test)

244    200000.0
84     222000.0
138    460000.0
193      8000.0
214    189000.0
99     391000.0
241    151000.0
213    175000.0
238    160000.0
76      94000.0
196     36000.0
93     284000.0
248    267000.0
67      63000.0
82     205000.0
240    161000.0
141    383000.0
216    204000.0
247    221000.0
4      233000.0
100    402000.0
230    192000.0
229    178000.0
88     258000.0
61      14000.0
200     50000.0
Name: Y, dtype: float64


In [85]:
# Root-mean-squared error of the predictions on the held-out test set, in the
# same units as Y. Arguments follow the documented (y_true, y_pred) order; MSE
# is symmetric, so the resulting value is unchanged.
rmse = sqrt(mean_squared_error(y_test, y_predict))
print(rmse)

23298.873778861078


In [86]:
# Convert the held-out targets to a plain NumPy array, dropping the Series'
# row labels so the values can be used positionally.
# `.values` replaces Series.as_matrix(), which was deprecated in pandas 0.23 and
# removed in pandas 1.0; `.values` works on both old and current versions.
yt = y_test.values
print(type(yt))
print(yt)
y_test

<class 'numpy.ndarray'>
[ 200000.  222000.  460000.    8000.  189000.  391000.  151000.  175000.
  160000.   94000.   36000.  284000.  267000.   63000.  205000.  161000.
  383000.  204000.  221000.  233000.  402000.  192000.  178000.  258000.
   14000.   50000.]


244    200000.0
84     222000.0
138    460000.0
193      8000.0
214    189000.0
99     391000.0
241    151000.0
213    175000.0
238    160000.0
76      94000.0
196     36000.0
93     284000.0
248    267000.0
67      63000.0
82     205000.0
240    161000.0
141    383000.0
216    204000.0
247    221000.0
4      233000.0
100    402000.0
230    192000.0
229    178000.0
88     258000.0
61      14000.0
200     50000.0
Name: Y, dtype: float64

In [94]:
# Side-by-side table of predicted vs. actual values, both scaled to thousands.
p = pd.DataFrame({"y_predicted": y_predict / 1000, "y_test": yt / 1000})

# Only the predictions are rounded (to one decimal); y_test is left as-is.
p["y_predicted"] = p["y_predicted"].round(decimals=1)
p.describe()

Unnamed: 0,y_predicted,y_test
count,26.0,26.0
mean,196.711538,200.038462
std,111.520955,118.830124
min,19.5,8.0
25%,149.45,153.25
50%,183.0,196.0
75%,228.875,251.75
max,433.8,460.0


In [95]:
# Print every row of the comparison table (predicted vs. actual) as plain text.
print(p)

    y_predicted  y_test
0         190.8   200.0
1         217.4   222.0
2         430.2   460.0
3          34.2     8.0
4         173.2   189.0
5         360.3   391.0
6         167.3   151.0
7         165.5   175.0
8         152.6   160.0
9         118.1    94.0
10         29.7    36.0
11        277.8   284.0
12        230.6   267.0
13        105.9    63.0
14        229.2   205.0
15        166.3   161.0
16        433.8   383.0
17        193.4   204.0
18        227.9   221.0
19        215.6   233.0
20        380.8   402.0
21        175.2   192.0
22        148.4   178.0
23        216.8   258.0
24         19.5    14.0
25         54.0    50.0


In [97]:
# RMSE divided by 1000, i.e. on the same scale as the columns of p.
rmse/1000

23.29887377886108