# Decision Tree Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

### Parsing the Date column

In [2]:
# Read the raw CSV into a DataFrame and preview the 'Date' column as loaded
# (still plain strings at this point).
csv_path = 'telangana_final.csv'
data = pd.read_csv(csv_path)
data.loc[:, 'Date'].head()

0    01/01/2011
1    18/04/2011
2    19/04/2011
3    20/04/2011
4    21/04/2011
Name: Date, dtype: object

In [3]:
import datetime as dt

# The raw 'Date' strings are day-first (e.g. '18/04/2011'). Without an explicit
# format pandas guesses: ambiguous values such as '05/04/2011' would be read
# month-first, and recent pandas versions infer the format from the first value
# and then fail on any day > 12. Stating the format makes parsing unambiguous.
# NOTE(review): assumes every row uses DD/MM/YYYY, as the previewed rows do.
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
data['Date'].head()

0   2011-01-01
1   2011-04-18
2   2011-04-19
3   2011-04-20
4   2011-04-21
Name: Date, dtype: datetime64[ns]

### Handling infinite and missing values

In [4]:
# Treat infinities of either sign as missing so they are imputed below
# (the original only caught +inf).
data[(data == np.inf) | (data == -np.inf)] = np.nan

# Impute missing numeric entries with their column mean. numeric_only=True is
# required because the frame also holds text and datetime columns, for which a
# mean is undefined (and raises on recent pandas). Reassigning instead of using
# inplace=True keeps the cell free of hidden in-place mutation.
data = data.fillna(data.mean(numeric_only=True))

In [5]:
# Positional indices of the columns used as model inputs.
feature_positions = [0, 1, 2, 4]

# Pull those columns out as a NumPy array and display it.
X = data.iloc[:, feature_positions].values
X

array([[2011, 'Adilabad', 'Cotton (Unginned)', 1],
       [2011, 'Adilabad', 'Cotton (Unginned)', 4],
       [2011, 'Adilabad', 'Cotton (Unginned)', 4],
       ...,
       [2020, 'Warangal', 'Cotton (Unginned)', 5],
       [2020, 'Warangal', 'Cotton (Unginned)', 5],
       [2020, 'Warangal', 'Cotton (Unginned)', 5]], dtype=object)

In [6]:
# Show the second feature column (index 1) before it is label-encoded.
print(X[:,1])

['Adilabad' 'Adilabad' 'Adilabad' ... 'Warangal' 'Warangal' 'Warangal']


In [7]:
from sklearn.preprocessing import LabelEncoder

# One encoder per text column so each mapping can be reused independently later.
le_1, le_2 = LabelEncoder(), LabelEncoder()

# Replace the text values in columns 1 and 2 of X with their integer codes.
X[:, 1] = le_1.fit_transform(X[:, 1])
X[:, 2] = le_2.fit_transform(X[:, 2])

# Save the encoded feature matrix to disk, then display it.
encoded_frame = pd.DataFrame(X)
encoded_frame.to_csv('LabelEncoded_telangana_final.csv')
X

array([[2011, 0, 2, 1],
       [2011, 0, 2, 4],
       [2011, 0, 2, 4],
       ...,
       [2020, 3, 2, 5],
       [2020, 3, 2, 5],
       [2020, 3, 2, 5]], dtype=object)

In [8]:
# Positional index of the column used as the regression target.
target_position = 5

# Extract the target as a NumPy array and display it.
y = data.iloc[:, target_position].values
y

array([4200, 5525, 5525, ..., 4450, 4425, 4550], dtype=int64)

### Train/test split

In [9]:
from sklearn.model_selection import train_test_split as tts

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = tts(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts
y_train

array([4340, 4990, 4263, ..., 3800, 4000, 3850], dtype=int64)

In [10]:
# Display the training feature matrix produced by the split.
X_train

array([[2011, 1, 5, 1],
       [2013, 3, 5, 8],
       [2016, 0, 2, 3],
       ...,
       [2014, 3, 3, 2],
       [2015, 0, 4, 11],
       [2013, 0, 2, 5]], dtype=object)

## Training the Decision Tree Regression model on the Training set

In [11]:
from sklearn.tree import DecisionTreeRegressor

# Pin the estimator's seed (same value as the train/test split) so that
# tie-breaking between equally good splits is deterministic and the saved
# model and reported score can be reproduced on a fresh kernel.
dtr = DecisionTreeRegressor(random_state=0)
dtr.fit(X_train, y_train)

DecisionTreeRegressor()

In [12]:
# Print the held-out feature matrix that the models are scored on.
print(X_test)

[[2014 0 2 12]
 [2014 0 4 9]
 [2016 2 1 11]
 ...
 [2014 2 0 12]
 [2017 2 1 1]
 [2016 2 2 6]]


In [13]:
# Predict on the held-out rows with the fitted decision tree.
y_pred_dtr = dtr.predict(X_test)

# Print predictions (left column) next to the true values (right column).
predicted_col = y_pred_dtr.reshape(-1, 1)
actual_col = y_test.reshape(-1, 1)
print(np.concatenate((predicted_col, actual_col), axis=1))

[[4050.         4050.        ]
 [4000.         4000.        ]
 [4095.65217391 4500.        ]
 ...
 [4100.         4100.        ]
 [4478.26086957 3800.        ]
 [4000.         4000.        ]]


In [14]:
import joblib as jb

# Serialize the fitted decision tree once per target file name.
dtr_artifacts = (
    'decision_tree_telangana.ml',
    'decision_tree_telangana.pkl',
    'decision_tree_telangana.dat',
)
for artifact in dtr_artifacts:
    written = jb.dump(dtr, artifact)

# Show the return value of the last dump, as the original cell did.
written

['decision_tree_telangana.dat']

In [15]:
# Persist both fitted label encoders so the same text-to-code mapping can be
# reloaded alongside the model.
for encoder, target in ((le_1, 'le_1.pkl'), (le_2, 'le_2.pkl')):
    written = jb.dump(encoder, target)

# Show the return value of the last dump, as the original cell did.
written

['le_2.pkl']

## Evaluating the Model Performance

In [16]:
from sklearn.metrics import r2_score

# Coefficient of determination of the decision tree on the held-out rows.
dtr_r2 = r2_score(y_true=y_test, y_pred=y_pred_dtr)
dtr_r2

0.9126466214923368

In [17]:
# LabelEncoder.transform returns a length-1 array for a single label; take the
# scalar code out of it. Passing the arrays themselves makes the feature row a
# ragged nested sequence, which recent NumPy versions refuse to convert.
col1_code = le_1.transform(['Adilabad'])[0]
col2_code = le_2.transform(['Cotton (Unginned)'])[0]

# One sample, laid out in the same column order as X:
# [column 0 value, le_1 code, le_2 code, column 4 value].
y_pred = dtr.predict([[2020, col1_code, col2_code, 3]])

In [18]:
# Print the decision tree's prediction for the single hand-built sample.
print(y_pred)

[3974.44444444]


### Random Forest Regression

In [19]:
from sklearn.ensemble import RandomForestRegressor

# A random forest bootstraps rows and randomizes its trees, so without a seed
# every run yields a different model and score. Pin it (same value as the
# train/test split) to make the notebook reproducible.
rfr = RandomForestRegressor(random_state=0)
rfr.fit(X_train, y_train)

RandomForestRegressor()

In [20]:
# Predict on the held-out rows with the fitted random forest.
y_pred_rfr = rfr.predict(X_test)

# Print predictions (left column) next to the true values (right column).
predicted_col = y_pred_rfr.reshape(-1, 1)
actual_col = y_test.reshape(-1, 1)
print(np.concatenate((predicted_col, actual_col), axis=1))

[[4050.         4050.        ]
 [4000.         4000.        ]
 [4088.72784344 4500.        ]
 ...
 [4100.         4100.        ]
 [4502.79797236 3800.        ]
 [4000.         4000.        ]]


In [21]:
from sklearn.metrics import r2_score

# Coefficient of determination of the random forest on the held-out rows.
rfr_r2 = r2_score(y_true=y_test, y_pred=y_pred_rfr)
rfr_r2

0.9128555595728136

In [22]:
import joblib as jb

# Serialize the fitted random forest once per target file name.
rfr_artifacts = (
    'random_forest_telangana.ml',
    'random_forest_telangana.pkl',
)
for artifact in rfr_artifacts:
    written = jb.dump(rfr, artifact)

# Show the return value of the last dump, as the original cell did.
written

['random_forest_telangana.pkl']