# Importing libraries

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reading the dataset

In [48]:
data=pd.read_csv(r'C:\Users\abish\Downloads\natural_gas_price.csv')
data

Unnamed: 0,Date,Price
0,07-01-1997,3.82
1,08-01-1997,3.80
2,09-01-1997,3.61
3,10-01-1997,3.92
4,13-01-1997,4.00
...,...,...
5933,05-08-2020,2.23
5934,06-08-2020,2.26
5935,07-08-2020,2.15
5936,10-08-2020,2.18


In [50]:
data.head()


Unnamed: 0,Date,Price
0,07-01-1997,3.82
1,08-01-1997,3.8
2,09-01-1997,3.61
3,10-01-1997,3.92
4,13-01-1997,4.0


In [None]:
data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y')

data['year'] = data['Date'].dt.year
data['month'] = data['Date'].dt.month
data['day'] = data['Date'].dt.day

In [None]:
data.drop('Date',axis=1,inplace=True)

In [None]:

data

# Handling NULL Values

In [None]:
data.isnull().any()


In [None]:
sns.distplot(data['Price'].dropna())


In [None]:
data['Price'].fillna(data['Price'].median(),inplace=True)


In [None]:
data.isnull().sum()


In [None]:
data.describe()


# Finding and Removing Outliers

In [None]:
sns.boxplot(data['Price'])


In [None]:
from scipy import stats


In [None]:
z=np.abs(stats.zscore(data))
z


In [None]:
threshold=3
np.where(z>threshold)


In [None]:
df_no_outliers=data[(z<=3).all(axis=1)]
df_no_outliers


In [None]:
df_no_outliers.shape


# Data Visualization

In [None]:
fig=plt.figure(figsize=(5,5))
plt.scatter(data['day'],data['Price'],color='pink')
plt.xlabel('day')
plt.ylabel('Price')
plt.title('PRICE OF NATURAL GAS ON THE BASIS OF DAYS OF A MONTH')
plt.legend()

In [None]:
fig=plt.figure(figsize=(5,5))
plt.scatter(data['year'],data['Price'],color='green')
plt.xlabel('month')
plt.ylabel('Price')
plt.title('PRICE OF NATURAL GAS ON THE BASIS OF MONTHS OF A YEAR')
plt.legend()

In [None]:
sns.heatmap(data.corr())


In [None]:
plt.bar(data['month'],data['Price'],color='blue')
plt.xlabel('month')
plt.ylabel('Price')
plt.title('PRICE OF NATURAL GAS ON THE BASIS OF MONTHS OF A YEAR')
plt.legend()

In [None]:
sns.lineplot(x='year',y='Price',data=data,color='blue')


In [None]:
sns.lineplot(x='month',y='Price',data=data,color='blue')


In [None]:
sns.lineplot(x='day',y='Price',data=data,color='blue')


In [None]:
data['Price'].hist(bins=90)


In [None]:
data.boxplot(column='Price',by='month')


In [None]:
sns.pairplot(data)
plt.show()


# Separating Dependent and Independent Variables

In [88]:
x=data.iloc[:,1:4].values
y=data.iloc[:,0].values
x


array([[3.82],
       [3.8 ],
       [3.61],
       ...,
       [2.15],
       [2.18],
       [2.19]])

In [89]:
y

array(['07-01-1997', '08-01-1997', '09-01-1997', ..., '07-08-2020',
       '10-08-2020', '11-08-2020'], dtype=object)

In [90]:
x

array([[3.82],
       [3.8 ],
       [3.61],
       ...,
       [2.15],
       [2.18],
       [2.19]])

In [91]:
from sklearn.preprocessing import StandardScaler
sc= StandardScaler()
x=sc.fit_transform(x)
x


array([[-0.16851934],
       [-0.17764749],
       [-0.26436485],
       ...,
       [-0.93071936],
       [-0.91702715],
       [-0.91246307]])

# Train and Test

In [87]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [86]:
x_train.shape

(4750, 3)

In [85]:
y_train.shape


(4750,)

# Model Building

## Decision Tree Regressor

In [59]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

In [60]:
DecisionTreeRegressor()
df_grid = GridSearchCV(DecisionTreeRegressor(),param_grid = {'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],'splitter': ['best', 'random'],'max_depth': range(1, 11),'min_samples_split': range(10, 60, 10),},cv=5,n_jobs=1,scoring='neg_mean_squared_error')
df_grid.fit(x_train, y_train)
print(df_grid.best_params_)

{'criterion': 'squared_error', 'max_depth': 10, 'min_samples_split': 10, 'splitter': 'best'}


In [61]:
df=DecisionTreeRegressor(criterion='squared_error',max_depth=10,min_samples_split=10,splitter='best')
df.fit(x_train,y_train)

DecisionTreeRegressor(max_depth=10, min_samples_split=10)

In [62]:
y_pred_df=df.predict(x_test)
y_pred_df

array([3.11933333, 2.88277228, 8.24      , ..., 2.15      , 3.708     ,
       4.6204878 ])

In [63]:
y_test


array([3.56, 3.02, 7.26, ..., 2.09, 4.01, 4.46])

In [64]:
from sklearn.metrics import r2_score
accur_df=r2_score(y_test,y_pred_df)
print(accur_df)


0.9748852419473413


In [93]:
import joblib
joblib.dump(df, r"C:\Users\abish\OneDrive\Documents\Project\gas.joblib")


['C:\\Users\\abish\\OneDrive\\Documents\\Project\\gas.joblib']

## Random Forest Regressor

In [68]:
from sklearn.ensemble import RandomForestRegressor

In [69]:
rfr = RandomForestRegressor(n_estimators=1, random_state=10)
rfr.fit(x_train,y_train)
y_pred_rfr=rfr.predict(x_test)
y_pred_rfr

array([3.54, 3.04, 7.4 , ..., 2.16, 3.77, 4.35])

In [70]:
y_test

array([3.56, 3.02, 7.26, ..., 2.09, 4.01, 4.46])

In [71]:
accur_rfr=r2_score(y_test,y_pred_rfr)
print(accur_rfr)

0.9773693648709125


In [84]:
joblib.dump(rfr,r"C:\Users\abish\OneDrive\Documents\Project\RandomForestRegressor.pkl")

['C:\\Users\\abish\\OneDrive\\Documents\\Project\\RandomForestRegressor.pkl']

# Model Evaluation for Decision Tree Regressor

In [73]:
from sklearn.metrics import mean_absolute_error,mean_squared_error
mean_absolute_error(y_test,y_pred_df)


0.21435522699781956

In [74]:
mean_squared_error(y_test,y_pred_df)

0.1330631627486958

In [75]:
from math import sqrt
sqrt(mean_squared_error(y_test,y_pred_df))

0.36477823776740825

# Model Evaluation for Random Forest Regressor

In [76]:
mean_absolute_error(y_test,y_pred_rfr)

0.18007575757575758

In [77]:
mean_squared_error(y_test,y_pred_rfr)


0.11990176767676768

In [78]:
sqrt(mean_squared_error(y_test,y_pred_rfr))


0.3462683463396094