In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the price table from its CSV file into a DataFrame.
price_csv_path = '../input/cottonprice/PriceYear.csv'
priceCol = pd.read_csv(price_csv_path)

# Data Preprocessing

In [None]:
priceCol = priceCol.iloc[7:,:]

In [None]:
priceCol

In [None]:
priceCol.drop(['Change'],axis=1,inplace=True)

In [None]:
priceCol.info()

In [None]:
priceCol['year'] = priceCol['Month'].str.split("-",expand = True)[1]

In [None]:
priceCol['month'] = priceCol['Month'].str.split('-',expand = True)[0]

In [None]:
# Lookup from zero-padded two-digit year codes ('01' .. '20') to the
# corresponding four-digit years (2001 .. 2020).
yeardict = {f"{offset:02d}": 2000 + offset for offset in range(1, 21)}

In [None]:
# Translate the two-digit year codes into four-digit years; codes that are
# not keys of yeardict become NaN.
year_codes = priceCol['year']
priceCol['year'] = year_codes.map(yeardict)

In [None]:
# Lookup from three-letter month abbreviation to calendar month number (1-12).
monthdict = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
        start=1,
    )
}
priceCol['month'] = priceCol['month'].map(monthdict)

In [None]:
priceCol.drop(['Month'],axis=1,inplace=True)

In [None]:
# Bar chart of 'Price' per 'year' (seaborn aggregates the rows of each year).
plt.figure(figsize=(20,10))
# x/y must be passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x='year', y='Price', data=priceCol)
# rotation has to be a number (or 'vertical'/'horizontal'); the string '90'
# is rejected by current matplotlib releases.
plt.xticks(rotation=90)

# Taking Production Factor

In [None]:
prod = pd.read_csv('../input/cottonpricedata/cotton_sih/Production.csv')

In [None]:
prod.head(5)

In [None]:
prod = prod.iloc[41:,:2]

In [None]:
prod

In [None]:
# Hard-coded production figure for each year 2001-2020, keyed by the
# four-digit year. The values are listed in year order and zipped onto the
# year range.
productionDict = dict(zip(
    range(2001, 2021),
    [
        12300.0, 10600.0, 14000.0, 19000.0, 19050.0,      # 2001-2005
        22200.0, 24200.0, 23000.0, 24200.0, 26900.0,      # 2006-2010
        28700.0, 28500.0, 31000.0, 29500.0, 25900.0,      # 2011-2015
        27000.0, 29000.0, 25800.0, 30500.0, 26450.45876,  # 2016-2020
    ],
))

In [None]:
priceCol['Production'] = priceCol['year']

In [None]:
priceCol['Production'] = priceCol['Production'].map(productionDict)

In [None]:
priceCol

# Oil Price Factor

In [None]:
oilPrice = pd.read_csv('../input/cottonpricedata/cotton_sih/BrentOilPrices.csv') 

In [None]:
oilPrice.head(3457)

In [None]:
oilPrice = oilPrice.iloc[3453:,:]

In [None]:
oilPrice.head(4872)

In [None]:
oilPrice['Year'] = oilPrice['Date'].str.split('-',expand = True)[2]

In [None]:
oilPrice

In [None]:
oilPrice.info()

In [None]:
oilPrice.head(100)

In [None]:
oilPrice = oilPrice.fillna("20")

In [None]:
oilPrice.info()

In [None]:
oilPrice.drop(['Date'],axis=1,inplace=True)

In [None]:
# Lookup from zero-padded two-digit year codes ('01' .. '20') to four-digit
# years (2001 .. 2020); used below to decode the oil price 'Year' column.
yeardict = dict((str(suffix).zfill(2), 2000 + suffix) for suffix in range(1, 21))

In [None]:
oilPrice['Year'] = oilPrice['Year'].map(yeardict)

In [None]:
oilPrice = oilPrice.groupby(by="Year",axis=0,as_index=False).mean()

In [None]:
oilPrice

In [None]:
# Hard-coded oil price per year for 2001-2020, keyed by the four-digit year.
# The values are listed in year order and numbered from 2001.
OilpriceDict = {
    year: price
    for year, price in enumerate(
        [
            24.455720, 24.993255, 28.850814, 38.259693, 54.574553,     # 2001-2005
            65.161765, 72.441160, 96.944348, 61.738770, 79.609444,     # 2006-2010
            111.264274, 111.570683, 108.555000, 98.969606, 52.316549,  # 2011-2015
            43.638000, 54.124805, 71.335000, 64.319845, 45.544615,     # 2016-2020
        ],
        start=2001,
    )
}

In [None]:
priceCol['Year'] = priceCol['year']

In [None]:
priceCol['oilPrice'] = priceCol['Year'].map(OilpriceDict)

In [None]:
priceCol.drop(['Year'],axis=1,inplace=True)

In [None]:
# Bar chart of 'Price' for each distinct 'oilPrice' value.
# x/y must be passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x='oilPrice', y='Price', data=priceCol)
# rotation has to be a number (or 'vertical'/'horizontal'); the string '90'
# is rejected by current matplotlib releases.
plt.xticks(rotation=90)

# Final Dataset For Price Prediction

In [None]:
priceCol.to_csv("Mydataset.csv",index = False)

In [None]:
X = priceCol[['year','month','Production','oilPrice']]
y = priceCol['Price']

In [None]:
X = X.values
y = y.values

In [None]:
print(X.shape,y.shape)

# TRAIN TEST SPLIT

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# MODELLING

In [None]:
# Candidate regressors: linear, tree-based, ensemble, kernel and neighbour models
import xgboost
# The estimator class is `Ridge`; a lowercase `ridge` name cannot be imported
# from sklearn.linear_model in current scikit-learn releases.
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

# Evaluation metrics (R^2, mean squared error, explained variance) and
# math / serialisation helpers
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.metrics import explained_variance_score
import math
import joblib
import pickle

# ACCURACY

In [None]:
# AdaBoostRegressor
# The estimator is seeded (same value as the train/test split) so the scores
# printed below are reproducible from run to run.
ad = AdaBoostRegressor(random_state=101)
ad.fit(X_train,y_train)
pred = ad.predict(X_test)
# The value printed as "Accuracy Score" is the R^2 coefficient of determination.
print("Accuracy Score -> ",r2_score(y_test,pred))
# Root mean squared error on the held-out rows.
print("Rmse -> ",math.sqrt(mean_squared_error(y_test,pred)))

In [None]:
# XGB
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)
xgb.fit(X_train,y_train)
predictions = xgb.predict(X_test)
print(explained_variance_score(predictions,y_test))

In [None]:
print("Accuracy of the model is given by :-> ",explained_variance_score(predictions,y_test)*100,"%")

## This is the best accuracy achieved so far; I am trying to improve it further.

# Thank You