In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
from scipy import stats 
import matplotlib.pyplot as plt
from empiricaldist import Pmf , Cdf
from matplotlib.ticker import PercentFormatter

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation/FutureWarnings raised
# by pandas and scikit-learn; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

## Overfitting shows up as HIGH variance (the model fits the training data much better than unseen data).
## Underfitting shows up as HIGH bias (the model fits even the training data poorly).

In [2]:
# Load the average weekly earnings ("awe") dataset; the relative path is
# resolved against the notebook's working directory.
df_awe = pd.read_csv('QIPQ1_avg_weekly_earnings.csv')

In [3]:
# Peek at the last rows of the raw frame.
df_awe.tail()

Unnamed: 0,Industry Sector,Type of Employee,Quarter,Statistic Label,UNIT,VALUE
4315,Mining and Quarrying (NACE 10-14),Industrial Employees,2006Q2,Average Weekly Earnings (Euro),Euro,
4316,Mining and Quarrying (NACE 10-14),Industrial Employees,2006Q3,Average Weekly Earnings (Euro),Euro,
4317,Mining and Quarrying (NACE 10-14),Industrial Employees,2006Q4,Average Weekly Earnings (Euro),Euro,
4318,Mining and Quarrying (NACE 10-14),Industrial Employees,2007Q1,Average Weekly Earnings (Euro),Euro,
4319,Mining and Quarrying (NACE 10-14),Industrial Employees,2007Q2,Average Weekly Earnings (Euro),Euro,


In [4]:
# Structural overview of the raw frame: dimensions, per-column dtypes,
# per-column non-null counts, and per-column missing-value counts.
print("CHECKING the shape: ", df_awe.shape)
# The dtypes listing is labelled as data types (it was previously mislabelled
# as a NULL-value check).
print("\n CHECKING data types :\n", df_awe.dtypes)
print("\n COUNT is :\n", df_awe.count())
print("\n CHECKING NULL values :\n", df_awe.isnull().sum())

CHECKING the shape:  (4320, 6)

 CHECKING NULL values :
 Industry Sector      object
Type of Employee     object
Quarter              object
Statistic Label      object
UNIT                 object
VALUE               float64
dtype: object

 COUNT is :
 Industry Sector     4320
Type of Employee    4320
Quarter             4320
Statistic Label     4320
UNIT                4320
VALUE               3420
dtype: int64

 CHECKING NULL values :
 Industry Sector       0
Type of Employee      0
Quarter               0
Statistic Label       0
UNIT                  0
VALUE               900
dtype: int64


### - Column VALUE has only 3420 non-null entries, while every other column has 4320.
### - In other words, VALUE contains 900 null values.

In [5]:
# 'UNIT' and 'Statistic Label' are not used by the analysis, so remove them.
df_awe.drop(columns=['UNIT', 'Statistic Label'], inplace=True)

In [6]:
# Switch the remaining columns to snake_case names.
df_awe.rename(
    columns={
        'Industry Sector': 'industry_sector',
        'Type of Employee': 'type_of_employee',
        'Quarter': 'quarter',
        'VALUE': 'value',
    },
    inplace=True,
)

In [7]:
# Check the first rows after dropping and renaming columns.
df_awe.head()

Unnamed: 0,industry_sector,type_of_employee,quarter,value
0,All Industries (NACE 1-4),All Employees,1995Q3,
1,All Industries (NACE 1-4),All Employees,1995Q4,402.96
2,All Industries (NACE 1-4),All Employees,1996Q1,402.27
3,All Industries (NACE 1-4),All Employees,1996Q2,410.23
4,All Industries (NACE 1-4),All Employees,1996Q3,407.09


In [8]:
# Replace missing earnings with 0.0. The result is assigned back rather than
# calling fillna(inplace=True) on the selected column: that chained in-place form
# raises a FutureWarning in pandas 2.2+ and no longer updates the frame once
# Copy-on-Write is enabled (the default from pandas 3.0).
# NOTE(review): 'value' is used as the regression target further down, so these
# zeros become training labels — confirm that zero-imputation (rather than
# dropping the rows) is intended.
df_awe["value"] = df_awe["value"].fillna(0.0)

In [9]:
# Count the missing entries that remain in 'value' after imputation.
print("CHECKING NULL in column value :", df_awe['value'].isna().sum())

CHECKING NULL in column value : 0


In [10]:
from sklearn.preprocessing import LabelEncoder

cols = ['industry_sector', 'type_of_employee', 'quarter']  # categorical columns to encode

# Fit one LabelEncoder per column and keep it. Refitting a single throwaway
# encoder on each column discards the code -> label mapping; keeping the fitted
# encoders allows label_encoders[col].inverse_transform(...) later on.
label_encoders = {col: LabelEncoder().fit(df_awe[col]) for col in cols}

# Replace each categorical column with its integer codes before modelling.
for col in cols:
    df_awe[col] = label_encoders[col].transform(df_awe[col])

In [11]:
# Inspect the frame after label encoding.
df_awe.head()

Unnamed: 0,industry_sector,type_of_employee,quarter,value
0,0,0,0,0.0
1,0,0,1,402.96
2,0,0,2,402.27
3,0,0,3,410.23
4,0,0,4,407.09


## Split data to train and test

In [12]:
from sklearn.model_selection import train_test_split

# Features: every column except the target, selected by name so that a change in
# column order can never leak 'value' into X. Target: the 'value' column.
X = df_awe.drop(columns=['value']).values
y = df_awe['value'].values

# Hold out 30% of the rows for testing (test_size = 0.3); random_state fixes the
# shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Display the shapes of the full, train and test arrays.
X.shape, y.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4320, 3), (4320,), (3024, 3), (1296, 3), (3024,), (1296,))

## Apply Random Forest Regression

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Random forest: 500 trees capped at depth 8, max_features='sqrt', with a fixed
# seed so the fit is repeatable.
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=8,
    max_features='sqrt',
    random_state=10,
)
rf.fit(X_train, y_train)

# Predictions for the held-out test split.
y_pred_rfr = rf.predict(X_test)

In [14]:
from sklearn import metrics

# Evaluate the random forest on the held-out test split.
# r2_score is the coefficient of determination (not a classification accuracy)
# and mean_squared_error is an error measure, so both are labelled as such.
print("Random forest test R^2:", metrics.r2_score(y_test, y_pred_rfr))
print("Random forest test MSE:", metrics.mean_squared_error(y_test, y_pred_rfr))

Accuracy: 0.9422713656437022
Accuracy in SVR train is:  5937.896914275261


In [15]:
pip install xgboost # install xgboost because is not iontalled

Note: you may need to restart the kernel to use updated packages.


## Apply XGBoost in Regression

In [49]:
from xgboost import XGBRegressor

# Gradient-boosted trees with row sampling (subsample) and per-tree column
# sampling (colsample_bytree).
xbgr = XGBRegressor(
    n_estimators=200,
    learning_rate=0.35,
    max_depth=3,
    min_child_weight=3,
    subsample=0.55,
    colsample_bytree=0.7,
)
xbgr.fit(X_train, y_train)

# Score the fitted model on the held-out test split.
print('score is : ', xbgr.score(X_test, y_test))

score is :  0.9887070889035205


In [41]:
# Predict the target for the held-out test rows with the fitted XGBoost model.
y_predict_xgb = xbgr.predict(X_test)

In [42]:
# Test-set metrics for the XGBoost regressor: R^2 expressed as a percentage,
# and mean squared error. The labels name the actual metric, model and split
# (they previously said "Accuracy" and "SVR train").
print('R^2 (%) in XGBRegressor = ', round(metrics.r2_score(y_test, y_predict_xgb)*100,2))
print('mean squared error in XGBRegressor test is: ', metrics.mean_squared_error(y_test, y_predict_xgb))

Accuracy in XGBRegressor =  98.87
mean squared error in SVR train is:  1161.5750606380816


In [48]:
from sklearn.model_selection import GridSearchCV

# Base estimator with library defaults; the grid below supplies the tuned values.
model_in_gscv = XGBRegressor()

# Three options for each of six hyper-parameters -> 3**6 = 729 candidates.
parameters_xgb = {
    'learning_rate':    [0.045, 0.05, 0.06],
    'max_depth':        [3, 4, 5],
    'min_child_weight': [2, 3, 4],
    'subsample':        [0.5, 0.55, 0.6],
    'colsample_bytree': [0.7, 0.8, 0.85],
    'n_estimators':     [200, 300, 500],
}

# Exhaustive search with 2-fold cross-validation on the training split
# (729 candidates x 2 folds = 1458 fits), using 5 parallel jobs.
xgb_grid = GridSearchCV(
    estimator=model_in_gscv,
    param_grid=parameters_xgb,
    cv=2,
    n_jobs=5,
    verbose=True,
)

# Run the search; runtime depends on the machine.
xgb_grid.fit(X_train, y_train)


Fitting 2 folds for each of 729 candidates, totalling 1458 fits


In [47]:
# Report the best cross-validation score found by the grid search and the
# hyper-parameter combination that achieved it.
# NOTE(review): this cell's execution count (47) precedes the grid-search cell
# (48), so the printed values may come from an earlier run — re-run in order to confirm.
# NOTE(review): every reported best value sits on an edge of its search range,
# which suggests the grid may need widening.
print("best score is: ",xgb_grid.best_score_)
print('best params are: ',xgb_grid.best_params_)

best score is:  0.9918613994996384
best params are:  {'colsample_bytree': 0.7, 'learning_rate': 0.06, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 500, 'subsample': 0.5}
