Problem Statement:
A cloth manufacturing company is interested to know about the segment or attributes causes high sale. 
Approach - A decision tree can be built with target variable Sale (we will first convert it in categorical variable) & all other variable will be independent in the analysis.  

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('Company_Data.csv')
df

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


In [3]:
#check any null values present
df.isna().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

Urban and US - columns are categorical column has to converted into numerical and
ShelveLoc - to ordinal encoding because the values in column is ordinal

In [4]:
#Converting categoical into numerical - Urban and US
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

le= LabelEncoder()
df['Urban'] = le.fit_transform(df['Urban'])
df['US'] = le.fit_transform(df['US'])
df['ShelveLoc'] = le.fit_transform(df['ShelveLoc'])

In [5]:
df

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,0,42,17,1,1
1,11.22,111,48,16,260,83,1,65,10,1,1
2,10.06,113,35,10,269,80,2,59,12,1,1
3,7.40,117,100,4,466,97,2,55,14,1,1
4,4.15,141,64,3,340,128,0,38,13,1,0
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,1,33,14,1,1
396,6.14,139,23,3,37,120,2,55,11,0,1
397,7.41,162,26,12,368,159,2,40,18,1,1
398,5.94,100,79,7,284,95,0,50,12,1,1


In [6]:
#Split data into training and testing

x = df.iloc[:,1:]
y = df['Sales']

xtrain,xtext,ytrain,ytest = train_test_split(x,y,test_size=0.2, random_state=42)
xtrain

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
3,117,100,4,466,97,2,55,14,1,1
18,110,110,0,408,68,1,46,17,0,1
202,121,78,4,413,130,0,46,10,0,1
250,137,105,10,435,156,1,72,14,1,1
274,135,93,2,67,119,2,34,11,1,1
...,...,...,...,...,...,...,...,...,...,...
71,148,51,16,148,150,2,58,17,0,1
106,102,33,0,217,139,2,70,18,0,0
270,119,26,0,284,89,1,26,10,1,0
348,132,102,20,459,107,1,49,11,1,1


# Model building

In [7]:
from sklearn.metrics import r2_score
dt = DecisionTreeRegressor()
dt.fit(xtrain,ytrain)
ypred = dt.predict(xtext)

r2_score(ytest,ypred)

0.35989638348633035

In [8]:
#This model contains Overfitting probelm

dt.score(xtrain,ytrain), dt.score(xtext,ytest)

(1.0, 0.35989638348633035)

# Hyperparameter tuning

In [9]:
from sklearn.model_selection import GridSearchCV

In [10]:
params = {
    'criterion' : ["squared_error", "friedman_mse", "absolute_error"],
    'splitter' : ['best','random'],
    'max_depth' : [2,3,4,5,6,7,8,9,10,11]
    }

In [11]:
grid = GridSearchCV(DecisionTreeRegressor(), param_grid=params)

In [12]:
grid.fit(xtrain,ytrain)

GridSearchCV(estimator=DecisionTreeRegressor(),
             param_grid={'criterion': ['squared_error', 'friedman_mse',
                                       'absolute_error'],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
                         'splitter': ['best', 'random']})

In [13]:
#Finding out the best parameter 
grid.best_params_

{'criterion': 'absolute_error', 'max_depth': 6, 'splitter': 'random'}

In [14]:
#create the model again using above parameter

dt = DecisionTreeRegressor(criterion='squared_error', max_depth=5, splitter='random')

In [15]:
dt.fit(xtrain,ytrain)
ypred = dt.predict(xtext)

print(dt.score(xtrain,ytrain))
dt.score(xtext,ytest)

0.5783213285287871


0.4423030986714699