## Random Forest
### Company Data 

In [1]:
# A cloth manufacturing company is interested to know about the segment or attributes causes high sale. 
# Approach - A Random Forest can be built with target variable Sales (we will first convert it in categorical variable) & all other variable will be independent in the analysis.  


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [4]:
company = pd.read_csv('Company_Data.csv')
company.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [5]:
#Creating dummy vairables
company=pd.get_dummies(company,columns=['ShelveLoc','Urban','US'], drop_first=True)

In [6]:
cut_labels=['Low','Medium','High']
cut_bins=[-1,5.66,12,17]
company['sales']=pd.cut(company['Sales'],labels=cut_labels,bins=cut_bins)

In [7]:
company.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Good,ShelveLoc_Medium,Urban_Yes,US_Yes,sales
0,9.5,138,73,11,276,120,42,17,0,0,1,1,Medium
1,11.22,111,48,16,260,83,65,10,1,0,1,1,Medium
2,10.06,113,35,10,269,80,59,12,0,1,1,1,Medium
3,7.4,117,100,4,466,97,55,14,0,1,1,1,Medium
4,4.15,141,64,3,340,128,38,13,0,0,1,0,Low


In [8]:
X = company.iloc[:,1:12]
Y = company.iloc[:,-1]

In [9]:
# Splitting data into train & test
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=0)

In [10]:
model= RandomForestClassifier(n_estimators=100,max_depth=10,min_samples_split=20,criterion='gini')

In [11]:
model.fit(Xtrain,Ytrain)

RandomForestClassifier(max_depth=10, min_samples_split=20)

In [12]:
print('Train accuracy: {}'.format(model.score(Xtrain, Ytrain)))

Train accuracy: 0.875


In [13]:
print('Test accuracy: {}'.format(model.score(Xtest, Ytest)))

Test accuracy: 0.775
