In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
data=pd.read_csv('Company_Data.csv')
data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


In [3]:
# Taking mean of sales
data.Sales.mean()

7.496325

In [4]:
# Categorizing sales based om means
data.loc[data['Sales']>=7.5,'sales']='Good'
data.loc[data['Sales']<7.5,'sales']='Poor'

In [5]:
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,sales
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes,Good
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes,Good
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes,Good
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes,Poor
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No,Poor


In [6]:
data=data.drop(['Sales'],axis=1)
data.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,sales
0,138,73,11,276,120,Bad,42,17,Yes,Yes,Good
1,111,48,16,260,83,Good,65,10,Yes,Yes,Good
2,113,35,10,269,80,Medium,59,12,Yes,Yes,Good
3,117,100,4,466,97,Medium,55,14,Yes,Yes,Poor
4,141,64,3,340,128,Bad,38,13,Yes,No,Poor


In [7]:
# Converting Categorical variables into numerical values
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
for column_name in data.columns:
    if data[column_name].dtype==object:
        data[column_name]=le.fit_transform(data[column_name])
    else:
        pass

data

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,sales
0,138,73,11,276,120,0,42,17,1,1,0
1,111,48,16,260,83,1,65,10,1,1,0
2,113,35,10,269,80,2,59,12,1,1,0
3,117,100,4,466,97,2,55,14,1,1,1
4,141,64,3,340,128,0,38,13,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
395,138,108,17,203,128,1,33,14,1,1,0
396,139,23,3,37,120,2,55,11,0,1,1
397,162,26,12,368,159,2,40,18,1,1,1
398,100,79,7,284,95,0,50,12,1,1,1


In [8]:
from sklearn.model_selection import train_test_split
x=data.iloc[:,0:10]
y=data.iloc[:,10]

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=10)

In [9]:
num_trees=100
max_features=4
kfold=KFold(n_splits=10,random_state=7,shuffle=True)

In [10]:
# Creating the Model
clf=RandomForestClassifier(n_estimators=num_trees,max_features=max_features)
clf.fit(x_train,y_train)

RandomForestClassifier(max_features=4)

In [11]:
# Predicting the Values
y_pred=clf.predict(x_test)

In [12]:
y_pred

array([0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1])

In [13]:
pred=pd.DataFrame(pd.Series(y_pred))
pred

Unnamed: 0,0
0,0
1,1
2,0
3,1
4,0
...,...
75,0
76,1
77,1
78,0


In [14]:
pred.columns=['Predictions']
pred

Unnamed: 0,Predictions
0,0
1,1
2,0
3,1
4,0
...,...
75,0
76,1
77,1
78,0


In [15]:
pd.set_option('display.max_rows',None)
data_final=pd.concat([data,pred],axis=1)
data_final

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,sales,Predictions
0,138,73,11,276,120,0,42,17,1,1,0,0.0
1,111,48,16,260,83,1,65,10,1,1,0,1.0
2,113,35,10,269,80,2,59,12,1,1,0,0.0
3,117,100,4,466,97,2,55,14,1,1,1,1.0
4,141,64,3,340,128,0,38,13,1,0,1,0.0
5,124,113,13,501,72,0,78,16,0,1,0,1.0
6,115,105,0,45,108,2,71,15,1,0,1,1.0
7,136,81,15,425,120,1,67,10,1,1,0,1.0
8,132,110,0,108,124,2,76,10,0,0,1,1.0
9,132,113,0,131,124,2,76,17,0,1,1,0.0


In [16]:
np.mean(y_pred==y_test)

0.875

In [17]:
model=RandomForestClassifier(n_estimators=num_trees,max_features=max_features)

In [18]:
array=data.values
X=array[:,0:10]
Y=array[:,10]

In [19]:
result=cross_val_score(model,X,Y,cv=kfold)

In [20]:
result

array([0.7  , 0.75 , 0.8  , 0.75 , 0.9  , 0.925, 0.9  , 0.9  , 0.7  ,
       0.85 ])

In [21]:
print(result.mean())

0.8175000000000001
