In [11]:
# Import the data set.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 reads text columns as
# character by default; the per-level State counts printed by summary() further
# down rely on State being a factor.
dataset <- read.csv("50_Startups.csv", stringsAsFactors = TRUE)

## basic exploratory data analysis

In [12]:
# Preview the first six rows of the data set.
head(dataset, n = 6L)

R.D.Spend,Administration,Marketing.Spend,State,Profit
165349.2,136897.8,471784.1,New York,192261.8
162597.7,151377.59,443898.5,California,191792.1
153441.5,101145.55,407934.5,Florida,191050.4
144372.4,118671.85,383199.6,New York,182902.0
142107.3,91391.77,366168.4,Florida,166187.9
131876.9,99814.71,362861.4,New York,156991.1


In [13]:
# Show the column types and a few leading values of each column.
str(object = dataset)

'data.frame':	50 obs. of  5 variables:
 $ R.D.Spend      : num  165349 162598 153442 144372 142107 ...
 $ Administration : num  136898 151378 101146 118672 91392 ...
 $ Marketing.Spend: num  471784 443899 407935 383200 366168 ...
 $ State          : Factor w/ 3 levels "California","Florida",..: 3 1 2 3 2 3 1 2 3 1 ...
 $ Profit         : num  192262 191792 191050 182902 166188 ...


In [14]:
# Per-column descriptive statistics (quartiles/mean for numeric columns,
# level counts for factors).
summary(object = dataset)

   R.D.Spend      Administration   Marketing.Spend         State   
 Min.   :     0   Min.   : 51283   Min.   :     0   California:17  
 1st Qu.: 39936   1st Qu.:103731   1st Qu.:129300   Florida   :16  
 Median : 73051   Median :122700   Median :212716   New York  :17  
 Mean   : 73722   Mean   :121345   Mean   :211025                  
 3rd Qu.:101603   3rd Qu.:144842   3rd Qu.:299469                  
 Max.   :165349   Max.   :182646   Max.   :471784                  
     Profit      
 Min.   : 14681  
 1st Qu.: 90139  
 Median :107978  
 Mean   :112013  
 3rd Qu.:139766  
 Max.   :192262  

In [15]:
# TRUE if at least one cell of the data set is missing (NA), FALSE otherwise.
anyNA(dataset)

# encoding the categorical State column as a factor with numeric labels

In [16]:
# Re-encode State as a factor whose levels are labelled "1", "2", "3" for
# New York, California and Florida respectively (New York becomes the
# reference level).
dataset[["State"]] <- factor(
  x = dataset[["State"]],
  levels = c("New York", "California", "Florida"),
  labels = c("1", "2", "3")
)

In [17]:
# Preview the first rows again to confirm the State re-encoding.
head(x = dataset)

R.D.Spend,Administration,Marketing.Spend,State,Profit
165349.2,136897.8,471784.1,1,192261.8
162597.7,151377.59,443898.5,2,191792.1
153441.5,101145.55,407934.5,3,191050.4
144372.4,118671.85,383199.6,1,182902.0
142107.3,91391.77,366168.4,3,166187.9
131876.9,99814.71,362861.4,1,156991.1


### splitting the dataset into training and test sets

In [19]:
# Split the rows into training and testing sets with a split ratio of 0.8;
# the fixed seed makes the random split reproducible.
library(caTools)
set.seed(100)
spliter <- sample.split(Y = dataset[["Profit"]], SplitRatio = 0.8)

# spliter is a logical mask: TRUE rows go to training, FALSE rows to testing.
training_set <- dataset[spliter, , drop = FALSE]
testing_set <- dataset[!spliter, , drop = FALSE]

# building the linear model

In [20]:
# Fit Profit against every other column of the training set.
model <- lm(
  formula = Profit ~ .,
  data = training_set
)

In [22]:
# Print the coefficient table and fit statistics of the full model.
print(x = summary(object = model))


Call:
lm(formula = Profit ~ ., data = training_set)

Residuals:
   Min     1Q Median     3Q    Max 
-32380  -4586   -190   4940  20038 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     4.171e+04  8.420e+03   4.954 1.97e-05 ***
R.D.Spend       8.146e-01  5.828e-02  13.977 1.18e-15 ***
Administration  1.858e-02  6.098e-02   0.305    0.762    
Marketing.Spend 2.873e-02  2.083e-02   1.379    0.177    
State2          1.878e+03  4.104e+03   0.458    0.650    
State3          1.584e+03  4.024e+03   0.394    0.696    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9938 on 34 degrees of freedom
Multiple R-squared:  0.9496,	Adjusted R-squared:  0.9421 
F-statistic:   128 on 5 and 34 DF,  p-value: < 2.2e-16



### based on the p-values, only the 'R.D.Spend' variable is statistically significant for our model

In [27]:
# Refit without the insignificant 'Administration' variable, keeping the
# remaining independent variables.
model2 <- lm(
  formula = Profit ~ R.D.Spend + Marketing.Spend + State,
  data = training_set
)

In [28]:
# Print the coefficient table and fit statistics of the reduced model.
print(x = summary(object = model2))


Call:
lm(formula = Profit ~ R.D.Spend + Marketing.Spend + State, data = training_set)

Residuals:
   Min     1Q Median     3Q    Max 
-32337  -4930    376   5193  20149 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     4.390e+04  4.310e+03  10.188  5.2e-12 ***
R.D.Spend       8.198e-01  5.497e-02  14.912  < 2e-16 ***
Marketing.Spend 2.722e-02  1.997e-02   1.363    0.181    
State2          1.884e+03  4.050e+03   0.465    0.645    
State3          1.662e+03  3.963e+03   0.419    0.677    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9808 on 35 degrees of freedom
Multiple R-squared:  0.9494,	Adjusted R-squared:  0.9436 
F-statistic: 164.3 on 4 and 35 DF,  p-value: < 2.2e-16



In [29]:
# Refit using 'R.D.Spend' as the only predictor.
model3 <- lm(
  formula = Profit ~ R.D.Spend,
  data = training_set
)

In [31]:
# Print the coefficient table and fit statistics of the single-predictor model.
print(x = summary(object = model3))


Call:
lm(formula = Profit ~ R.D.Spend, data = training_set)

Residuals:
   Min     1Q Median     3Q    Max 
-32393  -4874    134   5177  18628 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 4.707e+04  3.085e+03   15.26   <2e-16 ***
R.D.Spend   8.724e-01  3.390e-02   25.73   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9751 on 38 degrees of freedom
Multiple R-squared:  0.9457,	Adjusted R-squared:  0.9443 
F-statistic: 662.2 on 1 and 38 DF,  p-value: < 2.2e-16

