# Imports

In [108]:
import pandas as pd
import numpy as np 

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import statsmodels.api as sm

# Constants

In [109]:
Source_path = 'D:/Danush/Datasets/1000_Companies.csv'
RD = 0
AD = 1
MK = 2

# Gathering the data

In [110]:
data = pd.read_csv(Source_path)

In [111]:
columns=['R&D Spend','Administration','Marketing Spend','State','Profit']

In [112]:
datas = pd.DataFrame(data=data,columns=columns)

In [113]:
Label = datas['Profit']
datas.drop('Profit',inplace=True,axis=1)

In [114]:
datas.drop('State',inplace=True,axis=1)

In [115]:
datas.corr()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
R&D Spend,1.0,0.582434,0.978407
Administration,0.582434,1.0,0.520465
Marketing Spend,0.978407,0.520465,1.0


# Training and Testing

In [116]:
X_train,X_test,Y_train,Y_test = train_test_split(datas,Label,test_size=0.3,random_state=10)

In [117]:
regr = LinearRegression()
regr.fit(X_train,Y_train)
regr.predict(X_test)

array([ 95005.05063687,  77072.10099319,  76839.31798513,  88729.42691382,
       163212.21904708,  50372.14324651, 116100.04227376,  96576.55260345,
        95395.09474398, 129866.26152676, 154325.79047393, 175382.70609627,
       182640.51861816, 147299.0303215 , 172488.92709488,  79827.42169355,
       151174.99822726,  87057.54255524, 106520.45662243, 172162.68463626,
       155280.28761258, 147537.00562233, 113053.95898295,  53711.58504699,
        72385.28664811, 119976.32607724, 183165.79486533,  69770.15489743,
       143044.89827075, 162256.8566419 ,  94250.4526689 , 107772.63917736,
       183094.83501339, 127605.06389491, 154945.39156385,  96755.68306542,
       161457.25970788,  99317.16214435, 172620.46245447,  94913.32195056,
        92832.12016281, 157223.89638596,  56591.51827004,  73719.67881158,
       166382.91474598,  64316.62758136,  97571.72190304, 104216.8562865 ,
       143622.961702  ,  59286.26347282, 133201.47399802,  58805.12068191,
       146476.06864798, 1

# Evaluation

In [118]:
Accuracy = regr.score(X_test,Y_test)
Coeff = regr.coef_
Intercept = regr.intercept_

In [119]:
Stats_X = sm.add_constant(X_train)
model = sm.OLS(Y_train,Stats_X)
results = model.fit()

In [120]:
pd.DataFrame(data=results.pvalues,index=columns,columns=['Pvalues'])

Unnamed: 0,Pvalues
R&D Spend,2.617811e-25
Administration,5.223176e-137
Marketing Spend,9.020117e-06
State,
Profit,


# Generating the User Inputs :

In [121]:
Mean = datas.mean().values.reshape(1,3)
print(Mean)

[[ 81668.9272     122963.8976117  226205.05841883]]


In [140]:
def User_inputs(R_and_D,Admin,Marketing):
    
    Mean[0][RD]=R_and_D 
    Mean[0][AD]=Admin
    Mean[0][MK]=Marketing
    
    Prediction = regr.predict(Mean)[0]
    
    print("The profit is estimated to be:",round(Prediction,2))

In [141]:
R_and_D = int(input('Enter the amount spent in R&D:'))
Admin = int(input('Enter the amount spent in Administration:'))
Marketing = int(input('Enter the amount spent in Marketing:'))
User_inputs(R_and_D,Admin,Marketing)
print(f'The accuracy of the model:{round(Accuracy*100,2)} %')

Enter the amount spent in R&D:211543
Enter the amount spent in Administration:14522
Enter the amount spent in Marketing:14522
The profit is estimated to be: 37912.4
The accuracy of the model:96.51 %
