In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRFRegressor

In [3]:
df=pd.read_csv('50_Startups.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [6]:
df.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [7]:
x=df[['R&D Spend', 'Administration', 'Marketing Spend']]
y=df['Profit']

In [8]:
x_train ,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [9]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train=scaler.transform(x_train)
x_test=scaler.transform(x_test)

In [10]:
models={
    'LR':LinearRegression(),
    'KNN':KNeighborsRegressor(),
    'SVR':SVR(),
    'DT':DecisionTreeRegressor(),
    'XGBR':XGBRFRegressor(),
    'RF':RandomForestRegressor()
}

In [11]:
for name,model in models.items():
    print(f'using: {name}')
    model.fit(x_train,y_train)
    print(f'Train Accuracy: {model.score(x_train,y_train)}')
    print(f'Test Accuracy: {model.score(x_test,y_test)}')
    print('-'*30,'\n')
    

using: LR
Train Accuracy: 0.9502042999411326
Test Accuracy: 0.9411974407705612
------------------------------ 

using: KNN
Train Accuracy: 0.8808293667472917
Test Accuracy: 0.7965229349217278
------------------------------ 

using: SVR
Train Accuracy: -0.015170442656518235
Test Accuracy: -0.4279660942243313
------------------------------ 

using: DT
Train Accuracy: 1.0
Test Accuracy: 0.8617890070353016
------------------------------ 

using: XGBR
Train Accuracy: 0.9954894204784754
Test Accuracy: 0.9375399870693124
------------------------------ 

using: RF
Train Accuracy: 0.9917437302644745
Test Accuracy: 0.9246652421257844
------------------------------ 



In [12]:
model=LinearRegression()
model.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [13]:
y_pred=model.predict(x_test)

In [14]:
model.score(x_train,y_train) # accurcey of the data

0.9502042999411326

In [15]:
model.score(x_test,y_test)

0.9411974407705612

In [16]:
import joblib
joblib.dump(model,'model.h5')
joblib.dump(scaler,'scaler.h5')

['scaler.h5']