### Usecase: A venture capitalist company has hired you as a Data Scientist. Your role is to create and deploy a model that can predict the profit of the given startup company based on company's spending pattern and company's office location.

In [1]:
import numpy as np
import pandas as pd

In [4]:
data = pd.read_csv('50_Startups.csv')
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [11]:
data["State"] = data["State"].astype('category')
data.dtypes

R&D Spend           float64
Administration      float64
Marketing Spend     float64
State              category
Profit              float64
dtype: object

In [12]:
data["State_cat"] = data["State"].cat.codes
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,State_cat
0,165349.2,136897.8,471784.1,New York,192261.83,2
1,162597.7,151377.59,443898.53,California,191792.06,0
2,153441.51,101145.55,407934.54,Florida,191050.39,1
3,144372.41,118671.85,383199.62,New York,182901.99,2
4,142107.34,91391.77,366168.42,Florida,166187.94,1


In [10]:
data['State2'] = data.State.cat.State
data.head()

AttributeError: Can only use .cat accessor with a 'category' dtype

In [None]:
value=str.replace("Sixty Two","62")

In [4]:
# Seperating data as features and label
features = data.iloc[:,[0,1,2,3]].values
label = data.iloc[:,[4]].values

In [5]:
# Deal with Missing Data
# No missing data , so no analysis reqd


In [6]:
# Deal with Categorical Data --- State

# If your dataset has multiple categorical columns, you will create object for each column
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
#Encode Country Column
labelencoder_X = LabelEncoder()
features[:,3] = labelencoder_X.fit_transform(features[:,3])
features

array([[165349.2, 136897.8, 471784.1, 2],
       [162597.7, 151377.59, 443898.53, 0],
       [153441.51, 101145.55, 407934.54, 1],
       [144372.41, 118671.85, 383199.62, 2],
       [142107.34, 91391.77, 366168.42, 1],
       [131876.9, 99814.71, 362861.36, 2],
       [134615.46, 147198.87, 127716.82, 0],
       [130298.13, 145530.06, 323876.68, 1],
       [120542.52, 148718.95, 311613.29, 2],
       [123334.88, 108679.17, 304981.62, 0],
       [101913.08, 110594.11, 229160.95, 1],
       [100671.96, 91790.61, 249744.55, 0],
       [93863.75, 127320.38, 249839.44, 1],
       [91992.39, 135495.07, 252664.93, 0],
       [119943.24, 156547.42, 256512.92, 1],
       [114523.61, 122616.84, 261776.23, 2],
       [78013.11, 121597.55, 264346.06, 0],
       [94657.16, 145077.58, 282574.31, 2],
       [91749.16, 114175.79, 294919.57, 1],
       [86419.7, 153514.11, 0.0, 2],
       [76253.86, 113867.3, 298664.47, 0],
       [78389.47, 153773.43, 299737.29, 2],
       [73994.56, 122782.75, 30331

In [None]:
encoder = OneHotEncoder(sparse=False)
onehot = encoder.fit_transform()

In [12]:

data.State.unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [13]:
labelencoder_X.classes_

array([0, 1, 2], dtype=object)

In [14]:
#Create Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(features,
                                                 label,
                                                 test_size=0.2,
                                                 random_state=39)

In [15]:
#Initialize the algo
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [16]:
#Check for Generalization
print(model.score(X_train,y_train))
print(model.score(X_test,y_test))

0.9387396277440356
0.9703301403285373


In [156]:
#Its a generalized model

In [157]:
#Deployment 
# 1. Test user input

In [13]:
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,State_cat
0,165349.2,136897.8,471784.1,New York,192261.83,2
1,162597.7,151377.59,443898.53,California,191792.06,0
2,153441.51,101145.55,407934.54,Florida,191050.39,1
3,144372.41,118671.85,383199.62,New York,182901.99,2
4,142107.34,91391.77,366168.42,Florida,166187.94,1


In [18]:
#User Interaction
rdSpend = float(input("Enter R&D Spend: "))
admSpend = float(input("Enter Admin Spend: "))
markSpend = float(input("Enter Marketing Spend: "))
state = int(input("Enter State: 0 cal 1 flor 2 new york"))

#My model can understand only California, Florida and NY 

if state in labelencoder_X.classes_:
   #Lets create feature variable
    featureArray = np.array([[rdSpend,admSpend,markSpend,state]])

    #Lets do feature processing to make the input compatible for the model
    # I will need StateLabelEncoder object used during training
    # I will need StateOHE object used during training

    features[:,3] = labelencoder_X.fit_transform(features[:,3])
    
    
    #Give featureArray to model
    profit = model.predict(featureArray)
    print("Estimated profit as calculated by Siri is {}".format(profit))

else:
    print("Siri says she don't know about {} state".format(state))




Enter R&D Spend: 25
Enter Admin Spend: 65
Enter Marketing Spend: 6
Enter State: New York
Siri says she don't know about New York state


In [27]:
#Which Objects needs to be deployed
# model
# stateLabelEncoder
# stateOHE
import pickle
pickle.dump(model,open('ProfitPredictor.model','wb'))
pickle.dump(stateLabelEncoder,open('StateEncoder.encoder','wb'))
pickle.dump(stateOHE,open('StateDummyVariableCreator.dummy','wb'))

In [28]:
#Equation of line
model.intercept_

array([51961.88125104])

In [29]:
model.coef_

array([[-7.32935355e+02,  1.03082545e+03, -2.97890092e+02,
         7.84305020e-01, -3.24611810e-02,  2.45598550e-02]])

In [93]:
#profit = -7.32935355e+02 (California) +  1.03082545e+03 (Florida) - 2.97890092e+02 (New York) 
# + 7.84305020e-01 (R&D Spend) - 3.24611810e-02 (Admin Spend) + 2.45598550e-02 (Market )

array([[1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.]])