## **Data Exploration**

In [1]:
import pandas as pd

In [2]:
from google.colab import files
uploaded = files.upload()

Saving 50_Startups.csv to 50_Startups.csv


In [3]:
data=pd.read_csv('/content/50_Startups.csv')
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
data.shape

(50, 5)

In [5]:
data.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [6]:
data.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [8]:
data.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


## **Data Preparation**

In [9]:
data.isnull().sum()   #to check for any null/missing values.

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [10]:
data.isnull().sum().sum()

0

In [11]:
data['State'].nunique()

3

In [12]:
data['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [13]:
# converting State column which is object datatype to int data type.
columns=['State']
data1=data[columns]
dummies=pd.get_dummies(data1,columns=['State'])
dummies

Unnamed: 0,State_California,State_Florida,State_New York
0,False,False,True
1,True,False,False
2,False,True,False
3,False,False,True
4,False,True,False
5,False,False,True
6,True,False,False
7,False,True,False
8,False,False,True
9,True,False,False


In [14]:
mergeddata= pd.concat([data,dummies],axis='columns')
mergeddata

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,New York,192261.83,False,False,True
1,162597.7,151377.59,443898.53,California,191792.06,True,False,False
2,153441.51,101145.55,407934.54,Florida,191050.39,False,True,False
3,144372.41,118671.85,383199.62,New York,182901.99,False,False,True
4,142107.34,91391.77,366168.42,Florida,166187.94,False,True,False
5,131876.9,99814.71,362861.36,New York,156991.12,False,False,True
6,134615.46,147198.87,127716.82,California,156122.51,True,False,False
7,130298.13,145530.06,323876.68,Florida,155752.6,False,True,False
8,120542.52,148718.95,311613.29,New York,152211.77,False,False,True
9,123334.88,108679.17,304981.62,California,149759.96,True,False,False


In [15]:
newdata=mergeddata.drop(['State'],axis='columns')
newdata

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,False,False,True
1,162597.7,151377.59,443898.53,191792.06,True,False,False
2,153441.51,101145.55,407934.54,191050.39,False,True,False
3,144372.41,118671.85,383199.62,182901.99,False,False,True
4,142107.34,91391.77,366168.42,166187.94,False,True,False
5,131876.9,99814.71,362861.36,156991.12,False,False,True
6,134615.46,147198.87,127716.82,156122.51,True,False,False
7,130298.13,145530.06,323876.68,155752.6,False,True,False
8,120542.52,148718.95,311613.29,152211.77,False,False,True
9,123334.88,108679.17,304981.62,149759.96,True,False,False


In [16]:
#converting data into int datatype to avoid errors below.
prepareddata=newdata.astype(int)
prepareddata.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349,136897,471784,192261,0,0,1
1,162597,151377,443898,191792,1,0,0
2,153441,101145,407934,191050,0,1,0
3,144372,118671,383199,182901,0,0,1
4,142107,91391,366168,166187,0,1,0


In [17]:
prepareddata.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'Profit',
       'State_California', 'State_Florida', 'State_New York'],
      dtype='object')

In [18]:
prepareddata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   R&D Spend         50 non-null     int64
 1   Administration    50 non-null     int64
 2   Marketing Spend   50 non-null     int64
 3   Profit            50 non-null     int64
 4   State_California  50 non-null     int64
 5   State_Florida     50 non-null     int64
 6   State_New York    50 non-null     int64
dtypes: int64(7)
memory usage: 2.9 KB


## **Training Our Data**



In [19]:
# Import train_test_split from sklearn.model_selection
from sklearn.model_selection import train_test_split
# Here, X is the data which will have features and y will have our target.
x=prepareddata[['R&D Spend', 'Administration', 'Marketing Spend','State_California', 'State_Florida', 'State_New York']]
y=prepareddata['Profit']

In [20]:
# Split data into training data and testing data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
#Ratio used for splitting training and testing data is 8:2 respectively

## **Model Creation**

### Linear Regression

In [21]:
# Importing linear regression model
from sklearn.linear_model import LinearRegression
reg1 = LinearRegression()

In [22]:
# Fitting data into the model.
reg1.fit(x_train, y_train)

In [23]:
# Making predictions
pred1 = reg1.predict(x_test)

In [24]:
pred1

array([159195.6284126 , 115706.51720952, 182093.71640652,  70618.43633948,
       172584.55813857, 101387.38767092, 109548.90385305,  52114.54122492,
        98272.21933333,  98310.48848852])

### Lasso Regression

In [25]:
# Importing model
from sklearn.linear_model import Lasso
reg2 = Lasso()

In [26]:
# Fitting data into the model.
reg2.fit(x_train, y_train)

In [27]:
# Making predictions
pred2 = reg2.predict(x_test)

In [28]:
pred2

array([159198.13141959, 115709.63673876, 182090.45890379,  70617.80682652,
       172581.0450208 , 101383.81521362, 109545.44331583,  52112.97716738,
        98271.87216949,  98306.62032563])

### Ridge Regression

In [29]:
# Importing model
from sklearn.linear_model import Ridge
reg3 = Ridge()

In [30]:
# Fitting data into the model.
reg3.fit(x_train, y_train)

In [31]:
# Making predictions
pred3= reg3.predict(x_test)


In [32]:
pred3

array([159216.67544601, 115727.74572372, 182080.77215251,  70604.85079995,
       172569.79683738, 101372.24170473, 109532.3666592 ,  52095.55836263,
        98263.4935844 ,  98291.5759707 ])

## **Performance Check**

In [33]:
import numpy as np
from sklearn.metrics import mean_squared_error
print("Model\t\t\t RootMeanSquareError \t\t Accuracy of the model")
print("""Linear Regression \t\t {:.4f} \t \t\t {:.4f}""".format(  np.sqrt(mean_squared_error(y_test, pred1)), reg1.score(x_train,y_train)))
print("""Lasso Regression \t\t {:.4f} \t \t\t {:.4f}""".format(  np.sqrt(mean_squared_error(y_test, pred2)), reg2.score(x_train,y_train)))
print("""Ridge Regression \t\t {:.4f} \t \t\t {:.4f}""".format(  np.sqrt(mean_squared_error(y_test, pred3)), reg3.score(x_train,y_train)))

Model			 RootMeanSquareError 		 Accuracy of the model
Linear Regression 		 7269.0089 	 		 0.9437
Lasso Regression 		 7268.7957 	 		 0.9437
Ridge Regression 		 7266.8101 	 		 0.9437


#### **Conclusion**
* All 3 regression algorithms used in this project are equally efficient for the given dataset.
* RMSE for Ridge Regression is least.