In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the 50-startups dataset (the /content prefix suggests a Colab session;
# adjust the path when running elsewhere).
csv_path = "/content/50_Startups.csv"
data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows of the freshly loaded CSV.
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Keep only the numeric columns (this removes the categorical 'State' column).
# Selecting by dtype instead of dropping by name keeps the cell re-runnable:
# a second execution is a no-op rather than a KeyError on the missing column.
data = data.select_dtypes(include="number")
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
0,165349.2,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [5]:
# Express every amount in units of 10,000 and round to whole numbers; the
# rescaling is only there to keep the figures small and easy to follow.
data = data.div(10000).round()

In [6]:
# Check the scaled-and-rounded values.
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
0,17.0,14.0,47.0,19.0
1,16.0,15.0,44.0,19.0
2,15.0,10.0,41.0,19.0
3,14.0,12.0,38.0,18.0
4,14.0,9.0,37.0,17.0


In [7]:
# Fix NumPy's global random seed so that random draws are repeatable across runs.
np.random.seed(9) # repeatable and consistent result

In [8]:
# Draw 5 random rows to keep the walkthrough small.
# The seed is passed directly so this cell yields the same rows regardless of
# what has consumed NumPy's global random state beforehand.
data = data.sample(n=5, random_state=9)

In [9]:
# Remove the target column so only the three spend features remain.
# Dropping by name (rather than "everything but the last column") states the
# intent and, with errors="ignore", makes re-running the cell a no-op instead
# of silently stripping another feature column.
data = data.drop(columns="Profit", errors="ignore")

In [10]:
# Show the sampled rows; only the three spend columns remain.
data

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,4.0,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


In [11]:
### To demonstrate iterative imputation, we first replace a few values with NaN.

In [12]:
# Blank out one cell per column (row position, column name) so that there is
# something to impute.
for row_pos, column in [(1, "R&D Spend"), (3, "Administration"), (4, "Marketing Spend")]:
    data.loc[data.index[row_pos], column] = np.nan

In [13]:
# Show the frame with the three NaN gaps that the imputation has to fill.
data

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,


# **Step 1 - Impute all missing values with mean of respective col**

In [14]:
# Iteration 0 baseline: replace every missing entry with the mean of its own
# column (means are computed over the non-missing values).
data0 = data.fillna(data.mean())

In [15]:
# Iteration 0: every missing value has been replaced by its column mean.
data0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,9.25,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


# **Remove the col1 imputed value**

In [16]:
# Start iteration 1 from an independent copy of the mean-imputed frame, then
# reopen the R&D Spend gap (row position 1) so it can be re-estimated.
data1 = data0.copy()
data1.loc[data1.index[1], "R&D Spend"] = np.nan

In [17]:
# Show data1 with the R&D Spend gap reopened.
data1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


#### **1. Use column 0 (R&D Spend) as the target y and the remaining columns as the features X.**

#### **2. The row containing the NaN value is used as the test data.**

In [18]:
## Train on non-NaN rows and predict on the NaN row.

In [19]:
# Training features: the two other columns, for every row whose R&D Spend is known.
X = data1.loc[data1["R&D Spend"].notna(), ["Administration", "Marketing Spend"]]

In [20]:
# Show the training features.
X

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,11.25,26.0
44,15.0,29.25


In [21]:
# Training target: the known R&D Spend values (the row with the gap is left out).
y = data1["R&D Spend"].dropna()

In [22]:
# Show the training target.
y

Unnamed: 0,R&D Spend
21,8.0
2,15.0
14,12.0
44,2.0


In [23]:
from sklearn.linear_model import LinearRegression

In [24]:
# Fit a linear model for R&D Spend on the rows where it is known, then predict
# the held-out row (position 1).
# The row is passed as a one-row DataFrame (note the list in iloc[[1], ...]) so
# it carries the same column names the model was fitted with; converting it to a
# bare ndarray drops those names and makes scikit-learn warn about it.
LR = LinearRegression()
LR.fit(X, y)
LR.predict(data1.iloc[[1], 1:3])



array([23.14158651])

In [25]:
# Write the regression estimate back into the gap, computed from the fitted
# model instead of retyped from the printed output (which would go stale if the
# data or the sample changed). Truncating to two decimals reproduces the value
# used in this walkthrough (23.14).
data1.iloc[1, 0] = np.trunc(LR.predict(data1.iloc[[1], 1:3])[0] * 100) / 100

In [26]:
# Show data1 after the R&D Spend gap has been filled with the regression estimate.
data1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


# **Remove the col2 imputed value**

In [27]:
# Reopen the Administration gap (row position 3) so it can be re-estimated next.
data1.loc[data1.index[3], "Administration"] = np.nan
data1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,29.25


In [28]:
# Training features: the two other columns, for every row whose Administration is known.
X = data1.loc[data1["Administration"].notna(), ["R&D Spend", "Marketing Spend"]]

In [29]:
# Training target: the known Administration values.
y = data1["Administration"].dropna()

In [30]:
# Show the training target.
y

Unnamed: 0,Administration
21,15.0
37,5.0
2,10.0
44,15.0


In [31]:
# Fit a linear model for Administration on the rows where it is known, then
# predict the held-out row (position 3).
# The row is selected with iloc[[3], ...] so it stays a one-row DataFrame with
# the column names the model was fitted on; a bare ndarray would drop those
# names and make scikit-learn warn about it.
lr = LinearRegression()
lr.fit(X, y)
lr.predict(data1.iloc[[3], [0, 2]])



array([11.06331285])

In [32]:
# Write the regression estimate back into the Administration gap, computed from
# the fitted model instead of retyped from the printed output. Truncating to
# two decimals reproduces the value used in this walkthrough (11.06).
data1.iloc[3, 1] = np.trunc(lr.predict(data1.iloc[[3], [0, 2]])[0] * 100) / 100
data1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,29.25


In [33]:
# Reopen the Marketing Spend gap (row position 4) so it can be re-estimated next.
data1.loc[data1.index[4], "Marketing Spend"] = np.nan

In [34]:
# Show data1 with the Marketing Spend gap reopened.
data1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,


In [35]:
# Training features: the two other columns, for every row whose Marketing Spend is known.
X = data1.loc[data1["Marketing Spend"].notna(), ["R&D Spend", "Administration"]]

In [36]:
# Show the training features.
X

Unnamed: 0,R&D Spend,Administration
21,8.0,15.0
37,23.14,5.0
2,15.0,10.0
14,12.0,11.06


In [37]:
# Training target: the known Marketing Spend values.
y = data1["Marketing Spend"].dropna()

In [38]:
# Show the training target.
y

Unnamed: 0,Marketing Spend
21,30.0
37,20.0
2,41.0
14,26.0


In [39]:
# Fit a linear model for Marketing Spend on the rows where it is known, then
# predict the held-out row (position 4).
# The row is selected with iloc[[4], ...] so it stays a one-row DataFrame with
# the column names the model was fitted on; a bare ndarray would drop those
# names and make scikit-learn warn about it.
lr = LinearRegression()
lr.fit(X, y)
lr.predict(data1.iloc[[4], 0:2])



array([31.56351448])

In [40]:
# Write the regression estimate back into the Marketing Spend gap, computed from
# the fitted model instead of retyped from the printed output. Truncating to
# two decimals reproduces the value used in this walkthrough (31.56).
data1.iloc[4, 2] = np.trunc(lr.predict(data1.iloc[[4], 0:2])[0] * 100) / 100

In [41]:
# Result of the 1st iteration: all three gaps now hold regression estimates.
data1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [42]:
# Change from iteration 0 to iteration 1; only the imputed cells can differ.
data1.sub(data0)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,13.89,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-0.19,0.0
44,0.0,0.0,2.31


In [43]:
# Start iteration 2 from an independent copy of iteration 1's result, then
# reopen the R&D Spend gap (row position 1) so it can be re-estimated.
data2 = data1.copy()
data2.loc[data2.index[1], "R&D Spend"] = np.nan
data2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [44]:
# Training features: the two other columns, for every row whose R&D Spend is known.
X = data2.loc[data2["R&D Spend"].notna(), ["Administration", "Marketing Spend"]]
X

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,11.06,26.0
44,15.0,31.56


In [45]:
# Training target: the known R&D Spend values.
y = data2["R&D Spend"].dropna()
y

Unnamed: 0,R&D Spend
21,8.0
2,15.0
14,12.0
44,2.0


In [46]:
# Second pass for R&D Spend: refit on the rows where it is known and predict
# the held-out row (position 1).
# The row is selected with iloc[[1], ...] so it stays a one-row DataFrame with
# the column names the model was fitted on; a bare ndarray would drop those
# names and make scikit-learn warn about it.
lr = LinearRegression()
lr.fit(X, y)
lr.predict(data2.iloc[[1], 1:])



array([23.78627207])

In [47]:
# Write the regression estimate back into the R&D Spend gap, computed from the
# fitted model instead of retyped from the printed output. Truncating to two
# decimals reproduces the value used in this walkthrough (23.78).
data2.iloc[1, 0] = np.trunc(lr.predict(data2.iloc[[1], 1:])[0] * 100) / 100

In [48]:
# Show data2 after the R&D Spend gap has been re-estimated.
data2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [49]:
# Reopen the Administration gap (row position 3) so it can be re-estimated next.
data2.loc[data2.index[3], "Administration"] = np.nan

In [50]:
# Show data2 with the Administration gap reopened.
data2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,31.56


In [51]:
# Training set for Administration: rows where it is known; the features are the
# two other columns, the target is the known Administration values.
administration_known = data2["Administration"].notna()
X = data2.loc[administration_known, ["R&D Spend", "Marketing Spend"]]
y = data2.loc[administration_known, "Administration"]

In [52]:
# Show the training features.
X

Unnamed: 0,R&D Spend,Marketing Spend
21,8.0,30.0
37,23.78,20.0
2,15.0,41.0
44,2.0,31.56


In [53]:
# Show the training target.
y

Unnamed: 0,Administration
21,15.0
37,5.0
2,10.0
44,15.0


In [54]:
# Second pass for Administration: refit on the rows where it is known and
# predict the held-out row (position 3).
# The row is selected with iloc[[3], ...] so it stays a one-row DataFrame with
# the column names the model was fitted on; a bare ndarray would drop those
# names and make scikit-learn warn about it.
lr = LinearRegression()
lr.fit(X, y)
lr.predict(data2.iloc[[3], [0, 2]])



array([11.22020174])

In [55]:
# Write the regression estimate back into the Administration gap, computed from
# the fitted model instead of retyped from the printed output. Truncating to
# two decimals reproduces the value used in this walkthrough (11.22).
data2.iloc[3, 1] = np.trunc(lr.predict(data2.iloc[[3], [0, 2]])[0] * 100) / 100

In [56]:
# Show data2 after the Administration gap has been re-estimated.
data2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,31.56


In [57]:
# Reopen the Marketing Spend gap (row position 4) so it can be re-estimated next.
data2.loc[data2.index[4], "Marketing Spend"] = np.nan

In [58]:
# Show data2 with the Marketing Spend gap reopened.
data2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,


In [59]:
# Training features: the two other columns, for every row whose Marketing Spend is known.
X = data2.loc[data2["Marketing Spend"].notna(), ["R&D Spend", "Administration"]]
X

Unnamed: 0,R&D Spend,Administration
21,8.0,15.0
37,23.78,5.0
2,15.0,10.0
14,12.0,11.22


In [60]:
# Training target: the known Marketing Spend values.
y = data2["Marketing Spend"].dropna()
y

Unnamed: 0,Marketing Spend
21,30.0
37,20.0
2,41.0
14,26.0


In [61]:
# Second pass for Marketing Spend: refit on the rows where it is known and
# predict the held-out row (position 4).
# The row is selected with iloc[[4], ...] so it stays a one-row DataFrame with
# the column names the model was fitted on; a bare ndarray would drop those
# names and make scikit-learn warn about it.
lr = LinearRegression()
lr.fit(X, y)
lr.predict(data2.iloc[[4], 0:2])



array([38.87979054])

In [62]:
# Write the regression estimate back into the Marketing Spend gap, computed from
# the fitted model instead of retyped from the printed output. Truncating to
# two decimals reproduces the value used in this walkthrough (38.87).
data2.iloc[4, -1] = np.trunc(lr.predict(data2.iloc[[4], 0:2])[0] * 100) / 100

In [63]:
# Result of the 2nd iteration: all three gaps re-estimated.
data2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,38.87


In [64]:
# Change from iteration 1 to iteration 2; only the imputed cells can differ.
data2.sub(data1)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.64,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.16,0.0
44,0.0,0.0,7.31
