In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

In [2]:
df = np.round(pd.read_csv('50_startups.csv')[['R&D Spend','Administration','Marketing Spend','Profit']]/10000)
np.random.seed(9)
df = df.sample(5)
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
21,8.0,15.0,30.0,11.0
37,4.0,5.0,20.0,9.0
2,15.0,10.0,41.0,19.0
14,12.0,16.0,26.0,13.0
44,2.0,15.0,3.0,7.0


In [3]:
df = df.iloc[:,0:-1]
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,4.0,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


In [4]:
df.iloc[1,0] = np.NaN
df.iloc[3,1] = np.NaN
df.iloc[-1,-1] = np.NaN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[1,0] = np.NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[3,1] = np.NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[-1,-1] = np.NaN


In [5]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,


In [6]:
# Step 1 = impute all missing value with respective col mean
df0 = pd.DataFrame()
df0['R&D Spend'] = df['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Administration'] = df['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend'] = df['Marketing Spend'].fillna(df['Marketing Spend'].mean())
df0.head()


Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,9.25,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [7]:
# 0th iteration
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,9.25,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [8]:
# Remove col1 imputed value

df1 = df0.copy()
df1.iloc[1,0] = np.NaN

df1.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [9]:
# take first 3 rows to build model and use last for prediction

x = df1.iloc[[0,2,3,4],1:3]
x

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,11.25,26.0
44,15.0,29.25


In [10]:
y = df1.iloc[[0,2,3,4],0]
y

21     8.0
2     15.0
14    12.0
44     2.0
Name: R&D Spend, dtype: float64

In [11]:
lr = LinearRegression()
lr.fit(x,y)
lr.predict(df1.iloc[1,1:].values.reshape(1,2))



array([23.14158651])

In [12]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [15]:
df1.iloc[1,0] = 23.14

In [16]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,23.14,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [17]:
# remove the col2 imputed value
df1.iloc[3,1] = np.NaN
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,23.14,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,29.25


In [18]:
# use last three cols for model to predict the value
X = df1.iloc[[0,1,2,4],[0,2]]
X

Unnamed: 0,R&D Spend,Marketing Spend
21,8.0,30.0
37,23.14,20.0
2,15.0,41.0
44,2.0,29.25


In [19]:
y = df1.iloc[[0,1,2,4],1]
y

21    23.14
37     5.00
2     10.00
44    15.00
Name: Administration, dtype: float64

In [20]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[3,[0,2]].values.reshape(1,2))



array([13.30231379])

In [21]:
df1.iloc[3,1] = 13.3
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,23.14,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,13.3,26.0
44,2.0,15.0,29.25


In [22]:
# remove the col3 imputed value
df1.iloc[-1,-1] = np.NaN
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,23.14,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,13.3,26.0
44,2.0,15.0,


In [23]:
# use last 3 rows for build the model and last one for prediction
X = df1.iloc[[0,1,2,3],[0,1]]
X

Unnamed: 0,R&D Spend,Administration
21,8.0,23.14
37,23.14,5.0
2,15.0,10.0
14,12.0,13.3


In [24]:
y = df1.iloc[0:4,-1]
y

21    30.0
37    20.0
2     41.0
14    26.0
Name: Marketing Spend, dtype: float64

In [25]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[4,0:2].values.reshape(1,2))



array([53.45837172])

In [26]:
df1.iloc[4,-1] = 53.45

In [28]:
# After first iteration
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,23.14,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,13.3,26.0
44,2.0,15.0,53.45


In [29]:
# substract 0th iteration from 1st iteration
df1 - df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,8.14,0.0
37,13.89,0.0,0.0
2,0.0,0.0,0.0
14,0.0,2.05,0.0
44,0.0,0.0,24.2


In [30]:
df2 = df1.copy()
df2.iloc[1,0] = np.NaN

df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,23.14,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,13.3,26.0
44,2.0,15.0,53.45


In [31]:
X = df2.iloc[[0,2,3,4],1:3]
y = df2.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[1,1:].values.reshape(1,2))



array([22.47822671])

In [32]:
df2.iloc[1,0] = 22.47

In [33]:
df2.iloc[3,1] = np.NaN
X = df2.iloc[[0,1,2,4],[0,2]]
y = df2.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[3,[0,2]].values.reshape(1,2))



array([17.98971332])

In [34]:
df2.iloc[3,1] = 17.98

In [35]:
df2.iloc[4,-1] = np.NaN

X = df2.iloc[0:4,0:2]
y = df2.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[4,0:2].values.reshape(1,2))



array([91.82054014])

In [36]:
df2.iloc[4,-1] = 91.82

In [37]:
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,23.14,30.0
37,22.47,5.0,20.0
2,15.0,10.0,41.0
14,12.0,17.98,26.0
44,2.0,15.0,91.82


In [38]:
df2 - df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,-0.67,0.0,0.0
2,0.0,0.0,0.0
14,0.0,4.68,0.0
44,0.0,0.0,38.37


In [39]:
df3 = df2.copy()

df3.iloc[1,0] = np.NaN

df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,23.14,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,17.98,26.0
44,2.0,15.0,91.82


In [40]:
X = df3.iloc[[0,2,3,4],1:3]
y = df3.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[1,1:].values.reshape(1,2))



array([22.26030526])

In [42]:
df3.iloc[1,0] = 22.26

In [43]:
df3.iloc[3,1] = np.NaN
X = df3.iloc[[0,1,2,4],[0,2]]
y = df3.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[3,[0,2]].values.reshape(1,2))



array([18.39908154])

In [44]:
df3.iloc[3,1] = 18.39

In [45]:
df3.iloc[4,-1] = np.NaN

X = df3.iloc[0:4,0:2]
y = df3.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[4,0:2].values.reshape(1,2))



array([93.39005577])

In [46]:
df3.iloc[4,-1] = 93.39


In [47]:
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,23.14,30.0
37,22.47,5.0,20.0
2,15.0,10.0,41.0
14,12.0,17.98,26.0
44,2.0,15.0,91.82


In [48]:
df3 - df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,-0.21,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.41,0.0
44,0.0,0.0,1.57
