In [80]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

In [81]:
df = np.round(pd.read_csv('50_Startups.csv')[['R&D Spend','Administration','Marketing Spend','Profit']]/10000)
np.random.seed(9)
df = df.sample(5)
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
21,8.0,15.0,30.0,11.0
37,4.0,5.0,20.0,9.0
2,15.0,10.0,41.0,19.0
14,12.0,16.0,26.0,13.0
44,2.0,15.0,3.0,7.0


This code is generally complex to understand that's why dataset with only 5 rows is taken!


In [82]:
df = df.iloc[:,0:-1]
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,4.0,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


##Removed Output Column


In [83]:
df.iloc[1,0] = np.nan
df.iloc[3,1] = np.nan
df.iloc[-1,-1] = np.nan

Randomly 3 values one from each column is removed to do the imputation and check the acuracy of our work!

In [84]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,


##Step1: Impute all missing Values using mean filling simple imputer

In [85]:
df0 = pd.DataFrame()

df0['R&D Spend'] = df['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Administration'] = df['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend'] = df['Marketing Spend'].fillna(df['Marketing Spend'].mean())

###0th **Iteration**

In [86]:
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,9.25,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


##Step2: Remove the col1 imputed value

In [87]:
df1 = df0.copy()
df1.iloc[1,0] = np.nan
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


Use first 3 rows to build a model and use the last for Prediction


In [88]:
X = df1.iloc[[0,2,3,4],1:3]# these becomes input features
X

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,11.25,26.0
44,15.0,29.25


In [89]:
y = df1.iloc[[0,2,3,4],0] # this becomes output column
y

Unnamed: 0,R&D Spend
21,8.0
2,15.0
14,12.0
44,2.0


In [90]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict (df1.iloc[1,1:].values.reshape(1,2))

array([23.14158651])

In [91]:
df1.iloc[1,0] = 23.14
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


##Step3: Do the same for all the other 2 values

Remove the col2 imputed value

In [92]:
df1.iloc[3,1] = np.nan
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,29.25


Use last 3 rows to build a model and use the first for prediction

In [93]:
X = df1.iloc[[0,1,2,4],[0,2]]
X

Unnamed: 0,R&D Spend,Marketing Spend
21,8.0,30.0
37,23.14,20.0
2,15.0,41.0
44,2.0,29.25


In [94]:
y = df1.iloc[[0,1,2,4],1]
y

Unnamed: 0,Administration
21,15.0
37,5.0
2,10.0
44,15.0


In [95]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[3,[0,2]].values.reshape(1,2))

array([11.06331285])

In [96]:
df1.iloc[3,1] = 11.06
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,29.25


Remove the col3 imputed value

In [97]:
df1.iloc[4,-1] = np.nan
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,


In [98]:
X = df1.iloc[0:4,0:2]
X

Unnamed: 0,R&D Spend,Administration
21,8.0,15.0
37,23.14,5.0
2,15.0,10.0
14,12.0,11.06


In [99]:
y = df1.iloc[0:4,-1]
y

Unnamed: 0,Marketing Spend
21,30.0
37,20.0
2,41.0
14,26.0


In [100]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[4,0:2].values.reshape(1,2))

array([31.56351448])

In [101]:
df1.iloc[4,-1] = 31.56

###1st Iteration

In [102]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


##Step4: Subtract 0th iteration from 1st iteration

In [103]:
df1 - df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,13.89,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-0.19,0.0
44,0.0,0.0,2.31


Repeat all the steps for n number of iterations until the difference for all 3 values becomes 0 in each column at same time!

In [104]:
df2 = df1.copy()
df2.iloc[1,0] = np.nan
X = df2.iloc[[0,2,3,4],1:3]
y = df2.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[1,1:].values.reshape(1,2))

array([23.78627207])

In [105]:
df2.iloc[1,0] = 23.78

In [106]:
df2.iloc[3,1] = np.nan
X = df2.iloc[[0,1,2,4],[0,2]]
y = df2.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[3,[0,2]].values.reshape(1,2))

array([11.22020174])

In [107]:
df2.iloc[3,1] = 11.22

In [108]:
df2.iloc[4,-1] = np.nan

X = df2.iloc[0:4,0:2]
y = df2.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[4,0:2].values.reshape(1,2))

array([38.87979054])

In [109]:
df2.iloc[4,-1] = 31.56

###2nd **Iteration**

In [110]:
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,31.56


In [111]:
df2 - df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.64,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.16,0.0
44,0.0,0.0,0.0


In [112]:
df3 = df2.copy()
df3.iloc[1,0] = np.nan
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,31.56


In [113]:
df3 = df2.copy()
df3.iloc[1,0] = np.nan
X = df3.iloc[[0,2,3,4],1:3]
y = df3.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[1,1:].values.reshape(1,2))

array([24.57698058])

In [114]:
df3.iloc[1,0] = 24.57

In [115]:
df3.iloc[3,1] = np.nan
X = df3.iloc[[0,1,2,4],[0,2]]
y = df3.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[3,[0,2]].values.reshape(1,2))

array([11.37282844])

In [116]:
df3.iloc[3,1] = 11.37

In [117]:
df3.iloc[4,-1] = np.nan

X = df3.iloc[0:4,0:2]
y = df3.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[4,0:2].values.reshape(1,2))

array([45.53976417])

In [118]:
df3.iloc[4,-1] = 45.53

###3rd **Iteration**

In [119]:
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,24.57,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.37,26.0
44,2.0,15.0,45.53


In [120]:
df3 - df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.79,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.15,0.0
44,0.0,0.0,13.97


In [121]:
df4 = df3.copy()
df4.iloc[1,0] = np.nan
X = df4.iloc[[0,2,3,4],1:3]
y = df4.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df4.iloc[1,1:].values.reshape(1,2))

array([26.98977176])

In [122]:
df4.iloc[1,0] = 26.57

In [123]:
df4.iloc[3,1] = np.nan
X = df4.iloc[[0,1,2,4],[0,2]]
y = df4.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df4.iloc[3,[0,2]].values.reshape(1,2))

array([12.66042893])

In [124]:
df4.iloc[3,1] = 12.66

In [125]:
df4.iloc[4,-1] = np.nan

X = df4.iloc[0:4,0:2]
y = df4.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df4.iloc[4,0:2].values.reshape(1,2))

array([72.0857364])

In [138]:
df4.iloc[4,-1] = 72.08

###4th **Iteration**

In [127]:
df4

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,26.57,5.0,20.0
2,15.0,10.0,41.0
14,12.0,12.66,26.0
44,2.0,15.0,72.08


In [128]:
df4 - df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,2.0,0.0,0.0
2,0.0,0.0,0.0
14,0.0,1.29,0.0
44,0.0,0.0,26.55


In [129]:
df5 = df4.copy()
df5.iloc[1,0] = np.nan
X = df5.iloc[[0,2,3,4],1:3]
y = df5.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df5.iloc[1,1:].values.reshape(1,2))

array([26.27005328])

In [130]:
df5.iloc[1,0] = 26.27

In [131]:
df5.iloc[3,1] = np.nan
X = df5.iloc[[0,1,2,4],[0,2]]
y = df5.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df5.iloc[3,[0,2]].values.reshape(1,2))

array([12.97478242])

In [132]:
df5.iloc[3,1] = 12.97

In [134]:
df5.iloc[4,-1] = np.nan

X = df5.iloc[0:4,0:2]
y = df5.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df5.iloc[4,0:2].values.reshape(1,2))

array([74.83633165])

In [135]:
df5.iloc[4,-1] = 74.83

In [136]:
df5

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,26.27,5.0,20.0
2,15.0,10.0,41.0
14,12.0,12.97,26.0
44,2.0,15.0,74.83


In [139]:
df5 - df4

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,-0.3,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.31,0.0
44,0.0,0.0,2.75


And so on we get close to Zero