In [551]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [552]:
# Load the four numeric columns, express them in units of 10,000 and round to whole numbers.
# NOTE(review): absolute local path - adjust to wherever 50_Startups.csv lives on your machine.
startup_cols = ['R&D Spend','Administration','Marketing Spend','Profit']
startups_raw = pd.read_csv(r"E:\Jupyter Notebooks\50_Startups.csv")
df = np.round(startups_raw[startup_cols]/10000)

In [553]:
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
Profit             0
dtype: int64

In [554]:
# Seed NumPy's global RNG so the 5 sampled rows are the same on every fresh run.
np.random.seed(9) # so that whenever we run this cell the sample values come out to be the same
# NOTE: df is overwritten with its own 5-row sample, so re-running this cell without
# first re-running the load cell samples from these 5 rows instead of the full dataset.
df = df.sample(5)
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
21,8.0,15.0,30.0,11.0
37,4.0,5.0,20.0,9.0
2,15.0,10.0,41.0,19.0
14,12.0,16.0,26.0,13.0
44,2.0,15.0,3.0,7.0


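Note on views vs. copies: when you extract a subset of a DataFrame with .iloc (or .loc and similar indexing methods) without calling .copy(), pandas may return either a view of the original data or a copy, and it does not guarantee which. If it is a view, modifications made to the subset are reflected in the original DataFrame; if it is a copy, pandas may emit a SettingWithCopyWarning instead. Calling .copy() explicitly (as done below) always gives an independent DataFrame, so changes to it never affect the original.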

In [555]:
# for iterative imputation as well we only keep the x data columns not the output column
# (every column except the last one). .copy() makes x independent of df, so the NaNs
# inserted into x in the following cells do not change df.
x = df.iloc[:,0:-1].copy()
# show one randomly chosen row of x as a quick sanity check
x.sample()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0


In [556]:
# for understanding iterative imputer we'll put missing values

In [557]:
# deliberately inserting missing values in the data (one per column, by position)
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0
x.iloc[1,0] = np.nan
x.iloc[3,1] = np.nan
x.iloc[-1,-1] = np.nan

In [558]:
x.isnull().sum()

R&D Spend          1
Administration     1
Marketing Spend    1
dtype: int64

In [559]:
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
21,8.0,15.0,30.0,11.0
37,4.0,5.0,20.0,9.0
2,15.0,10.0,41.0,19.0
14,12.0,16.0,26.0,13.0
44,2.0,15.0,3.0,7.0


In [560]:
x

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,


In [561]:
# Step 1 - Impute all missing values with mean of respective col
# x.mean() yields one mean per column (NaNs skipped); fillna aligns those means
# by column name, so every column is filled in a single call.
df0 = x.fillna(x.mean())

In [562]:
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,9.25,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


Step 2 - set the first column's mean-imputed value back to NaN, then fit linear regression (or any other model) on the remaining rows, using the other columns as features, to predict the missing value of this column.

In [563]:
# Remove the col1 imputed value
# Work on a copy so df0 keeps the pure mean-imputed frame.
df1 = df0.copy()

# np.nan instead of the np.NaN alias, which was removed in NumPy 2.0
df1.iloc[1,0] = np.nan

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [564]:
df1.iloc[[0,2,3,4],1:] 

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,11.25,26.0
44,15.0,29.25


In [565]:
# Positions of the rows whose 'R&D Spend' is known; position 1 is the row being re-imputed.
known_rows = [0,2,3,4]
target_row = 1

# Features are every column after the first; the target is the first column.
xtrain1 = df1.iloc[known_rows,1:]
xtest1 = df1.iloc[target_row,1:]
ytrain1 = df.iloc[known_rows,0]

In [566]:
xtest1.values.reshape(1,-1)

array([[ 5., 20.]])

In [567]:
xtest1 # it upon prediction generates below error so we reshape it

Administration      5.0
Marketing Spend    20.0
Name: 37, dtype: float64

ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.

UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature nam

In [568]:
lr = LinearRegression()
lr.fit(xtrain1,ytrain1)
# Predict on a one-row DataFrame (Series -> frame -> transpose) so the column names match
# the ones seen during fit; this avoids the "X does not have valid feature names" warning.
# predict returns a length-1 array, so take [0] to store a scalar in the cell.
df1.iloc[1,0] = lr.predict(xtest1.to_frame().T)[0]



In [569]:
df1.iloc[1,1:].values.reshape(1,2)

array([[ 5., 20.]])

In [570]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.141587,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [571]:
# now we repeat the same steps for the remaining columns missing values


In [572]:
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
Profit             0
dtype: int64

In [573]:
df1[df1.isnull().any(axis = 1)]#.index

Unnamed: 0,R&D Spend,Administration,Marketing Spend


When you do df1[df1.isnull() == True], pandas does not filter rows. df1.isnull() is a boolean DataFrame of the same shape as df1, and indexing with it behaves like df1.where(mask): the result keeps the full shape and every cell where the mask is False becomes NaN. To filter rows, pandas needs a boolean Series with a single True/False value per row, for example df1.isnull().any(axis=1).

In [574]:
df1.isnull().any(axis = 1) # this can be used to filter rows from the dataframe

21    False
37    False
2     False
14    False
44    False
dtype: bool

In [575]:
df1[df1.isnull().any(axis = 1)].index#.values

Int64Index([], dtype='int64')

In [576]:
df1[df1.isnull().any(axis = 1)].index.values#[0]

array([], dtype=int64)

In [577]:
df1.isnull()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,False,False,False
37,False,False,False
2,False,False,False
14,False,False,False
44,False,False,False


In [578]:
#df1.iloc[1,0] = np.nan

In [579]:
x 

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,


x is the df with null values put deliberately

df0 is the mean imputed x

df1 is the df in which we re-impute only the first col's null value with the predicted value from the remaining rows.

In [580]:
x[x.isnull().any(axis=1)].index

Int64Index([37, 14, 44], dtype='int64')

In [581]:
x[x.isnull().any(axis=1)].columns

Index(['R&D Spend', 'Administration', 'Marketing Spend'], dtype='object')

In [582]:
# Row labels of x that contain at least one NaN.
lst = x[x.isnull().any(axis=1)].index
# NOTE: filtering rows keeps every column, so cols is simply all of x's columns,
# not only the columns that actually hold a NaN.
cols = x[x.isnull().any(axis=1)].columns
# (lst[0], cols[0]) is a NaN cell here only because of where the NaNs were inserted above.
x.loc[lst[0],cols[0]]

nan

In [583]:
df1['R&D Spend'][df1['R&D Spend'].isnull()]

Series([], Name: R&D Spend, dtype: float64)

In [584]:
df1.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend'], dtype='object')

In [585]:
#for col in df1.columns:
#    df1[col] = df1[col].fillna(df1[col].mean())

In [586]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.141587,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [587]:
df1.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
count,5.0,5.0,5.0
mean,12.028317,11.25,29.25
std,7.891941,4.145781,7.660777
min,2.0,5.0,20.0
25%,8.0,10.0,26.0
50%,12.0,11.25,29.25
75%,15.0,15.0,30.0
max,23.141587,15.0,41.0


so to the function below we will give x

then we'll store the indices of all null values

then we'll impute it with mean

then for each index :

    reinitialize them to null one by one and perform linear regression on it
    to impute the missing value
    
and in a separate cell we'll call this function n times !

In [588]:
x.dropna()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
2,15.0,10.0,41.0


In [589]:
x[x.isnull().any(axis=1)].index

Int64Index([37, 14, 44], dtype='int64')

In [590]:
x.loc[x[x.isnull().any(axis=1)].index[0],].dropna().values.reshape(1,2)

array([[ 5., 20.]])

In [591]:
x['Administration'].dropna()

21    15.0
37     5.0
2     10.0
44    15.0
Name: Administration, dtype: float64

In [592]:
# this function will take initial df to fill with mean values
# then performs 10 iterations on the dataset 
# and returns the final dataset

# assumption for my below algo - only 1 cell will be nan in any col

# in each iteration :
# identify places where there are null values
# step1 - fill with mean
# step2 - remove the mean-imputed value from each col one by one, starting from the left - again a loop is needed
# step3 - make a prediction for the col whose mean-imputed value was removed

'''
def iteration(originalDf):
    nullindexes = originalDf[originalDf.isnull().any(axis=1)].index
    nullcols = originalDf[originalDf.isnull().any(axis=1)].columns
    # impute with mean if it already has null values 
    # this would mean it is the first step
    for col in originalDf.columns:
        originalDf[col] = originalDf[col].fillna(originalDf[col].mean())
    # remove mean imputed val from left to right and keep performing LR
    for indx,col in zip(nullindexes,nullcols):
        originalDf.loc[indx,col] = np.nan
        
        # now predict this val with LR model
        #temp = originalDf.dropna()
        xtrain = originalDf.dropna().drop(columns = col)
        ytrain = originalDf[col].dropna()
        xtest = originalDf.loc[indx,].dropna().values.reshape(1,2)
        
        # debugging
        #print(xtrain.shape)
        #print(ytrain.shape)
        #print(xtest.shape)
        #print(xtest)
        
        lr = LinearRegression()
        lr.fit(xtrain,ytrain)
        originalDf.loc[indx,col] = lr.predict(xtest)
        
    return originalDf
    
'''

'\ndef iteration(originalDf):\n    nullindexes = originalDf[originalDf.isnull().any(axis=1)].index\n    nullcols = originalDf[originalDf.isnull().any(axis=1)].columns\n    # impute with mean if it already has null values \n    # this would mean it is the first step\n    for col in originalDf.columns:\n        originalDf[col] = originalDf[col].fillna(originalDf[col].mean())\n    # remove mean imputed val from left to right and keep performing LR\n    for indx,col in zip(nullindexes,nullcols):\n        originalDf.loc[indx,col] = np.nan\n        \n        # now predict this val with LR model\n        #temp = originalDf.dropna()\n        xtrain = originalDf.dropna().drop(columns = col)\n        ytrain = originalDf[col].dropna()\n        xtest = originalDf.loc[indx,].dropna().values.reshape(1,2)\n        \n        # debugging\n        #print(xtrain.shape)\n        #print(ytrain.shape)\n        #print(xtest.shape)\n        #print(xtest)\n        \n        lr = LinearRegression()\n        lr.

In [593]:
# give this function the df with null values and the numOfIterations to impute iteratively

def iteration(originalDf, numOfIterations):
    """Iteratively impute the missing values of a numeric DataFrame.

    Every missing cell is first filled with its column mean. Then, for
    ``numOfIterations`` rounds, each originally-missing cell is reset to NaN
    (columns visited left to right) and re-predicted by a LinearRegression
    fitted on all other rows, using the remaining columns as features.

    Parameters
    ----------
    originalDf : pandas.DataFrame
        Numeric frame containing NaNs. It is modified in place.
    numOfIterations : int
        Number of re-imputation rounds; 0 leaves the mean-imputed values.

    Returns
    -------
    pandas.DataFrame
        The same ``originalDf`` object, with its missing cells imputed.
    """
    # Record the exact (row label, column) of every missing cell BEFORE filling.
    # Pairing "rows that contain a NaN" with "all columns" positionally is only
    # correct when the NaNs happen to sit on a diagonal, so look them up per column.
    nullmask = originalDf.isnull()
    nullcells = [
        (indx, col)
        for col in originalDf.columns
        for indx in originalDf.index[nullmask[col]]
    ]

    # impute with mean if it already has null values
    # this would mean it is the first step
    for col in originalDf.columns:
        originalDf[col] = originalDf[col].fillna(originalDf[col].mean())

    # remove mean imputed val from left to right and keep performing LR
    for _ in range(numOfIterations):
        for indx, col in nullcells:
            originalDf.loc[indx, col] = np.nan

            # Only the row being re-imputed contains a NaN now, so dropna()
            # leaves exactly the rows available for training.
            trainrows = originalDf.dropna()
            xtrain = trainrows.drop(columns=col)
            ytrain = trainrows[col]

            # One-row DataFrame with the same feature columns as xtrain: works for
            # any number of columns and keeps the feature names seen during fit.
            xtest = originalDf.loc[[indx]].drop(columns=col)

            lr = LinearRegression()
            lr.fit(xtrain, ytrain)
            # predict returns a length-1 array; store the scalar in the cell
            originalDf.loc[indx, col] = lr.predict(xtest)[0]

    return originalDf

In [594]:
# Impute x with 10 regression passes; iteration() fills x in place and returns it.
x = iteration(x,10)



In [595]:
x.dropna(axis = 0).drop(columns = 'R&D Spend')

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
37,5.0,20.0
2,10.0,41.0
14,13.022368,26.0
44,15.0,70.692067


In [596]:
x.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
count,5.0,5.0,5.0
mean,12.743673,11.604474,37.538413
std,9.204143,4.220872,20.054338
min,2.0,5.0,20.0
25%,8.0,10.0,26.0
50%,12.0,13.022368,30.0
75%,15.0,15.0,41.0
max,26.718364,15.0,70.692067


In [597]:
x 

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,26.718364,5.0,20.0
2,15.0,10.0,41.0
14,12.0,13.022368,26.0
44,2.0,15.0,70.692067


In [599]:
df # the above imputation in x does not match with the original values and infact has very large diff

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
21,8.0,15.0,30.0,11.0
37,4.0,5.0,20.0,9.0
2,15.0,10.0,41.0,19.0
14,12.0,16.0,26.0,13.0
44,2.0,15.0,3.0,7.0
