In [28]:

# There are various ways to deal with missing data points.
# You can simply drop records if they contain any nulls.
# data.dropna()
# You can fill nulls with zeros
# data.fillna(0)
# You can also fill with mean, median, or do a forward-fill or back-fill.
# The problem with all of these options is that if you have a lot of missing values for one specific feature, you won't be able to do very reliable predictive analytics.
# A viable alternative is to impute missing values using some machine learning techniques (regression or classification).
from pathlib import Path

import pandas as pd
from sklearn.linear_model import LinearRegression
# One shared estimator instance; a later cell fits it on the rows with a known age
# and then uses it to predict an age for every row.
linreg = LinearRegression()


In [29]:

# Load the Titanic CSV from the 'seaborn-data' folder in the current user's home
# directory. Building the path from Path.home() resolves to the same location as the
# previously hard-coded path on the original machine, and keeps the notebook runnable
# for other users and operating systems.
titanic_csv = Path.home() / 'seaborn-data' / 'titanic.csv'
data = pd.read_csv(titanic_csv)

# Quick look at the records (pandas truncates long frames when printing) ...
print(data)
# ... and at the column types, shown as the cell's output.
data.dtypes


     survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
0           0       3    male  22.0      1      0   7.2500        S   Third   
1           1       1  female  38.0      1      0  71.2833        C   First   
2           1       3  female  26.0      0      0   7.9250        S   Third   
3           1       1  female  35.0      1      0  53.1000        S   First   
4           0       3    male  35.0      0      0   8.0500        S   Third   
..        ...     ...     ...   ...    ...    ...      ...      ...     ...   
886         0       2    male  27.0      0      0  13.0000        S  Second   
887         1       1  female  19.0      0      0  30.0000        S   First   
888         0       3  female   NaN      1      2  23.4500        S   Third   
889         1       1    male  26.0      0      0  30.0000        C   First   
890         0       3    male  32.0      0      0   7.7500        Q   Third   

       who  adult_male deck  embark_town alive  alo

survived         int64
pclass           int64
sex             object
age            float64
sibsp            int64
parch            int64
fare           float64
embarked        object
class           object
who             object
adult_male        bool
deck            object
embark_town     object
alive           object
alone             bool
dtype: object

In [30]:

# Now, we will use a simple regression technique to predict the missing values
# Predictor columns and the column to be imputed, referenced by name rather than by
# position so that reordering the selection cannot silently change the model.
feature_columns = ['survived', 'pclass', 'sibsp', 'parch', 'fare']
target_column = 'age'

# Take an explicit copy: without it this frame is a slice of `data`, and writing the
# imputed ages into it later triggers SettingWithCopyWarning and may not stick.
data_with_null = data[feature_columns + [target_column]].copy()

# Train only on the rows in which every selected column (including age) is present.
data_without_null = data_with_null.dropna()
train_data_x = data_without_null[feature_columns]
train_data_y = data_without_null[target_column]

linreg.fit(train_data_x, train_data_y)

# Predict an age for every row (rows with a known age included); the result has one
# prediction per row of data_with_null, in the same row order.
test_data = data_with_null[feature_columns]
age = pd.DataFrame(linreg.predict(test_data))

# check for nulls
data_with_null.isnull().sum()


survived      0
pclass        0
sibsp         0
parch         0
fare          0
age         177
dtype: int64

In [31]:

# Number of missing entries in each column
print(data_with_null.isna().sum(axis=0))


survived      0
pclass        0
sibsp         0
parch         0
fare          0
age         177
dtype: int64


In [34]:

# Total number of missing cells across the whole frame
print(data_with_null.isna().to_numpy().sum())


177


In [36]:

# Predict an age for each row of test_data, collect the predictions in a plain list
# and print them all.
age = [predicted for predicted in linreg.predict(test_data)]
print(age)


[25.126882888641795, 33.453537042712476, 22.330077722422416, 33.72473693339533, 29.109740423749276, 29.103650719715034, 44.51951037819127, 16.139107707844957, 20.698227332273277, 26.03660050661579, 17.41241153056678, 38.11551365864848, 29.109740423749276, 20.808557070530767, 29.11266073736648, 30.241252728846426, 12.024254383604422, 30.28599706414577, 24.966549020485814, 22.340518067325597, 36.87363199314628, 30.28599706414577, 22.328523602509687, 37.98202639167211, 16.139107707844957, 14.025352109326043, 29.122045115956595, 27.80207396670547, 22.33076081927465, 29.11204028258366, 44.87957848468999, 32.33138640035102, 22.33268780864821, 37.10481105885955, 40.07267944115294, 40.52267023830275, 22.340455425256177, 29.109740423749276, 20.971759662631847, 18.28582051893933, 25.093697506628114, 32.95341652745789, 29.11204028258366, 24.280955963993957, 22.33076081927465, 29.109740423749276, 25.0038359665686, 22.33268780864821, 20.916885209820727, 24.969531976172437, 11.866717036404651, 29.11

In [None]:

# Finally, we will join our predicted values back into the 'data_with_null' dataframe.
# Only the rows whose age is missing receive a prediction; observed ages are kept
# unchanged (assigning the full prediction list would overwrite every real age).
# `age` holds one prediction per row in row order; depending on which cell ran last it
# is either a list or a one-column DataFrame, so normalise it to a Series on our index.
predicted_age = pd.Series(
    pd.DataFrame(age).iloc[:, 0].to_numpy(),
    index=data_with_null.index,
)
# assign() returns a new frame, so nothing is written into a slice of the original
# data, and re-running this cell is harmless (there is nothing left to fill).
data_with_null = data_with_null.assign(age=data_with_null['age'].fillna(predicted_age))


In [40]:

# check for nulls: per-column count of missing entries, computed with the vectorised
# isnull().sum() instead of a Python-level sum applied column by column.
data_with_null.isnull().sum()


survived    0
pclass      0
sibsp       0
parch       0
fare        0
age         0
dtype: int64