# Import Libraries

In [188]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Import Dataset with Divorces Data

In [189]:
# Load the divorce records from the CSV file (relative path).
divorces = pd.read_csv('./../data/divorce.csv')

# Bare last expression so the notebook renders the loaded frame.
divorces

Unnamed: 0,divorce_date,dob_man,education_man,income_man,dob_woman,education_woman,income_woman,marriage_date,marriage_duration,num_kids
0,2006-09-06,1975-12-18,Secondary,2000.0,1983-08-01,Secondary,1800.0,2000-06-26,5.0,1.0
1,2008-01-02,1976-11-17,Professional,6000.0,1977-03-13,Professional,6000.0,2001-09-02,7.0,
2,2011-01-02,1969-04-06,Preparatory,5000.0,1970-02-16,Professional,5000.0,2000-02-02,2.0,2.0
3,2011-01-02,1979-11-13,Secondary,12000.0,1981-05-13,Secondary,12000.0,2006-05-13,2.0,
4,2011-01-02,1982-09-20,Professional,6000.0,1988-01-30,Professional,10000.0,2007-08-06,3.0,
...,...,...,...,...,...,...,...,...,...,...
2204,2006-10-31,1969-11-27,Professional,6000.0,1975-09-01,Professional,14000.0,2002-03-08,4.0,
2205,2006-10-31,1970-09-17,Professional,6000.0,1977-07-22,Professional,6000.0,2002-03-22,4.0,
2206,2011-10-31,1976-03-11,Professional,8000.0,1978-12-28,Professional,7000.0,2009-02-10,2.0,
2207,2012-10-31,1977-10-29,Professional,12500.0,1978-12-01,Professional,6000.0,2008-10-25,3.0,


# Check Missing Data

In [190]:
# Print each column's dtype and non-null count to spot columns with missing values.
divorces.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2209 entries, 0 to 2208
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   divorce_date       2209 non-null   object 
 1   dob_man            2209 non-null   object 
 2   education_man      2205 non-null   object 
 3   income_man         2209 non-null   float64
 4   dob_woman          2209 non-null   object 
 5   education_woman    2209 non-null   object 
 6   income_woman       2209 non-null   float64
 7   marriage_date      2209 non-null   object 
 8   marriage_duration  2209 non-null   float64
 9   num_kids           1333 non-null   float64
dtypes: float64(4), object(6)
memory usage: 172.7+ KB


In [191]:
# Count missing values per column (sum the boolean mask down the rows).
divorces.isna().sum(axis=0)

divorce_date           0
dob_man                0
education_man          4
income_man             0
dob_woman              0
education_woman        0
income_woman           0
marriage_date          0
marriage_duration      0
num_kids             876
dtype: int64

In [192]:
# Frequency of each education level for the husband, keeping NaN as its own bucket.
divorces['education_man'].value_counts(dropna=False)

education_man
Professional    1313
Preparatory      501
Secondary        288
Primary          100
NaN                4
Other              3
Name: count, dtype: int64

In [193]:
# Inspect the rows where the husband's education level is missing.
divorces.loc[divorces['education_man'].isna()]

Unnamed: 0,divorce_date,dob_man,education_man,income_man,dob_woman,education_woman,income_woman,marriage_date,marriage_duration,num_kids
246,2011-04-07,1979-08-04,,3600.0,1976-05-02,Primary,3400.0,1997-07-18,13.0,2.0
635,2008-09-10,1967-02-23,,5000.0,1970-04-02,Secondary,1600.0,1988-03-10,6.0,3.0
1665,2014-06-23,1983-01-01,,1700.0,1989-01-01,Secondary,1700.0,2006-06-09,8.0,1.0
1753,2012-01-25,1978-07-09,,3000.0,1974-03-23,Preparatory,4000.0,2010-10-12,1.0,


### If missing values make up less than 5% of the rows, we can drop those rows without significant consequences

In [194]:
# Share of rows with a missing `education_man` value.
# int(...) keeps the result a plain Python float, as with the len()-based form.
int(divorces['education_man'].isna().sum()) / len(divorces)

0.0018107741059302852

In [195]:
# Is the missing share below the 5% cut-off used to justify dropping those rows?
int(divorces['education_man'].isna().sum()) / len(divorces) < 0.05

True

In [196]:
# Distribution of the number of kids, keeping NaN as its own bucket.
divorces['num_kids'].value_counts(dropna=False)

num_kids
NaN    876
1.0    621
2.0    550
3.0    142
4.0     17
5.0      3
Name: count, dtype: int64

In [197]:
# Inspect the rows where the number of kids is missing.
divorces.loc[divorces['num_kids'].isna()]

Unnamed: 0,divorce_date,dob_man,education_man,income_man,dob_woman,education_woman,income_woman,marriage_date,marriage_duration,num_kids
1,2008-01-02,1976-11-17,Professional,6000.0,1977-03-13,Professional,6000.0,2001-09-02,7.0,
3,2011-01-02,1979-11-13,Secondary,12000.0,1981-05-13,Secondary,12000.0,2006-05-13,2.0,
4,2011-01-02,1982-09-20,Professional,6000.0,1988-01-30,Professional,10000.0,2007-08-06,3.0,
5,2012-01-02,1973-09-07,Professional,20000.0,1974-05-20,Professional,15.0,2000-03-08,11.0,
11,2010-01-03,1987-05-04,Preparatory,2000.0,1986-09-21,Preparatory,2000.0,2005-07-12,4.0,
...,...,...,...,...,...,...,...,...,...,...
2204,2006-10-31,1969-11-27,Professional,6000.0,1975-09-01,Professional,14000.0,2002-03-08,4.0,
2205,2006-10-31,1970-09-17,Professional,6000.0,1977-07-22,Professional,6000.0,2002-03-22,4.0,
2206,2011-10-31,1976-03-11,Professional,8000.0,1978-12-28,Professional,7000.0,2009-02-10,2.0,
2207,2012-10-31,1977-10-29,Professional,12500.0,1978-12-01,Professional,6000.0,2008-10-25,3.0,


### NaN in this case is assumed to represent no kids (the column contains no explicit 0 values)

# Handle Missing Data

In [198]:
# Row count before dropping rows with missing education.
divorces.shape[0]

2209

In [199]:
# Drop the rows that have no recorded education level for the husband.
divorces = divorces.dropna(subset=['education_man'])

In [200]:
# Row count after dropping rows with missing education.
divorces.shape[0]

2205

In [201]:
# Re-check the education levels: the NaN bucket should no longer appear.
divorces['education_man'].value_counts(dropna=False)

education_man
Professional    1313
Preparatory      501
Secondary        288
Primary          100
Other              3
Name: count, dtype: int64

In [202]:
# Treat a missing kid count as "no kids" and fill it with 0.
# `assign` builds a new frame instead of writing through `.loc` into the frame
# returned by the earlier `dropna`, so the result does not depend on copy/view
# semantics or on aligning a Series against a list-of-columns target.
divorces = divorces.assign(num_kids=divorces['num_kids'].fillna(value=0))

In [203]:
# Re-check the kid counts: former NaN entries should now be counted under 0.
divorces['num_kids'].value_counts(dropna=False)

num_kids
0.0    875
1.0    620
2.0    549
3.0    141
4.0     17
5.0      3
Name: count, dtype: int64