# Applying Datascience to Matrimoney

## Making Basic Imports

In [1]:
import numpy as np
import pandas as pd
import datetime
import os, glob
import math

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.rcParams["figure.figsize"] = (20,10)

pd.set_option('display.max_columns', None)

## Reading the data from both csv and excel files but we'll only use the csv data.

In [2]:
df_csv = pd.read_csv("Marriage_Divorce_DB.csv")
df_xl = pd.read_excel("Marriage_Divorce_DB.xlsx")

## Checking for null values to see if the data is clean of not

In [3]:
df_csv.isnull().sum()

Age Gap                                         0
Education                                       0
Economic Similarity                             0
Social Similarities                             0
Cultural Similarities                           0
Social Gap                                      0
Common Interests                                0
Religion Compatibility                          0
No of Children from Previous Marriage           0
Desire to Marry                                 0
Independency                                    0
Relationship with the Spouse Family             0
Trading in                                      0
Engagement Time                                 0
Love                                            0
Commitment                                      0
Mental Health                                   0
The Sense of Having Children                    0
Previous Trading                                0
Previous Marriage                               0


### We see that we are not missing any data points and so we could go ahead with exploring the data

## Exploring the data

In [4]:
df_csv.shape

(100, 31)

#### This means we have a 100 observations and 30 features and 1 target variable 

In [5]:
df_csv.head()

Unnamed: 0,Age Gap,Education,Economic Similarity,Social Similarities,Cultural Similarities,Social Gap,Common Interests,Religion Compatibility,No of Children from Previous Marriage,Desire to Marry,Independency,Relationship with the Spouse Family,Trading in,Engagement Time,Love,Commitment,Mental Health,The Sense of Having Children,Previous Trading,Previous Marriage,The Proportion of Common Genes,Addiction,Loyalty,Height Ratio,Good Income,Self Confidence,Relation with Non-spouse Before Marriage,Spouse Confirmed by Family,Divorce in the Family of Grade 1,Start Socializing with the Opposite Sex Age,Divorce Probability
0,0.111633,1.915111,10.998678,76.456065,47.84746,50.317656,88.099898,83.738075,4.402822,22.868019,1.269738,73.206949,79.26236,5.424734,70.472234,76.106833,70.241804,86.138461,39.437387,2.766927,21.032882,3.134119,49.64848,30.822948,94.499164,45.964824,2.03261,1.719332,2.262242,24.356772,2.76019
1,3.355384,2.957842,82.13812,48.656031,30.188517,54.114612,57.020971,98.408133,4.367024,40.336843,1.658179,91.666091,63.763287,4.371315,60.818393,70.943766,80.029002,60.668227,26.251231,1.308314,41.257627,2.067377,75.220699,68.268221,41.102605,65.387715,1.053402,1.456192,9.795998,19.667152,1.962979
2,6.527365,2.772463,26.337826,59.356238,10.340252,76.595377,80.590985,41.743462,1.19712,45.941845,1.766594,17.926501,65.037501,6.514788,52.601832,96.092606,80.383938,28.225648,7.228832,4.203533,23.917319,3.599095,22.551866,59.134874,23.053577,84.271897,8.268308,7.095241,9.986173,15.522517,2.858803
3,5.203075,1.729242,66.956033,5.472612,1.003407,55.071435,99.718078,70.493011,3.392041,2.924863,1.065769,17.036749,32.493249,1.701053,80.600086,64.425022,63.755047,45.348339,30.312101,2.11797,46.05644,1.549274,99.172136,40.984117,43.40004,96.081229,5.852371,6.570749,5.099396,34.665933,1.404621
4,6.864962,4.37029,76.245035,26.797234,93.291581,73.736241,52.896199,11.729729,2.373553,89.851492,1.103652,81.65681,82.310898,9.012912,72.730718,73.944053,61.696215,98.688798,58.777743,1.151556,2.743993,4.031738,21.629472,89.122381,51.615509,53.330824,9.717223,7.609152,1.294295,22.545763,1.318819


In [6]:
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 31 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Age Gap                                       100 non-null    float64
 1   Education                                     100 non-null    float64
 2   Economic Similarity                           100 non-null    float64
 3   Social Similarities                           100 non-null    float64
 4   Cultural Similarities                         100 non-null    float64
 5   Social Gap                                    100 non-null    float64
 6   Common Interests                              100 non-null    float64
 7   Religion Compatibility                        100 non-null    float64
 8   No of Children from Previous Marriage         100 non-null    float64
 9   Desire to Marry                               100 non-null    floa

In [7]:
cols = df_csv.columns
print(cols)

Index(['Age Gap', 'Education', 'Economic Similarity', 'Social Similarities',
       'Cultural Similarities', 'Social Gap', 'Common Interests',
       'Religion Compatibility', 'No of Children from Previous Marriage',
       'Desire to Marry', 'Independency',
       'Relationship with the Spouse Family', 'Trading in', 'Engagement Time',
       'Love', 'Commitment', 'Mental Health', 'The Sense of Having Children',
       'Previous Trading', 'Previous Marriage',
       'The Proportion of Common Genes', 'Addiction', 'Loyalty',
       'Height Ratio', 'Good Income', 'Self Confidence',
       'Relation with Non-spouse Before Marriage',
       'Spouse Confirmed by Family', 'Divorce in the Family of Grade 1',
       'Start Socializing with the Opposite Sex Age ', 'Divorce Probability'],
      dtype='object')


In [8]:
# Replacing the blanks with underscore so as to better suit the programming parlance 
df_csv.columns = df_csv.columns.str.replace(' |-', '_', regex=True)

In [9]:
df_csv.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age_Gap,100.0,5.037214,2.566534,0.111633,3.153033,4.961566,6.904617,9.733494
Education,100.0,2.92262,1.105506,1.013645,2.030941,2.753493,3.91012,4.934219
Economic_Similarity,100.0,57.034788,27.259673,1.613919,36.64932,60.182815,79.167894,98.862683
Social_Similarities,100.0,55.238271,28.169047,2.521826,31.860366,58.038692,77.041343,99.892423
Cultural_Similarities,100.0,45.354729,28.455983,1.003407,20.722177,41.01617,69.217013,99.753878
Social_Gap,100.0,52.610334,29.470198,3.646218,25.954739,52.240423,83.618457,99.538187
Common_Interests,100.0,74.998784,14.075152,52.306687,62.76256,74.259219,85.019345,99.718078
Religion_Compatibility,100.0,52.165466,30.141349,1.138927,26.676899,53.163896,77.570105,99.986393
No_of_Children_from_Previous_Marriage,100.0,3.004066,1.194009,1.025752,2.100129,3.048906,4.156573,4.964397
Desire_to_Marry,100.0,47.692285,28.535831,2.924863,23.728733,44.910947,68.517372,99.948651


## Setting X as the features and y as the taget i.e. Divorce Probability

In [10]:
X , y = df_csv.iloc[:,:-1], df_csv.iloc[:,-1]

## Insights from over seeing the data: 
### We see that all the data is in float format rather than in string format, which makes it easier to deal with. 

## Now we try different feature resuction techniques to determine if we could drop some parameters that are not important to our target variable i.e. Divorce Probability

In [11]:
# We are goin to be runnning 3 different tests for feature selection. They are namely Boruta, 
# Chi square test(Pearson's coefficient) and Decision Trees.
# Lets make the required imports to run them.

 
from boruta import BorutaPy
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel 
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

## First lets scale our data using the Standard or MinMaxScaler

In [12]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [13]:
# You can choose either the standard scaler or the MinMaxScaler. I have selected MinMax, choice of which is just random. 

# scaler = StandardScaler()
scaler = MinMaxScaler()

X_scaled = scaler.fit_transform(X)
# Last column i.e. Divorce Probability is the target column so I have placed that in y while rest all are features that go in data X.



## We could run a test train split on the data here, but as we are not building a model but just selecting features, I felt it was okay not to create a split. I have commented the code below that gives us a train test split. Incase you are building a model, please make sure to create a split and then predict the test results using the test data.  

In [14]:
# # Preparing the data by creating a test train split
# from sklearn.model_selection import train_test_split


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

## Now we will run a random forest regressor to get the important features

In [15]:
model_rf = SelectFromModel(RandomForestRegressor())

model_rf.fit(X_scaled,y)

In [16]:
model_rf.transform(X_scaled)

array([[2.29932072e-01, 9.65026034e-02, 8.57419227e-01, 7.24562005e-01,
        4.12411455e-01, 5.23563377e-01, 3.70996378e-01, 9.63628948e-01,
        1.39566906e-01, 3.73548699e-01],
       [4.95896009e-01, 8.28022874e-01, 8.48330522e-01, 9.15259725e-01,
        8.38968061e-01, 2.50098807e-01, 6.95791021e-01, 4.03667305e-01,
        9.78814788e-01, 1.82341048e-01],
       [4.48612536e-01, 2.54233630e-01, 4.35094596e-02, 1.53470726e-01,
        4.73246611e-01, 6.42762299e-01, 2.68402615e-02, 2.14389887e-01,
        1.00000000e+00, 1.33537715e-02],
       [1.82523479e-01, 6.71906886e-01, 6.00787588e-01, 1.44278879e-01,
        9.40178993e-01, 1.17280591e-01, 1.00000000e+00, 4.27760166e-01,
        4.55621218e-01, 7.93879317e-01],
       [8.56161570e-01, 7.67424826e-01, 3.42199361e-01, 8.11855839e-01,
        2.66836515e-02, 7.53672587e-01, 1.51248764e-02, 5.13914542e-01,
        3.17392865e-02, 2.99709269e-01],
       [2.01879744e-01, 4.86543286e-01, 8.16320056e-01, 3.62091773e-01,
   

In [17]:
df_feat_impt = pd.DataFrame({"Feature Name" : X.columns , "Importance" : model_rf.get_support()})

In [18]:
df_feat_impt

Unnamed: 0,Feature Name,Importance
0,Age_Gap,False
1,Education,True
2,Economic_Similarity,True
3,Social_Similarities,False
4,Cultural_Similarities,False
5,Social_Gap,False
6,Common_Interests,False
7,Religion_Compatibility,False
8,No_of_Children_from_Previous_Marriage,True
9,Desire_to_Marry,False


In [19]:
rf_feats = [x for i,x in enumerate(df_feat_impt["Feature Name"]) if df_feat_impt.iloc[i,-1] == True]

In [20]:
# We then print a list of important features according to RandomForest Regressor
rf_feats

['Education',
 'Economic_Similarity',
 'No_of_Children_from_Previous_Marriage',
 'Relationship_with_the_Spouse_Family',
 'The_Proportion_of_Common_Genes',
 'Addiction',
 'Loyalty',
 'Good_Income',
 'Divorce_in_the_Family_of_Grade_1',
 'Start_Socializing_with_the_Opposite_Sex_Age_']

## Now we run boruta on top of RandomForestRegressor to get another list of features

In [21]:
# We need to convert our features and target into array or martix format. Boruta doesn't work with dataframe format.
features = np.array(X_scaled)
target = np.array(y)

In [22]:
#We will run Boruta on top of a Random Forest Classifier
rf = RandomForestRegressor()

In [23]:
boruta_feature_selector = BorutaPy(rf, random_state=200, max_iter=50, verbose=2, perc=75)
boruta_feature_selector.fit(features, target)

Iteration: 	1 / 50
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	2 / 50
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	3 / 50
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	4 / 50
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	5 / 50
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	6 / 50
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	7 / 50
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	8 / 50
Confirmed: 	3
Tentative: 	12
Rejected: 	15
Iteration: 	9 / 50
Confirmed: 	3
Tentative: 	12
Rejected: 	15
Iteration: 	10 / 50
Confirmed: 	3
Tentative: 	12
Rejected: 	15
Iteration: 	11 / 50
Confirmed: 	3
Tentative: 	12
Rejected: 	15
Iteration: 	12 / 50
Confirmed: 	4
Tentative: 	11
Rejected: 	15
Iteration: 	13 / 50
Confirmed: 	4
Tentative: 	11
Rejected: 	15
Iteration: 	14 / 50
Confirmed: 	4
Tentative: 	11
Rejected: 	15
Iteration: 	15 / 50
Confirmed: 	4
Tentative: 	11
Rejected: 	15
Iteration: 	16 / 50
Confirmed: 	4
Tentative: 	10
Rejected: 	16
Iteratio

In [24]:
df_feat_impt = pd.DataFrame({"Feature Name" : X.columns , "Importance" : boruta_feature_selector.support_})
df_feat_impt

Unnamed: 0,Feature Name,Importance
0,Age_Gap,False
1,Education,True
2,Economic_Similarity,False
3,Social_Similarities,True
4,Cultural_Similarities,False
5,Social_Gap,False
6,Common_Interests,False
7,Religion_Compatibility,False
8,No_of_Children_from_Previous_Marriage,True
9,Desire_to_Marry,False


In [25]:
boruta_feats = [x for i,x in enumerate(df_feat_impt["Feature Name"]) if df_feat_impt.iloc[i,-1] == True]

In [26]:
# Important features according to Boruta
boruta_feats

['Education',
 'Social_Similarities',
 'No_of_Children_from_Previous_Marriage',
 'The_Proportion_of_Common_Genes',
 'Addiction',
 'Loyalty',
 'Good_Income']

## RFE (Recursive Feature Elimination) to decide the features that are important

In [27]:
# We are using support vector regressor as model and top it with RFE to get the best 10 features.
svm = LinearSVR()
rfe = RFE(svm ,n_features_to_select=10)

In [28]:
rfe.fit(X_scaled,y)



In [29]:
df_feat_impt = pd.DataFrame({"Feature Name" : X.columns , "Importance" : rfe.support_})
df_feat_impt

Unnamed: 0,Feature Name,Importance
0,Age_Gap,False
1,Education,True
2,Economic_Similarity,False
3,Social_Similarities,False
4,Cultural_Similarities,True
5,Social_Gap,True
6,Common_Interests,False
7,Religion_Compatibility,False
8,No_of_Children_from_Previous_Marriage,False
9,Desire_to_Marry,True


In [30]:
rfe_feats = [x for i,x in enumerate(df_feat_impt["Feature Name"]) if df_feat_impt.iloc[i,-1] == True]

In [31]:
rfe_feats

['Education',
 'Cultural_Similarities',
 'Social_Gap',
 'Desire_to_Marry',
 'Relationship_with_the_Spouse_Family',
 'Commitment',
 'Mental_Health',
 'Addiction',
 'Relation_with_Non_spouse_Before_Marriage',
 'Start_Socializing_with_the_Opposite_Sex_Age_']

In [32]:
df = pd.DataFrame({"Boruta": pd.Series(boruta_feats), "RandomForest": pd.Series(rf_feats),"RFE": pd.Series(rfe_feats)})

In [33]:
most_imp_features = []
for i,name in enumerate(df["RandomForest"]):
    if (name in list(df["Boruta"])) and (name in list(df["RFE"])):
        most_imp_features.append(name)

In [34]:
most_imp_features

['Education', 'Addiction']

In [35]:
df

Unnamed: 0,Boruta,RandomForest,RFE
0,Education,Education,Education
1,Social_Similarities,Economic_Similarity,Cultural_Similarities
2,No_of_Children_from_Previous_Marriage,No_of_Children_from_Previous_Marriage,Social_Gap
3,The_Proportion_of_Common_Genes,Relationship_with_the_Spouse_Family,Desire_to_Marry
4,Addiction,The_Proportion_of_Common_Genes,Relationship_with_the_Spouse_Family
5,Loyalty,Addiction,Commitment
6,Good_Income,Loyalty,Mental_Health
7,,Good_Income,Addiction
8,,Divorce_in_the_Family_of_Grade_1,Relation_with_Non_spouse_Before_Marriage
9,,Start_Socializing_with_the_Opposite_Sex_Age_,Start_Socializing_with_the_Opposite_Sex_Age_
