In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\vijet\anaconda3\envs\advpythonnew\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd
import os

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv(os.path.join('..','divorce_data.csv'),sep=";")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q46,Q47,Q48,Q49,Q50,Q51,Q52,Q53,Q54,Divorce
0,2,2,4,1,0,0,0,0,0,0,...,2,1,3,3,3,2,3,2,1,1
1,4,4,4,4,4,0,0,4,4,4,...,2,2,3,4,4,4,4,2,2,1
2,2,2,2,2,1,3,2,1,1,2,...,3,2,3,1,1,1,2,2,2,1
3,3,2,3,2,3,3,3,3,3,3,...,2,2,3,3,3,3,2,2,2,1
4,2,2,1,1,1,1,0,0,0,0,...,2,1,2,3,2,2,2,1,0,1


# Select your features (columns)

In [5]:
df.columns

Index(['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11',
       'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21',
       'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'Q27', 'Q28', 'Q29', 'Q30', 'Q31',
       'Q32', 'Q33', 'Q34', 'Q35', 'Q36', 'Q37', 'Q38', 'Q39', 'Q40', 'Q41',
       'Q42', 'Q43', 'Q44', 'Q45', 'Q46', 'Q47', 'Q48', 'Q49', 'Q50', 'Q51',
       'Q52', 'Q53', 'Q54', 'Divorce'],
      dtype='object')

In [6]:
# Set features. This will also be used as your x values.
selected_features = df.drop(columns=['Divorce'])
X=selected_features
X.columns

Index(['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11',
       'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21',
       'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'Q27', 'Q28', 'Q29', 'Q30', 'Q31',
       'Q32', 'Q33', 'Q34', 'Q35', 'Q36', 'Q37', 'Q38', 'Q39', 'Q40', 'Q41',
       'Q42', 'Q43', 'Q44', 'Q45', 'Q46', 'Q47', 'Q48', 'Q49', 'Q50', 'Q51',
       'Q52', 'Q53', 'Q54'],
      dtype='object')

# Create a Train Test Split

Use `Divorce` for the y values

In [10]:
y = df['Divorce']

In [11]:
# Split the data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
X_train.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q45,Q46,Q47,Q48,Q49,Q50,Q51,Q52,Q53,Q54
146,2,1,1,0,0,1,0,0,0,0,...,3,1,1,3,0,0,0,0,0,0
137,0,0,1,0,0,0,0,1,1,0,...,3,3,3,3,0,1,3,3,3,1
97,0,0,0,0,0,0,0,0,0,0,...,4,2,1,1,0,0,0,1,0,0
65,3,3,3,2,3,1,1,3,3,2,...,4,3,4,3,4,4,3,4,3,4
36,4,3,3,3,4,1,0,3,3,3,...,4,4,3,4,4,3,3,4,4,3


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [13]:
from sklearn.preprocessing import MinMaxScaler
# Scale your data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [14]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train_scaled, y_train)
rf_score = rf.score(X_train_scaled, y_train)
base_accuracy = rf.score(X_test_scaled, y_test)


# model_1 = LogisticRegression()
# model_1.fit(X_train_scaled, y_train)

# model_1_training_score = model_1.score(X_train_scaled, y_train)
# base_accuracy = model_1.score(X_test_scaled, y_test)

print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.9767441860465116


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [15]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 
import numpy as np
param_grid = {
    'n_estimators': [200, 600],
    'max_features': ['auto']
}
grid = GridSearchCV(rf, param_grid, cv=5, verbose=3)

In [16]:
rf.get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [17]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] max_features=auto, n_estimators=200 .............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] . max_features=auto, n_estimators=200, score=0.962, total=   0.3s
[CV] max_features=auto, n_estimators=200 .............................
[CV] . max_features=auto, n_estimators=200, score=0.962, total=   0.3s
[CV] max_features=auto, n_estimators=200 .............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] . max_features=auto, n_estimators=200, score=0.960, total=   0.3s
[CV] max_features=auto, n_estimators=200 .............................
[CV] . max_features=auto, n_estimators=200, score=1.000, total=   0.3s
[CV] max_features=auto, n_estimators=200 .............................
[CV] . max_features=auto, n_estimators=200, score=1.000, total=   0.3s
[CV] max_features=auto, n_estimators=600 .............................
[CV] . max_features=auto, n_estimators=600, score=0.962, total=   0.8s
[CV] max_features=auto, n_estimators=600 .............................
[CV] . max_features=auto, n_estimators=600, score=0.962, total=   0.7s
[CV] max_features=auto, n_estimators=600 .............................
[CV] . max_features=auto, n_estimators=600, score=0.960, total=   0.8s
[CV] max_features=auto, n_estimators=600 .............................
[CV] . max_features=auto, n_estimators=600, score=1.000, total=   0.7s
[CV] max_features=auto, n_estimators=600 .............................
[CV] .

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.0s finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(n_estimators=200),
             param_grid={'max_features': ['auto'], 'n_estimators': [200, 600]},
             verbose=3)

In [18]:
print(grid.best_params_)
print(grid.best_score_)

{'max_features': 'auto', 'n_estimators': 200}
0.9766153846153847


In [19]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)
predictions

array([0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
      dtype=int64)

In [20]:
y_test

139    0
30     1
119    0
29     1
144    0
163    0
166    0
51     1
105    0
60     1
15     1
158    0
135    0
45     1
68     1
85     0
24     1
109    0
75     1
108    0
19     1
16     1
31     1
18     1
12     1
9      1
82     1
98     0
76     1
55     1
42     1
56     1
150    0
111    0
69     1
138    0
2      1
93     0
136    0
156    0
90     0
114    0
26     1
Name: Divorce, dtype: int64

In [21]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98        20
           1       1.00      0.96      0.98        23

    accuracy                           0.98        43
   macro avg       0.98      0.98      0.98        43
weighted avg       0.98      0.98      0.98        43



# Save the Model

In [22]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'decisiontree.sav'
joblib.dump(rf, filename)

['decisiontree.sav']