In [108]:
from sklearn.model_selection import train_test_split
import numpy as np
import logistic_regression
import pandas as pd

# Loading data
data = pd.read_csv("Data PCOS/data without infertility _final.csv")

# Create a data frame 
data = pd.DataFrame(data)
del data[data.columns[-1]] # Removing last column (error)

# Delete the paciente identification columns
header_raw = list(data)
header_del_elements_1 = ['Sl. No', 'Patient File No.']
data_model_1 = data.copy()
data_model_1.drop(columns=header_del_elements_1, inplace=True)

# Convert to numpy array
data_model_1 = data_model_1.to_numpy()
y_1 = data_model_1[:, 0]
X_1 = data_model_1[:, 1:]

# Remove nan values from the data
y_1 = y_1[np.squeeze(~np.isnan(X_1).any(axis=1))]
X_1 = X_1[~np.isnan(X_1).any(axis=1)]

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_1, y_1, test_size=0.30, random_state=0)

**Z-score normalization**

In [109]:
train_mean_1 = np.mean(X_train_1, axis=0)
train_std_1 = np.std(X_train_1, axis=0)
X_train_norm_1 = (X_train_1 - train_mean_1) / train_std_1

Lets apply the gradient descent in the data using all the parameter in the linear form

In [110]:
w_1, b_1, J_history_1, w_history_1 = logistic_regression.gradient_descent_reg(X_train_norm_1, y_train_1, np.zeros((39,)), 0, 0.2, 450, 1e-5 )

Iteration    0: Cost     0.60   
Iteration   45: Cost     0.22   
Iteration   90: Cost     0.18   
Iteration  135: Cost     0.17   
Iteration  180: Cost     0.16   
Iteration  225: Cost     0.16   
Iteration  270: Cost     0.15   
Iteration  315: Cost     0.15   
Iteration  360: Cost     0.15   
Iteration  405: Cost     0.15   
Iteration  449: Cost     0.15   


**Accuracy**

Considering accuracy the percentage of correct predictions

In [111]:
# Normalize the test data
X_test_norm_1 = (X_test_1 - train_mean_1) / train_std_1

# Compute the accurace
f_wb_1, g_1 = logistic_regression.predict(X_test_norm_1, w_1, b_1)
accuracy_1 = np.mean(np.where(f_wb_1 == y_test_1, 1, 0))
print('Accuracy before removing features:',round(accuracy_1 ,3) * 100,'%')

Accuracy before removing features: 84.0 %


**Positive predictive value(PPV)** is the probability that a person who test positive actually has the disease.
The PPV is a good measure of confiability of a test, lets calculate it for the model.

In [112]:
y_test_positive_1 = y_test_1[f_wb_1 == 1]
ppv_1 = np.mean(y_test_positive_1)
print('Positive predictive value:',round(ppv_1,3) * 100, '%')

Positive predictive value: 87.2 %


**Negative predictive value (NPV)** is the probability that a person who test negative actually not having the disease

In [113]:
y_test_negative_1 = y_test_1[f_wb_1 == 0]
npv_1 = np.mean(1 - y_test_negative_1)
print('Negative predictive value:',round(npv_1, 3) * 100, '%')

Negative predictive value: 82.6 %


**Linear model with all the parameter conclusion**

As shown above, the model using all the parameters and just linear components have a low accurace of just 0.44, lets study the parameter to increase the prediction accurace.

### Feature analysis for best fit
Print features used in previous prediction:

In [114]:
header = [element for element in header_raw if element not in header_del_elements_1]
print(header)

['PCOS (Y/N)', ' Age (yrs)', 'Weight (Kg)', 'Height(Cm) ', 'BMI', 'Blood Group', 'Pulse rate(bpm) ', 'RR (breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)', 'Cycle length(days)', 'Marraige Status (Yrs)', 'Pregnant(Y/N)', 'No. of aborptions', 'FSH(mIU/mL)', 'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)', 'Waist:Hip Ratio', 'TSH (mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)', 'Vit D3 (ng/mL)', 'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight gain(Y/N)', 'hair growth(Y/N)', 'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)', 'Fast food (Y/N)', 'Reg.Exercise(Y/N)', 'BP _Systolic (mmHg)', 'BP _Diastolic (mmHg)', 'Follicle No. (L)', 'Follicle No. (R)', 'Avg. F size (L) (mm)', 'Avg. F size (R) (mm)', 'Endometrium (mm)']


We can see that some of the data have clear no correlation with the study, so lets remove the following:
* Age (1)
* Pulse rate (7)
* RR
* Marraige Status

Data remove because of correlation:
* BMI (keep)
  * Weight
  * Height
* Hip Ratio (keep)
  * Hip
  * Waist

**Removing the data**

In [115]:
# Creating parameter for a new model
header_del_elements_2 = ['Sl. No', 'Patient File No.',' Age (yrs)', 'Pulse rate(bpm) ',
'RR (breaths/min)', 'Marraige Status (Yrs)','Weight (Kg)', 'Height(Cm) ', 'Hip(inch)', 
'Waist(inch)', 'Fast food (Y/N)', 'Reg.Exercise(Y/N)','Blood Group']

data_model_2 = data.copy()
data_model_2.drop(columns=header_del_elements_2, inplace=True)

# Convert to numpy array
data_model_2 = data_model_2.to_numpy()

y_2 = data_model_2[:, 0]
X_2 = data_model_2[:, 1:]

# Remove nan values from the data
y_2 = y_2[np.squeeze(~np.isnan(X_2).any(axis=1))]
X_2 = X_2[~np.isnan(X_2).any(axis=1)]

# Split the data in train and test
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_2, y_2, test_size=0.30, random_state=0)


# Normalize 
train_mean_2 = np.mean(X_train_2)
train_std_2 = np.std(X_train_2)

X_train_norm_2 = (X_train_2 - train_mean_2)/train_std_2

# Apply gradient descent
w_2, b_2, J_history_2, w_history_2  = logistic_regression.gradient_descent_reg(X_train_norm_2, y_train_2, 
np.zeros((X_train_norm_2.shape[1],)),0, 3, 300000, 0.7e-6)

Iteration    0: Cost     0.71   
Iteration 30000: Cost     0.26   
Iteration 60000: Cost     0.23   
Iteration 90000: Cost     0.22   
Iteration 120000: Cost     0.22   
Iteration 150000: Cost     0.22   
Iteration 180000: Cost     0.22   
Iteration 210000: Cost     0.22   
Iteration 240000: Cost     0.22   
Iteration 270000: Cost     0.22   
Iteration 299999: Cost     0.22   


**Accuracy**

In [116]:
# Normalize the test data
X_test_norm_2 = (X_test_2 - train_mean_2) / train_std_2

# Compute the accurace
f_wb_2, g_2 = logistic_regression.predict(X_test_norm_2, w_2, b_2)
accuracy_2 = np.mean(np.where(f_wb_2 == y_test_2, 1, 0))
print('Accuracy after removing features:',round(accuracy_2, 3) * 100,'%')

Accuracy after removing features: 88.9 %


**Positive predictive value(PPV)** 

In [117]:
y_test_positive_2 = y_test_2[f_wb_2 == 1]
ppv_2 = np.mean(y_test_positive_2)
print('Positive predictive value:',round(ppv_2,3) * 100, '%')

Positive predictive value: 95.5 %


**Negative predictive value**

In [118]:
y_test_negative_2 = y_test_2[f_wb_2 == 0]
npv_2 = np.mean(1 - y_test_negative_2)
print('Negative predictive value:',round(npv_2, 3) * 100, '%')

Negative predictive value: 86.4 %


### Feature mapping

With help of the scikit learn library $mapping$ lets create polynomial features to fit data with nonlinear pattern.

Obs: The feature selection from the second model will be used once it produce better predictions with less data

In [134]:
# This code can take while to run 

# Split the data in train and test
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(
    X_2, y_2, test_size=0.30, random_state=0)

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
X_train_3 = poly.fit_transform(X_train_3)
X_test_3 = poly.fit_transform(X_test_3)

# Normalize
train_mean_3 = np.mean(X_train_3)
train_std_3 = np.std(X_train_3)

X_train_norm_3 = (X_train_3 - train_mean_3) / train_std_3 

# Apply gradient descent
w_3, b_3, J_history_3, w_history_3  = logistic_regression.gradient_descent_reg(X_train_norm_3, y_train_3, 
np.zeros((X_train_norm_3.shape[1],)),0, 5, 70000, 1e-6)

Iteration    0: Cost     0.62   
Iteration 7000: Cost     0.46   
Iteration 14000: Cost     0.40   
Iteration 21000: Cost     0.37   
Iteration 28000: Cost     0.35   
Iteration 35000: Cost     0.34   
Iteration 42000: Cost     0.34   
Iteration 49000: Cost     0.33   
Iteration 56000: Cost     0.33   
Iteration 63000: Cost     0.32   
Iteration 69999: Cost     0.32   


In [135]:
# Normalize the test data
X_test_norm_3 = (X_test_3 - train_mean_3) / train_std_3

# Compute the accurace
f_wb_3, g_3 = logistic_regression.predict(X_test_norm_3, w_3, b_3)
accuracy_3 = np.mean(np.where(f_wb_3 == y_test_3, 1, 0))
print('Accuracy after feature maping:',round(accuracy_3, 3) * 100,'%')

Accuracy after feature maping: 84.0 %


**Positive predictive value (PPV)** 

In [136]:
y_test_positive_3 = y_test_3[f_wb_3 == 1.0]
ppv_3 = np.mean(y_test_positive_3)
print('Positive predictive value:',round(ppv_3,3) * 100, '%')

Positive predictive value: 97.1 %


**Negative predictive value (NVP)**

In [137]:
y_test_negative_3 = y_test_3[f_wb_3 == 0]
npv_3 = np.mean(1 - y_test_negative_3)
print('Negative predictive value:',round(npv_3, 3) * 100, '%')

Negative predictive value: 80.5 %


# Conclusion

First model
* Accuracy = 84.0%
* PVP = 87.2%
* NVP = 82.6%

Second model
* Accuracy = 88.9%
* PVP = 95.5%
* NVP = 86.4%

Third model
* Accuracy = 84.0%
* PVP = 97.1%
* NVP = 80.5%