Comparative Analysis for prediction of STI

Importing data

In [None]:
import pandas as pd
import numpy as np
# Location of the raw STI workbook (an absolute path inside the Colab session).
DATA_PATH = '/content/STIData (1).xls'

# Read the workbook into a DataFrame and preview the first rows.
data = pd.read_excel(DATA_PATH)
data.head()

Unnamed: 0,IdNumber,CaseStatus,Date,A1Age,A2Occupation,A3Church,A4LevelOfEducation,A5MaritalStatus,Weight,Height,...,Unemployed,Education,AlcoholUse,SexPartner1year,SexPartner3month,LastPartnerSpouse,Belong,ReceiveHelp,SexPartnerLife3,Sex.1
0,32,2,2009-12-03,23,1 unemployed,5 pentecostal,3 secondary,2 married,61.0,182.0,...,1.0,2,2.0,0,0,1,1,1,0,Female
1,33,1,2009-12-03,24,4 student,2 apostolic,3 secondary,2 married,53.0,166.0,...,,2,1.0,0,0,1,0,0,0,Female
2,34,2,2009-12-03,24,1 unemployed,2 apostolic,3 secondary,2 married,91.4,166.0,...,1.0,2,1.0,0,0,1,1,1,0,Female
3,35,1,2009-12-03,33,3 formal,7 roman catholic,3 secondary,2 married,100.0,166.0,...,2.0,2,2.0,0,0,1,1,1,0,Female
4,10,2,2009-12-03,63,4 student,8 other,2 primary,2 married,83.0,156.0,...,,1,2.0,0,0,1,1,1,0,Male


Checking the missing values

In [None]:
# Number of missing cells in each column of the raw data.
data.isna().sum()

Unnamed: 0,0
IdNumber,0
CaseStatus,0
Date,0
A1Age,0
A2Occupation,0
A3Church,0
A4LevelOfEducation,0
A5MaritalStatus,0
Weight,0
Height,1


Imputing data

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

Selecting numerical columns for imputation

In [None]:
# Keep only the numeric-dtype columns; these are the ones passed to the imputer.
data_num = data.select_dtypes(include=[np.number])
# Their names, used later to drop the numeric columns from the raw frame.
numerical_columns = data_num.columns.tolist()

Applying IterativeImputer to the numerical dataframe

In [None]:
# Model-based imputation: each column with missing values is regressed on the
# other columns; 'roman' visits the columns left to right.
# NOTE(review): data_num still contains IdNumber and CaseStatus, so the row id
# and the outcome take part in the imputation model — confirm this is intended.
imputer = IterativeImputer(
    imputation_order='roman',
    max_iter=10,
    random_state=0,
)
data_imputed = imputer.fit_transform(data_num)

 Creating new dataframe with imputed values and original column names

In [None]:
# Wrap the imputed array back into a DataFrame. The original column names AND
# the original row index are restored so that the later column-wise concat with
# the non-numeric columns aligns row for row even if `data` does not have a
# default 0..n-1 index.
data1 = pd.DataFrame(data_imputed, columns=data_num.columns, index=data_num.index)

Merging imputed dataframe with the original non-numerical columns

In [None]:
# Non-numeric columns come from the raw frame, numeric ones from the imputed
# frame; the two are placed side by side (rows are matched on the index).
non_numeric_part = data.drop(columns=numerical_columns)
data_final = pd.concat([non_numeric_part, data1], axis=1)

In [None]:
# After imputation every numeric column should report zero missing values.
data1.isna().sum()

Unnamed: 0,0
IdNumber,0
CaseStatus,0
A1Age,0
Weight,0
Height,0
C3StiYesno,0
D1BurialSociety,0
D1religiousgrp,0
D1savingsClub,0
D1tradersAssoc,0


In [None]:
# Count fully duplicated rows in the imputed frame and drop them if any exist.
print("Checking for duplicates...")
duplicate_count = data1.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

if duplicate_count == 0:
    print("No duplicates found.")
else:
    # Keeps the first occurrence of each duplicated row.
    data1 = data1.drop_duplicates()
    print(f"Dropped {duplicate_count} duplicate rows. New shape: {data1.shape}")

Checking for duplicates...
Number of duplicate rows: 0
No duplicates found.


In [None]:
# List the distinct values of every column so unexpected codes can be spotted
# by eye.
print("Checking for inconsistencies...")
for column, column_values in data1.items():
    unique_values = column_values.unique()
    print(f"Column: {column}\nUnique values: {unique_values}")

Checking for inconsistencies...
Column: IdNumber
Unique values: [ 32.  33.  34.  35.  10.  11.  12.  13.  14.  15.  16.  17.  18.  19.
   6.   7.   8.   9.  20.  21.  22.  23.  24.  25.  26.  27.  28.  29.
  30.  31.  36.  37.  38.  39.  40.  41.  42.  43.  44.  45.  46.  47.
  48.  49.  50.  51.  52.  53.  54.  55.  85.  86.  87.  88.  89.  90.
  91.  92.  93.  94.  95.  96.  97.  98.  99. 100. 101. 102. 103. 104.
 105. 106. 107. 108. 109. 110. 111. 112. 113. 114. 115. 116. 117. 118.
 119. 120. 121. 122. 123. 124. 125. 126. 127. 128. 129. 130. 131. 132.
 133. 134. 135. 136. 137. 138. 139. 140. 141. 142. 143. 144. 145. 146.
 147. 148. 149. 150. 151. 152. 153. 154. 155. 156. 157. 158. 159. 160.
 161. 162.   1.   2.   3.   4.   5.  56.  57.  58.  59.  60.  61.  62.
  63.  64.  65.  66.  67.  68.  69.  70.  71.  72.  73.  74.  75.  76.
  77.  78.  79.  80.  81.  82.  83.  84. 163. 164. 165. 166. 167. 168.
 169. 170. 171. 172. 173. 174. 175. 176. 177. 178. 179. 180. 181. 182.
 183. 184. 18

In [None]:
# Recode the stray CaseStatus value 3 as 2.
# NOTE(review): the reason code 3 is folded into code 2 is not visible in this
# notebook — confirm against the data dictionary.
data1['CaseStatus'] = data1['CaseStatus'].replace({3: 2})
case_status = data1['CaseStatus']

print("\nCorrected CaseStatus = 3 to CaseStatus = 2")
print(f"Unique values in CaseStatus after correction: {case_status.unique()}")
print("\nValue counts in CaseStatus after correction:")
print(case_status.value_counts(dropna=False))


Corrected CaseStatus = 3 to CaseStatus = 2
Unique values in CaseStatus after correction: [2. 1.]

Value counts in CaseStatus after correction:
CaseStatus
2.0    115
1.0    112
Name: count, dtype: int64


In [None]:
# Recode CaseStatus into a 0/1 target: code 1 stays 1, code 2 becomes 0.
# NOTE(review): which original code denotes a case and which a control is not
# visible in this notebook — confirm that 1 is the positive class.
case_status_values = set(data1['CaseStatus'].dropna().unique())

if case_status_values.issubset({1, 2}):
    print(f"\nUnique values in CaseStatus before binary conversion: {data1['CaseStatus'].unique()}")
    data1['CaseStatus'] = data1['CaseStatus'].map({1: 1, 2: 0})
    # The message states the actual mapping (the old text implied 1 -> 0, 2 -> 1).
    print("Converted CaseStatus: 1 -> 1, 2 -> 0")
    print(f"Unique values in CaseStatus after binary conversion: {data1['CaseStatus'].unique()}")
    print("\nValue counts in CaseStatus after binary conversion:")
    print(data1['CaseStatus'].value_counts(dropna=False))
elif case_status_values.issubset({0, 1}):
    # Re-running the cell after a successful conversion lands here instead of
    # being reported as an error.
    print(f"\nCaseStatus is already binary: {data1['CaseStatus'].unique()}")
else:
    print(f"\nCaseStatus still contains unexpected values: {data1['CaseStatus'].unique()}")


Unique values in CaseStatus before binary conversion: [2. 1.]
Converted CaseStatus from [1, 2] to [0, 1]
Unique values in CaseStatus after binary conversion: [0 1]

Value counts in CaseStatus after binary conversion:
CaseStatus
0    115
1    112
Name: count, dtype: int64


In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   IdNumber             227 non-null    float64
 1   CaseStatus           227 non-null    int64  
 2   A1Age                227 non-null    float64
 3   Weight               227 non-null    float64
 4   Height               227 non-null    float64
 5   C3StiYesno           227 non-null    float64
 6   D1BurialSociety      227 non-null    float64
 7   D1religiousgrp       227 non-null    float64
 8   D1savingsClub        227 non-null    float64
 9   D1tradersAssoc       227 non-null    float64
 10  D3Education          227 non-null    float64
 11  D3FuneralAssistance  227 non-null    float64
 12  D3HealthServices     227 non-null    float64
 13  DurationOfillness    227 non-null    float64
 14  N14DoYouHave         227 non-null    float64
 15  N15LivingTogether    227 non-null    flo

Stepwise logistic regression for variable selection

In [None]:
import statsmodels.api as sm

# Candidate independent variables offered to the stepwise selection.
IVs = [
    "A1Age", "C3StiYesno", "D1BurialSociety", "D1religiousgrp",
    "D1savingsClub", "D1tradersAssoc", "D3Education", "D3FuneralAssistance",
    "D3HealthServices", "DurationOfillness", "N14DoYouHave", "N15LivingTogether",
    "D3receivecredit", "N3HadAnSti", "HabitationStatus", "Unemployed",
    "Education", "AlcoholUse", "SexPartner1year", "SexPartner3month",
    "LastPartnerSpouse", "Belong", "ReceiveHelp", "SexPartnerLife3",
]

# Dependent variable (binary CaseStatus) and the design matrix; add_constant
# prepends a 'const' column of ones that serves as the intercept term.
y = data1['CaseStatus']
x = sm.add_constant(data1[IVs])
### Stepwise Regression
def stepwise_logistic_regression(X, y, threshold_in=0.05, threshold_out=0.10, verbose=True):
    """Select predictors for a logistic regression by bidirectional stepwise search.

    Every pass has two steps:

    * forward  - each remaining candidate is added in turn to the current
      model; the candidate with the smallest p-value is kept if that p-value
      is below ``threshold_in``;
    * backward - the current model is refitted and the selected predictor with
      the largest p-value is dropped if that p-value exceeds ``threshold_out``.

    The search stops when a pass changes nothing, or when it returns to a set
    of predictors it has already visited (which would otherwise loop forever).

    If ``X`` has a column named ``'const'`` (as produced by
    ``sm.add_constant``) it is treated as the intercept: it is included in
    every fitted model, it is never a candidate for addition or removal, and
    it is not part of the returned list.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, optionally with a ``'const'`` column.
    y : array-like
        Binary outcome passed to ``sm.Logit``.
    threshold_in : float, default 0.05
        A candidate is added only if its p-value is below this value.
    threshold_out : float, default 0.10
        A selected predictor is removed if its p-value is above this value.
    verbose : bool, default True
        Print every addition and removal.

    Returns
    -------
    list
        Names of the selected predictors, in the order they were added.
    """
    intercept = ['const'] if 'const' in X.columns else []
    selected_features = []
    remaining_features = [name for name in X.columns if name not in intercept]
    # Feature sets already visited; seeing one again means the search is cycling.
    seen_states = {frozenset()}

    changed = True
    while changed:
        changed = False

        # Forward step: p-value of each candidate when added to the current model.
        candidate_pvalues = {}
        for feature in remaining_features:
            columns = intercept + selected_features + [feature]
            try:
                model = sm.Logit(y, X[columns]).fit(disp=False)
            except np.linalg.LinAlgError:
                # Singular fit (e.g. a candidate collinear with the intercept):
                # this candidate cannot be evaluated, so skip it.
                continue
            candidate_pvalues[feature] = model.pvalues[feature]
        new_pvalues = pd.Series(candidate_pvalues, dtype=float).dropna()

        if not new_pvalues.empty:
            best_feature = new_pvalues.idxmin()
            if new_pvalues[best_feature] < threshold_in:
                selected_features.append(best_feature)
                remaining_features.remove(best_feature)
                changed = True
                if verbose:
                    print(f"Added {best_feature} (p-value: {new_pvalues[best_feature]:.4f})")

        # Backward step: look only at the selected predictors, chosen by name,
        # so the intercept is excluded and every predictor stays removable.
        if len(selected_features) > 1:
            model = sm.Logit(y, X[intercept + selected_features]).fit(disp=False)
            pvalues = model.pvalues[selected_features].dropna()
            if not pvalues.empty:
                worst_feature = pvalues.idxmax()
                if pvalues[worst_feature] > threshold_out:
                    selected_features.remove(worst_feature)
                    remaining_features.append(worst_feature)
                    changed = True
                    if verbose:
                        print(f"Removed {worst_feature} (p-value: {pvalues[worst_feature]:.4f})")

        if changed:
            state = frozenset(selected_features)
            if state in seen_states:
                if verbose:
                    print("Stopping: the selection returned to a feature set it had already visited.")
                break
            seen_states.add(state)

    return selected_features


# NOTE(review): selection runs on all rows, before the train/test split made
# later in the notebook, so the held-out rows also influence which features
# are chosen.
selected_features = stepwise_logistic_regression(x, y)
print("\nFinal Selected Features:", selected_features)

# Fit the reported model WITH an intercept. sm.Logit adds none by itself, and
# the 'const' column of x is only used if it is listed explicitly; without it
# the coefficients and the pseudo R-squared describe a model forced through
# the origin.
if 'const' in x.columns and 'const' not in selected_features:
    final_columns = ['const'] + list(selected_features)
else:
    final_columns = list(selected_features)

final_model = sm.Logit(y, x[final_columns]).fit()
print(final_model.summary())

Added SexPartner1year (p-value: 0.0001)
Added N3HadAnSti (p-value: 0.0006)
Added N15LivingTogether (p-value: 0.0000)
Added D3FuneralAssistance (p-value: 0.0004)
Added Belong (p-value: 0.0123)
Added SexPartnerLife3 (p-value: 0.0281)

Final Selected Features: ['SexPartner1year', 'N3HadAnSti', 'N15LivingTogether', 'D3FuneralAssistance', 'Belong', 'SexPartnerLife3']
Optimization terminated successfully.
         Current function value: 0.519734
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:             CaseStatus   No. Observations:                  227
Model:                          Logit   Df Residuals:                      221
Method:                           MLE   Df Model:                            5
Date:                Sun, 03 Aug 2025   Pseudo R-squ.:                  0.2501
Time:                        17:08:25   Log-Likelihood:                -117.98
converged:                       True   LL-Null:           

Comparing Models

In [None]:
# Missing-value count for each selected feature.
x.loc[:, selected_features].isna().sum()

Unnamed: 0,0
SexPartner1year,0
N3HadAnSti,0
N15LivingTogether,0
D3FuneralAssistance,0
Belong,0
SexPartnerLife3,0


In [None]:
# Separating the data into independent (x) and dependent (y) arrays.
# The features are taken from data1 rather than from x itself: x is replaced
# by a NumPy array here, so indexing x by column name would fail if this cell
# were run a second time.
# A 'const' column, if it was selected, is left out — the scikit-learn models
# below do not need an explicit column of ones.
feature_columns = [name for name in selected_features if name != 'const']
x = data1[feature_columns].values
y = data1['CaseStatus'].values

In [None]:
import pandas as pd
# Import libraries and classes required
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Splitting data into Training and Testing

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

Standardizing the data

In [None]:
# Learn the mean and standard deviation from the training split only, then
# apply the same transformation to both splits. fit_transform already fits the
# scaler, so a separate scaler.fit(x_train) call is not needed.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the standardized training split and predict the
# held-out rows.
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Summary of the predictions made by the classifier.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# accuracy_score takes (y_true, y_pred), the same order as the two calls above.
# This one is printed as a fraction; the next cell prints it as a percentage.
print('accuracy is', accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.86      0.72        21
           1       0.82      0.56      0.67        25

    accuracy                           0.70        46
   macro avg       0.72      0.71      0.69        46
weighted avg       0.73      0.70      0.69        46

[[18  3]
 [11 14]]
accuracy is 0.6956521739130435


Accuracy Score

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the most recent predictions (y_pred from the cell above), as a
# percentage. Arguments are passed in the documented (y_true, y_pred) order.
print('accuracy is', (accuracy_score(y_test, y_pred)) * 100)

accuracy is 69.56521739130434


Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier trained on the standardized features.
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)

# Summary of the predictions made by the classifier.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# accuracy_score takes (y_true, y_pred), the same order as the two calls above.
print('accuracy is', (accuracy_score(y_test, y_pred)) * 100)

              precision    recall  f1-score   support

           0       0.62      0.86      0.72        21
           1       0.82      0.56      0.67        25

    accuracy                           0.70        46
   macro avg       0.72      0.71      0.69        46
weighted avg       0.73      0.70      0.69        46

[[18  3]
 [11 14]]
accuracy is 69.56521739130434


Bernoulli Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB

# NOTE(review): BernoulliNB binarizes its inputs at 0.0 by default. The
# features were standardized earlier, so each one is effectively split at its
# training mean — confirm this is the intended treatment.
classifier = BernoulliNB()
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)

# Summary of the predictions made by the classifier.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# accuracy_score takes (y_true, y_pred), the same order as the two calls above.
print('accuracy is', (accuracy_score(y_test, y_pred)) * 100)

              precision    recall  f1-score   support

           0       0.64      0.86      0.73        21
           1       0.83      0.60      0.70        25

    accuracy                           0.72        46
   macro avg       0.74      0.73      0.72        46
weighted avg       0.75      0.72      0.71        46

[[18  3]
 [10 15]]
accuracy is 71.73913043478261


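Bernoulli Naive Bayes has the highest test accuracy (71.74%), while logistic regression, the linear SVM and K Nearest Neighbor (reported below) each reach 69.57%. On the 46-row test set this gap is a single observation (33 versus 32 correct predictions), so the ranking should be read with caution.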

K Nearest Neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier; Minkowski distance with p = 2 is the
# Euclidean distance.
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)

# Summary of the predictions made by the classifier.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# accuracy_score takes (y_true, y_pred), the same order as the two calls above.
print('accuracy is', (accuracy_score(y_test, y_pred)) * 100)

              precision    recall  f1-score   support

           0       0.67      0.67      0.67        21
           1       0.72      0.72      0.72        25

    accuracy                           0.70        46
   macro avg       0.69      0.69      0.69        46
weighted avg       0.70      0.70      0.70        46

[[14  7]
 [ 7 18]]
accuracy is 69.56521739130434
