## Title: "Predicting Comprehensive Knowledge about HIV/AIDS using a Machine Learning Approach"

Comprehensive knowledge about HIV/AIDS is a composite outcome built from the following five survey items (dataset variable names in parentheses):
1. Can get HIV by sharing food with a person who has AIDS (v754wp)
2. A healthy-looking person can have HIV (v756)
3. Reduce risk of getting HIV: always use condoms during sex (v754cp)
4. Reduce risk of getting HIV: have one sex partner only, who has no other partners (v754dp)
5. Can get HIV from mosquito bites (v754jp)

In [1]:
# Importing necessary libraries for data manipulation and visualization
import matplotlib.pyplot as plt  
import numpy as np  
import pandas as pd  
import seaborn as sns  

import pyreadstat  
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OrdinalEncoder 


In [2]:
# Load the survey extract from the Stata (.dta) file into a pandas DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
women_dta_path = "D:\\AAU PHDS\\Semister Three\\Advanced Data Analytics\\Proposal\\DATASET\\women15_49.dta"
data = pd.read_stata(women_dta_path)

In [3]:
# Preview the loaded DataFrame (pandas truncates the display of large frames).
data

Unnamed: 0,caseid,v001,v002,v005,v012,v013,v021,v024,v025,v026,...,v847,v848,v856,v857a,v857b,v857c,v857d,v858,s1058,s1056
0,00010017 02,1,17,5087433,38,35-39,1,oromia,rural,,...,,,,no,yes,no,no,no,none,no
1,00010017 03,1,17,5087433,17,15-19,1,oromia,rural,,...,,,,no,yes,yes,no,yes,,
2,00010018 02,1,18,5087433,42,40-44,1,oromia,rural,,...,,,,no,no,no,no,no,none,no
3,00010025 02,1,25,5087433,46,45-49,1,oromia,rural,,...,,,,yes,no,yes,yes,no,none,no
4,00010025 10,1,25,5087433,17,15-19,1,oromia,rural,,...,,,,no,yes,no,no,yes,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15678,06450419 04,645,419,685395,17,15-19,645,addis adaba,urban,,...,,,,yes,yes,don't know/not sure/depends,don't know/not sure/depends,no,,
15679,06450449 02,645,449,685395,33,30-34,645,addis adaba,urban,,...,,,,yes,no,no,no,no,1.0,yes
15680,06450464 02,645,464,685395,25,25-29,645,addis adaba,urban,,...,,,,yes,no,no,no,no,none,yes
15681,06450464 04,645,464,685395,18,15-19,645,addis adaba,urban,,...,,,,yes,yes,no,no,no,,


In [4]:
# Read the Stata file with pyreadstat to get its metadata, then build a lookup
# from each column name to its descriptive variable label.
hiv, meta = pyreadstat.read_dta("D:/AAU PHDS/Semister Three/Advanced Data Analytics/Proposal/DATASET/women15_49.dta")
meta_data = {
    column: label
    for column, label in zip(meta.column_names, meta.column_labels)
}
meta_data

{'caseid': 'case identification',
 'v001': 'cluster number',
 'v002': 'household number',
 'v005': "women's individual sample weight (6 decimals)",
 'v012': "respondent's current age",
 'v013': 'age in 5-year groups',
 'v021': 'primary sampling unit',
 'v024': 'region',
 'v025': 'type of place of residence',
 'v026': 'de facto place of residence',
 'v102': 'type of place of residence',
 'v103': 'childhood place of residence',
 'v104': 'years lived in place of residence',
 'v105': 'type of place of previous residence',
 'v105a': 'region of previous residence',
 'v106': 'highest educational level',
 'v107': 'highest year of education',
 'v120': 'household has: radio',
 'v121': 'household has: television',
 'v130': 'religion',
 'v131': 'ethnicity',
 'v151': 'sex of household head',
 'v152': 'age of household head',
 'v153': 'household has: telephone (land-line)',
 'v155': 'literacy',
 'v156': 'ever participated in a literacy program outside of primary',
 'v157': 'frequency of reading news

In [5]:
# Summarize the DataFrame: number of rows and columns, dtypes, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15683 entries, 0 to 15682
Columns: 106 entries, caseid to s1056
dtypes: category(100), int16(3), int32(1), int8(1), object(1)
memory usage: 1.8+ MB


In [6]:
# Display the DataFrame again (same expression as the earlier preview cell).
data

Unnamed: 0,caseid,v001,v002,v005,v012,v013,v021,v024,v025,v026,...,v847,v848,v856,v857a,v857b,v857c,v857d,v858,s1058,s1056
0,00010017 02,1,17,5087433,38,35-39,1,oromia,rural,,...,,,,no,yes,no,no,no,none,no
1,00010017 03,1,17,5087433,17,15-19,1,oromia,rural,,...,,,,no,yes,yes,no,yes,,
2,00010018 02,1,18,5087433,42,40-44,1,oromia,rural,,...,,,,no,no,no,no,no,none,no
3,00010025 02,1,25,5087433,46,45-49,1,oromia,rural,,...,,,,yes,no,yes,yes,no,none,no
4,00010025 10,1,25,5087433,17,15-19,1,oromia,rural,,...,,,,no,yes,no,no,yes,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15678,06450419 04,645,419,685395,17,15-19,645,addis adaba,urban,,...,,,,yes,yes,don't know/not sure/depends,don't know/not sure/depends,no,,
15679,06450449 02,645,449,685395,33,30-34,645,addis adaba,urban,,...,,,,yes,no,no,no,no,1.0,yes
15680,06450464 02,645,464,685395,25,25-29,645,addis adaba,urban,,...,,,,yes,no,no,no,no,none,yes
15681,06450464 04,645,464,685395,18,15-19,645,addis adaba,urban,,...,,,,yes,yes,no,no,no,,


In [7]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,v001,v002,v005,v012,v021
count,15683.0,15683.0,15683.0,15683.0,15683.0
mean,323.643244,242.769049,1000000.0,27.937321,323.643244
std,186.350386,141.050908,1110347.0,9.159282,186.350386
min,1.0,0.0,5972.0,15.0,1.0
25%,161.0,121.0,115681.0,20.0,161.0
50%,323.0,242.0,512367.0,27.0,323.0
75%,486.0,366.0,1836606.0,35.0,486.0
max,645.0,486.0,5781192.0,49.0,645.0


In [8]:
# Tally the missing entries in every column (one total per column).
null_counts = data.isna().sum(axis=0)
null_counts

caseid       0
v001         0
v002         0
v005         0
v012         0
          ... 
v857c     1314
v857d     1314
v858         0
s1058     7125
s1056     5859
Length: 106, dtype: int64

In [9]:
# Keep only the columns that contain at least one non-missing value;
# columns that are NaN in every row are discarded.
df = data.dropna(axis="columns", how="all")

# Announce the result, then show the reduced DataFrame as the cell output.
print("\nDataFrame after dropping columns with all NaN values:")
df


DataFrame after dropping columns with all NaN values:


Unnamed: 0,caseid,v001,v002,v005,v012,v013,v021,v024,v025,v102,...,v841a,v842,v843,v857a,v857b,v857c,v857d,v858,s1058,s1056
0,00010017 02,1,17,5087433,38,35-39,1,oromia,rural,rural,...,,,,no,yes,no,no,no,none,no
1,00010017 03,1,17,5087433,17,15-19,1,oromia,rural,rural,...,,,,no,yes,yes,no,yes,,
2,00010018 02,1,18,5087433,42,40-44,1,oromia,rural,rural,...,,,,no,no,no,no,no,none,no
3,00010025 02,1,25,5087433,46,45-49,1,oromia,rural,rural,...,,,,yes,no,yes,yes,no,none,no
4,00010025 10,1,25,5087433,17,15-19,1,oromia,rural,rural,...,,,,no,yes,no,no,yes,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15678,06450419 04,645,419,685395,17,15-19,645,addis adaba,urban,urban,...,,,,yes,yes,don't know/not sure/depends,don't know/not sure/depends,no,,
15679,06450449 02,645,449,685395,33,30-34,645,addis adaba,urban,urban,...,,government health center,no,yes,no,no,no,no,1.0,yes
15680,06450464 02,645,464,685395,25,25-29,645,addis adaba,urban,urban,...,,government health center,no,yes,no,no,no,no,none,yes
15681,06450464 04,645,464,685395,18,15-19,645,addis adaba,urban,urban,...,,,,yes,yes,no,no,no,,


In [10]:
# Share of missing values per column (0.0 = complete, 1.0 = entirely missing).
missing_fraction = df.isnull().mean()

# Flag columns whose missing share exceeds 90% and collect their labels.
too_sparse = missing_fraction > 0.9
columns_to_drop = df.columns[too_sparse]

# Drop the sparse columns, then drop any column that is entirely NaN.
data_cleaned = df.drop(columns=columns_to_drop).dropna(axis=1, how='all')

data_cleaned

Unnamed: 0,caseid,v001,v002,v005,v012,v013,v021,v024,v025,v102,...,v841,v842,v843,v857a,v857b,v857c,v857d,v858,s1058,s1056
0,00010017 02,1,17,5087433,38,35-39,1,oromia,rural,rural,...,,,,no,yes,no,no,no,none,no
1,00010017 03,1,17,5087433,17,15-19,1,oromia,rural,rural,...,,,,no,yes,yes,no,yes,,
2,00010018 02,1,18,5087433,42,40-44,1,oromia,rural,rural,...,,,,no,no,no,no,no,none,no
3,00010025 02,1,25,5087433,46,45-49,1,oromia,rural,rural,...,,,,yes,no,yes,yes,no,none,no
4,00010025 10,1,25,5087433,17,15-19,1,oromia,rural,rural,...,,,,no,yes,no,no,yes,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15678,06450419 04,645,419,685395,17,15-19,645,addis adaba,urban,urban,...,,,,yes,yes,don't know/not sure/depends,don't know/not sure/depends,no,,
15679,06450449 02,645,449,685395,33,30-34,645,addis adaba,urban,urban,...,yes,government health center,no,yes,no,no,no,no,1.0,yes
15680,06450464 02,645,464,685395,25,25-29,645,addis adaba,urban,urban,...,yes,government health center,no,yes,no,no,no,no,none,yes
15681,06450464 04,645,464,685395,18,15-19,645,addis adaba,urban,urban,...,,,,yes,yes,no,no,no,,


In [11]:
# Optional step (currently disabled): rename columns to their descriptive
# variable labels from meta_data, keeping the original name when no label exists.
#df.columns = [meta_data.get(col, col) for col in df.columns]

#df

In [12]:
# Disabled inspection of df's column index (kept for reference).
#df.columns