# Data Cleaning Process

## Step 1: Importing Libraries

In [1]:

# Importing necessary libraries
import pandas as pd
    

## Step 2: Loading the Dataset

In [2]:

# Load the raw dataset from the CSV file in the working directory.
data = pd.read_csv("Data_set.csv")
print("Initial Dataset Loaded:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()
    

Initial Dataset Loaded:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Age                          1310 non-null   float64
 1   Gender                       1279 non-null   object 
 2   Ethnicity                    1306 non-null   object 
 3   Region                       1297 non-null   object 
 4   Socioeconomic Status         1295 non-null   object 
 5   Symptom Description          1295 non-null   object 
 6   Symptom Severity             1289 non-null   float64
 7   Duration of Symptoms (days)  1298 non-null   float64
 8   Additional Symptoms          1293 non-null   object 
 9   Chronic Condition            952 non-null    object 
 10  Allergies                    1023 non-null   object 
 11  Previous Visits              1301 non-null   float64
 12  Potential Condition          1308 non-null   object 

## Step 3: Handling Missing Values

In [3]:

# Impute missing values in the numerical columns with each column's mean.
numerical_columns = ['Age', 'Symptom Severity', 'Duration of Symptoms (days)']
for column in numerical_columns:
    data[column] = data[column].fillna(data[column].mean())

# Impute missing values in the categorical columns with each column's most
# frequent value (the first one returned by mode() when there are ties).
categorical_columns = ['Gender', 'Region', 'Ethnicity', 'Socioeconomic Status', 
                       'Symptom Description', 'Additional Symptoms', 'Chronic Condition', 
                       'Allergies', 'Potential Condition', 'Suggested Action', 'Insurance Status']
for column in categorical_columns:
    modes = data[column].mode()
    # mode() returns an empty Series when every value in the column is
    # missing; indexing it would raise, so such a column is left unchanged.
    if not modes.empty:
        data[column] = data[column].fillna(modes.iloc[0])
    

## Step 4: Standardising Text Columns

In [4]:

# Lowercase every free-text column, one column at a time, so the same value
# is always spelled with the same casing.
text_columns = ['Symptom Description', 'Additional Symptoms', 'Potential Condition', 'Suggested Action']
for text_column in text_columns:
    data[text_column] = data[text_column].str.lower()
    

## Step 5: Removing Unnecessary Columns

In [5]:

# Drop the 'Previous Visits' column when it is present.
if 'Previous Visits' in data.columns:
    data = data.drop(columns=['Previous Visits'])
    # Report the removal only when it actually happened; when the column is
    # absent nothing is dropped and nothing is printed.
    print("Removed 'Previous Visits' column.")
    

Removed 'Previous Visits' column.


## Step 6: Removing Duplicate Entries

In [6]:

# Flag every row that repeats an earlier, fully identical row, then keep only
# the rows that are not flagged (the first occurrence of each row survives).
is_repeat = data.duplicated()
data = data[~is_repeat]
    

## Step 7: Verifying Data Integrity

In [7]:

# Show the column summary (non-null counts and dtypes) of the cleaned dataset.
print("Dataset After Cleaning:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()
    

Dataset After Cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Age                          1500 non-null   float64
 1   Gender                       1500 non-null   object 
 2   Ethnicity                    1500 non-null   object 
 3   Region                       1500 non-null   object 
 4   Socioeconomic Status         1500 non-null   object 
 5   Symptom Description          1500 non-null   object 
 6   Symptom Severity             1500 non-null   float64
 7   Duration of Symptoms (days)  1500 non-null   float64
 8   Additional Symptoms          1500 non-null   object 
 9   Chronic Condition            1500 non-null   object 
 10  Allergies                    1500 non-null   object 
 11  Potential Condition          1500 non-null   object 
 12  Suggested Action             1500 non-null   object 

## Step 8: Exporting the Cleaned Dataset

In [8]:

# Export the cleaned dataset to CSV without writing the index as a column.
output_path = "Cleaned_Data_set_Final.csv"
data.to_csv(output_path, index=False)
# The message is built from the same path that was written, so the reported
# file name cannot drift from the file actually created.
print(f"Cleaned dataset saved as '{output_path}'.")
    

Cleaned dataset saved as 'Cleaned_Data_set_Final.csv'.
