In [None]:
import numpy as npy
import pandas as pd
import seaborn as sborn
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load the demographics table from the CSV file sitting next to the notebook.
data = pd.read_csv('demographics.csv')

# Preview the first rows. Leaving the frame as the last expression of the cell
# lets the notebook render it as a rich table instead of a plain-text dump.
data.head()


In [None]:
# Identify blood-type values that are not in the reference list below and
# drop the rows that carry them.

blood_types_list = ['A+', 'A-', 'B+', 'B-', 'AB+', 'AB-', 'O+', 'O-']
blood_type = pd.DataFrame({'blood_type': blood_types_list})

# Set arithmetic: anything observed in the data that is absent from the
# reference list is treated as bogus.
valid_blood_types_set = set(blood_type['blood_type'])
unique_blood_types_set = set(data['blood_type'])
bogus_blood_type = unique_blood_types_set - valid_blood_types_set
print("Bogus blood types: ", bogus_blood_type)

# Collect the index labels of the offending rows and drop them into a new
# frame; `data` itself is left unmodified.
bogus_recored_index = data.index[data['blood_type'].isin(bogus_blood_type)]
data_cleaned = data.drop(index=bogus_recored_index)

# Confirm which blood types remain after the drop.
print(data_cleaned['blood_type'].unique())

In [None]:
# Handling Inconsistent Marriage Status Categories

# Print the raw categories first: a bare expression in the middle of a cell is
# evaluated but never displayed, so the "before" view would otherwise be lost.
print(data['marriage_status'].unique())

# Work on a copy so the loaded frame stays untouched, then lower-case every
# label so spellings that differ only by letter case collapse into one category.
inconsistent_data = data.copy()
inconsistent_data['marriage_status'] = inconsistent_data['marriage_status'].str.lower()

# Last expression of the cell: rendered as the cell output (the "after" view).
inconsistent_data['marriage_status'].unique()

In [None]:
# Grouping Income into Meaningful Bins

# Show the observed income range to sanity-check the bin edges chosen below.
print(f"{data['income'].min()}, {data['income'].max()}")
income_bins = [40000, 75000, 100000, 125000, 150000, npy.inf]
income_labels = ['40-75K', '75-100K', '100-125K', '125-150K', '150K+']

# Bin on a copy so the loaded frame stays untouched.
remapping_data = data.copy()
# pd.cut builds right-closed intervals by default, i.e. (40000, 75000], which
# would leave an income of exactly 40000 outside every bin (NaN).
# include_lowest=True closes the first interval on the left so that value
# lands in '40-75K'. Incomes below 40000 still map to NaN.
remapping_data['income_groups'] = pd.cut(
  remapping_data['income'],
  bins=income_bins,
  labels=income_labels,
  include_lowest=True,
)

remapping_data.head()

In [None]:
# Visualizing Income Group Distribution

# Rows per income group; sort_index orders the bars by group rather than by count.
income_group_counts = remapping_data['income_groups'].value_counts().sort_index()

# Draw on an explicit figure/axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots()
income_group_counts.plot.bar(ax=ax)
ax.set_title('Income Group Distribution')
ax.set_xlabel('Income Groups')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()


In [None]:
# cleaning phone number data: build a synthetic sample of raw phone numbers

import random

# Dedicated, seeded generator so a fresh kernel (Restart & Run All) reproduces
# exactly the same synthetic numbers and does not disturb the global `random` state.
PHONE_SEED = 42
phone_rng = random.Random(PHONE_SEED)

phone_numbers = []
for i in range(100):
  number = phone_rng.randint(100000000, 9999999999)  # length can be 9 or 10 digits
  # Every other entry carries a '+91 ' prefix so the sample mixes both formats.
  prefix = '+91 ' if i % 2 == 0 else ''
  phone_numbers.append(prefix + str(number))

# Single-column frame holding the raw strings.
phone_numbers_data = pd.DataFrame({
  'phone_numbers': phone_numbers
})

phone_numbers_data.head()

Unnamed: 0,phone_numbers
0,+91 1639019053
1,2962671507
2,+91 8250389101
3,2065513220
4,+91 209719993
