#### Adding imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2

#### Variables: columns excluded from the summary statistics

In [None]:
# Generated ID columns that are left out of the mean/median summary.
excludedColumns = [
    'education_id',
    'occupation_id',
    'marital_id',
]
# Columns that are left out of the mode summary.
excludedColumnsMode = [
    'capital-gain',
    'capital-loss',
]

#### Read the adult.data file and assign column names based on adult.names

In [None]:
# Load the header-less adult.data file.
# skipinitialspace=True drops the blank that follows each comma, so string fields
# load as 'Private' rather than ' Private'.
# NOTE(review): assumes the file uses ", " as its field separator -- the later
# cleanup cells strip values before comparing them, which suggests it does; confirm.
df = pd.read_csv('./data/adult.data', header=None, sep=",", skipinitialspace=True)

# Column names in file order (taken from adult.names).
df.columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income",
]

# Keep a named-column copy of the raw data before any columns are dropped.
df.to_csv('./csv/temp_data.csv', index=False)

#### Drop the fnlwgt column

In [None]:
# Remove the fnlwgt column; everything below works on the remaining columns.
df = df.drop("fnlwgt", axis="columns")

####  Normalize education and education-num and convert to csv format

In [None]:
# Lookup table: one row per distinct (education, education-num) pair,
# in order of first appearance, with a fresh 0..n-1 index.
education_map = df.loc[:, ["education", "education-num"]].drop_duplicates(ignore_index=True)
print(education_map, type(education_map))

# 1-based ID derived from the row position in the lookup table.
education_map = education_map.assign(education_id=lambda lookup: lookup.index + 1)
education_map.to_csv("./csv/education.csv", index=False)

#### Merge education_id into the DataFrame (replacing education and education-num)

In [None]:
# Attach education_id to every row via a left join on the two education columns...
df = pd.merge(df, education_map, how="left", on=["education", "education-num"])
# ...then drop the original columns, which the ID now stands in for.
df = df.drop(["education", "education-num"], axis="columns")
print(df.head())

####  Normalize marital status and convert into the csv file

In [None]:
# Lookup table: one row per distinct marital-status value, in order of first appearance.
martial_map = df[["marital-status"]].drop_duplicates(ignore_index=True)
# 1-based ID derived from the row position in the lookup table.
martial_map = martial_map.assign(marital_id=lambda lookup: lookup.index + 1)
martial_map.to_csv("./csv/marital_status.csv", index=False)

#### Merge marital_id into the DataFrame (replacing marital-status)

In [None]:
# Attach marital_id with a left join, then drop the text column it replaces.
df = pd.merge(df, martial_map, how="left", on=["marital-status"])
df = df.drop("marital-status", axis="columns")

#### Normalize occupation

In [None]:
# Lookup table: one row per distinct occupation value, in order of first appearance.
# NOTE(review): this runs before the '?' cleanup further down, so a '?' occupation
# (if the data contains one) would receive an ID like any other value -- confirm
# that this is intended.
occupation_map = df["occupation"].drop_duplicates().reset_index(drop=True).to_frame()
# 1-based ID derived from the row position in the lookup table.
occupation_map = occupation_map.assign(occupation_id=lambda lookup: lookup.index + 1)
occupation_map.to_csv("./csv/occupation.csv", index=False)

#### Merge occupation_id into the DataFrame (replacing occupation)

In [None]:
# Attach occupation_id with a left join, then drop the text column it replaces.
df = pd.merge(df, occupation_map, how="left", on=["occupation"])
df = df.drop("occupation", axis="columns")

#### Re-ordering income as the last column

In [None]:
# Take the income column out of the frame and re-insert it after the last
# remaining column, so it ends up rightmost.
income_column = df.pop("income")
df.insert(len(df.columns), "income", income_column)

#### Check if the dataset has missing values
- If every count is 0, there are no missing values.

In [None]:
# Per-column count of missing (NaN/None) cells.
missing_per_column = df.isna().sum()
print(missing_per_column)

#### Check if the dataset contains "?" placeholder values
- If any are found, convert them to NaN.

In [None]:

# Flag every cell whose text, once stripped of surrounding whitespace, is the
# '?' placeholder, and report how many there are per column.
is_placeholder = df.map(lambda x: str(x).strip() == '?')
print(is_placeholder.sum())

# Replace the flagged cells with a real missing value (np.nan) rather than the
# string 'NaN': a string is just another category to pandas, so isnull(),
# dropna(), fillna() and describe() would not treat it as missing.
# Note that to_csv writes np.nan as an empty field by default.
df = df.mask(is_placeholder, np.nan)

#### Check if the dataset contains blank (whitespace-only) values
- If any are found, convert them to NaN.

In [None]:
# Flag cells that are empty or whitespace-only. The stripped text has to be
# compared with '' -- strip() removes the spaces, so a comparison with ' '
# could never be true and blank cells would go undetected.
is_blank = df.map(lambda x: str(x).strip() == '')
print(is_blank.sum())

# Replace blank cells with a real missing value (np.nan), not the string "NaN",
# so that isnull()/dropna()/fillna() recognise them.
df = df.mask(is_blank, np.nan)

#### Convert the cleaned data into csv 

In [None]:
# Persist the cleaned frame (row index omitted), then preview the first rows.
df.to_csv('./csv/final_data.csv', index=False)
preview = df.head()
print(preview)

In [None]:
# Summary statistics for the numeric columns.
#
# The *_id columns are generated lookup keys (row position + 1), so their
# mean, median, variance, standard deviation and quartiles carry no meaning.
# They are excluded from all of those statistics, not only mean and median.
measure_df = df.drop(columns=excludedColumns)

# Central tendency
print("Mean:\n", measure_df.mean(numeric_only=True))
print("Medians:\n", measure_df.median(numeric_only=True))
# The mode keeps the ID columns (most frequent category) and instead skips
# the columns listed in excludedColumnsMode.
print("Mode:\n", df.drop(columns=excludedColumnsMode).mode(numeric_only=True))

# Spread
print("Variance:\n", measure_df.var(numeric_only=True))
print("\nStandard Deviation:\n", measure_df.std(numeric_only=True))

# Quartiles (25th, 50th and 75th percentiles)
print("Quartiles:\n", measure_df.quantile([0.25, 0.5, 0.75], numeric_only=True))

In [None]:
# Scatter plot of age against hours-per-week, coloured by income group.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df, x='age', y='hours-per-week', hue='income', ax=ax)
ax.set_title('Age vs Hours-per-week')
plt.show()

In [None]:
# Histogram of the age column using 15 bins.
fig, ax = plt.subplots(figsize=(6, 4))
df['age'].hist(bins=15, ax=ax)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Box-and-whisker plot of capital-gain, one box per income group.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='income', y='capital-gain', ax=ax)
ax.set_title('Capital Gain Distribution by Income Group')
plt.show()

In [None]:
# Venn diagram of two row sets, identified by their index labels:
#   set1 -- rows with a positive capital-gain
#   set2 -- rows with more than 40 hours-per-week
has_capital_gain = df['capital-gain'].gt(0)
works_over_40 = df['hours-per-week'].gt(40)
set1 = set(df.loc[has_capital_gain].index)
set2 = set(df.loc[works_over_40].index)

plt.figure(figsize=(6, 4))
venn2(subsets=[set1, set2], set_labels=('Capital Gain > 0', 'Hours/Week > 40'))
plt.title('Venn Diagram Example')
plt.show()

In [None]:
# Descriptive summary of every column; include='all' covers the non-numeric
# columns as well as the numeric ones.
overall_summary = df.describe(include='all')
print(overall_summary)
