# 📘 PEW Research Dataset — Full Data Cleaning, Exploration & Visualization Notebook

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw CSV (path is relative to the working directory) into a
# DataFrame; the bare name on the last line makes the notebook display it.
df = pd.read_csv(filepath_or_buffer="pew-raw.csv")
df

## 🔍 2. Initial Exploration

In [None]:
# Preview the first rows (5 is the pandas default for head()).
df.head(n=5)

In [None]:
# Preview the last rows (5 is the pandas default for tail()).
df.tail(n=5)

In [None]:
# Label-based slice: .loc includes the end label, so this covers row labels
# 0 through 3 and every column.
df.loc[0:3, :]

In [None]:
# Position-based slice: .iloc excludes the stop position, so this is the
# first 3 rows and the first 4 columns.
df.iloc[:3, :4]

## ⚠️ 3. Identify Issues
- The income-bracket column headers are values, not variable names.
- Some column names contain leading/trailing spaces.
- Religion names also contain trailing spaces.
- Dataset is in wide format, not tidy.

## 🧼 4. Clean Column Names

In [None]:
# Normalise the header labels: trim surrounding whitespace, then remove any
# spaces that remain inside a label. regex=False pins literal matching so the
# result does not depend on the pandas version's default for `regex`.
df.columns = df.columns.str.strip().str.replace(" ", "", regex=False)
# Trim surrounding whitespace from the religion values as well.
df['religion'] = df['religion'].str.strip()
df

## 🔄 5. Reshape to Long Format Using `melt`

In [None]:
# Go from wide to long: "religion" stays as the identifier, every other column
# header moves into "income_bracket" and its cell value into "count".
df_long = pd.melt(
    df,
    id_vars=["religion"],
    var_name="income_bracket",
    value_name="count",
)
df_long

## 📊 6. Summary Statistics

In [None]:
# Descriptive statistics for the "count" column of the long table.
df_long.loc[:, 'count'].describe()

In [None]:
# Total count per religion, largest first.
(
    df_long
    .groupby('religion')['count']
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Total count per income bracket, largest first.
(
    df_long
    .groupby('income_bracket')['count']
    .sum()
    .sort_values(ascending=False)
)

## 🎨 7. Visualizations

In [None]:
# Bar chart of the summed counts per religion.
plt.figure(figsize=(12, 6))
sns.barplot(
    x='religion',
    y='count',
    # as_index=False keeps "religion" as a regular column so seaborn can
    # reference it by name.
    data=df_long.groupby('religion', as_index=False)['count'].sum(),
)
# Rotate and right-align the tick labels so they stay readable.
plt.xticks(rotation=45, ha='right')
plt.title("Total Respondents by Religion")
plt.show()

In [None]:
# Grouped bar chart: one cluster per religion, one bar per income bracket.
plt.figure(figsize=(14, 6))
sns.barplot(
    x='religion',
    y='count',
    hue='income_bracket',
    data=df_long,
)
# Rotate and right-align the tick labels so they stay readable.
plt.xticks(rotation=45, ha='right')
plt.title("Income Distribution by Religion")
plt.legend(title="Income Bracket")
plt.show()

In [None]:
# Build a religion x income-bracket table of counts. pivot() returns the new
# column labels sorted lexicographically, so restore the order in which the
# brackets appear in df_long to keep the heatmap columns in their source order.
bracket_order = df_long['income_bracket'].unique()
pivot = df_long.pivot(index='religion', columns='income_bracket', values='count')
pivot = pivot.reindex(columns=bracket_order)

plt.figure(figsize=(12,6))
# fmt='d' prints the annotations as integers, so the cells must hold integers.
sns.heatmap(pivot, annot=True, fmt='d', cmap='Blues')
plt.title("Heatmap of Religion vs Income Bracket")
plt.show()

## 📁 8. Export the Tidy Dataset

In [None]:
# Write the long-format table to disk; index=False leaves the row index out
# of the file.
df_long.to_csv(path_or_buf="pew-tidy.csv", index=False)