In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the two datasets from your notebook
# Input files, resolved relative to the notebook's working directory.
# Kept as named constants so they can be repointed in one place.
TAX_DELINQUENCY_PATH = 'taxdel.tsv'
POVERTY_PATH = 'poverty_data.csv'

# Columns the rest of this cell reads from each dataset. They are checked
# right after loading so a schema mismatch fails here with a clear message
# instead of as a bare KeyError in the middle of the analysis.
TAX_REQUIRED_COLUMNS = ['neighborhood']
POVERTY_REQUIRED_COLUMNS = ['hood', 'pct_under_']

print("Loading datasets...")

try:
    # 1. Tax delinquency data (tab-separated; 'utf-8-sig' tolerates a leading BOM)
    tax_df = pd.read_csv(TAX_DELINQUENCY_PATH, sep='\t', encoding='utf-8-sig')

    # 2. Poverty data (presumably one row per neighborhood -- TODO confirm)
    poverty_df = pd.read_csv(POVERTY_PATH)
except FileNotFoundError as exc:
    # Re-raise the same exception type, keeping errno and filename, but with
    # a hint about how to fix it.
    raise FileNotFoundError(
        exc.errno,
        "Input file not found; put it in the notebook's working directory "
        "or update TAX_DELINQUENCY_PATH / POVERTY_PATH",
        exc.filename,
    ) from exc

for dataset_label, dataset, required_columns in (
    ('tax delinquency', tax_df, TAX_REQUIRED_COLUMNS),
    ('poverty', poverty_df, POVERTY_REQUIRED_COLUMNS),
):
    missing_columns = [col for col in required_columns if col not in dataset.columns]
    if missing_columns:
        raise KeyError(
            f"{dataset_label} data is missing expected column(s): {missing_columns}"
        )

print(f"Tax Delinquencies: {len(tax_df)} records")
print(f"Neighborhoods: {len(poverty_df)} records")

# Count tax delinquencies by neighborhood
# Records with a missing or empty neighborhood cannot be attributed to any
# neighborhood, so they are excluded before counting.
tax_with_neighborhood = tax_df[tax_df['neighborhood'].notna() & (tax_df['neighborhood'] != '')]
tax_counts = tax_with_neighborhood['neighborhood'].value_counts()

# Combine both datasets
# One output row per poverty_df row. The delinquency count is looked up by
# neighborhood name; names with no tax record map to NaN and become 0.
# Building the frame column-wise (instead of appending dicts in an
# iterrows() loop) also guarantees the three columns exist when poverty_df
# is empty.
analysis_df = pd.DataFrame({
    'neighborhood': poverty_df['hood'],
    'tax_delinquencies': poverty_df['hood'].map(tax_counts).fillna(0).astype('int64'),
    'pct_poverty': poverty_df['pct_under_'],
}).reset_index(drop=True)

# Calculate composite score (lower is better)
# Each metric is rescaled to 0-100 relative to the largest value observed,
# then the two rescaled metrics are averaged with equal weight.
max_tax = analysis_df['tax_delinquencies'].max()
max_poverty = analysis_df['pct_poverty'].max()

# If a metric's maximum is 0 (or NaN), dividing by it would turn every score
# into NaN. Fall back to a scale of 1 so an all-zero metric scores 0.
tax_scale = max_tax if max_tax > 0 else 1
poverty_scale = max_poverty if max_poverty > 0 else 1

analysis_df['tax_score'] = (analysis_df['tax_delinquencies'] / tax_scale) * 100
analysis_df['poverty_score'] = (analysis_df['pct_poverty'] / poverty_scale) * 100
analysis_df['composite_score'] = (analysis_df['tax_score'] + analysis_df['poverty_score']) / 2

# Neighborhoods with a missing input have a NaN composite score. sort_values
# would place them last, where they would be reported as the "worst"
# neighborhoods, so they are left out of the ranking (they stay in analysis_df).
unscored_count = int(analysis_df['composite_score'].isna().sum())
if unscored_count:
    print(f"Note: {unscored_count} neighborhood(s) have no composite score and are excluded from the ranking")

# Sort by composite score
analysis_df_sorted = (
    analysis_df
    .dropna(subset=['composite_score'])
    .sort_values('composite_score')
)

# Text report: the ten lowest-scoring (best) neighborhoods, then a summary
# of the single best one.
rule = "=" * 70

print("\n".join(["", rule, "TOP 10 BEST NEIGHBORHOODS", rule]))
top_10 = analysis_df_sorted.head(10)
for entry in top_10.itertuples(index=False):
    print(f"{entry.neighborhood:25} | Tax Delinq: {int(entry.tax_delinquencies):4} | Poverty: {entry.pct_poverty:.1f}% | Score: {entry.composite_score:.2f}")

# First row of the ascending sort is the lowest composite score.
best = analysis_df_sorted.iloc[0]
summary_lines = [
    "",
    rule,
    f"BEST NEIGHBORHOOD: {best['neighborhood']}",
    rule,
    f"Tax Delinquencies: {int(best['tax_delinquencies'])}",
    f"Poverty Rate: {best['pct_poverty']:.1f}%",
    f"Composite Score: {best['composite_score']:.2f}",
]
print("\n".join(summary_lines))

# Visualization: Scatter plot
# Poverty rate on x, delinquency count on y, points colored by composite
# score using a reversed red-yellow-green map (low scores green, high red).
scatter_fig, scatter_ax = plt.subplots(figsize=(12, 8))
points = scatter_ax.scatter(
    analysis_df['pct_poverty'],
    analysis_df['tax_delinquencies'],
    s=100,
    c=analysis_df['composite_score'],
    cmap='RdYlGn_r',
    alpha=0.6,
)
scatter_fig.colorbar(points, ax=scatter_ax, label='Composite Score (Lower is Better)')
scatter_ax.set_xlabel('Poverty Rate (%)', fontsize=12)
scatter_ax.set_ylabel('Number of Tax Delinquent Properties', fontsize=12)
scatter_ax.set_title(
    'Pittsburgh Neighborhoods: Tax Delinquency vs Poverty Rate',
    fontsize=14,
    fontweight='bold',
)
scatter_ax.grid(True, alpha=0.3)

# Annotate best neighborhood
# The label is offset by 10 points from the marker so it does not cover it.
label_box = dict(boxstyle='round,pad=0.5', fc='green', alpha=0.5)
scatter_ax.annotate(
    best['neighborhood'],
    xy=(best['pct_poverty'], best['tax_delinquencies']),
    xytext=(10, 10),
    textcoords='offset points',
    bbox=label_box,
    fontsize=10,
    fontweight='bold',
)
scatter_fig.tight_layout()
plt.show()

# Bar chart: Top 10 Best vs Worst
# Two side-by-side horizontal bar charts of composite score: the first ten
# rows of the ascending ranking on the left, the last ten on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

best_10 = analysis_df_sorted.head(10)
worst_10 = analysis_df_sorted.tail(10)

panels = (
    (ax1, best_10, 'green', 'Top 10 BEST Neighborhoods'),
    (ax2, worst_10, 'red', 'Top 10 WORST Neighborhoods'),
)
for panel_ax, ranked, bar_color, heading in panels:
    positions = range(len(ranked))
    panel_ax.barh(positions, ranked['composite_score'], color=bar_color, alpha=0.7)
    panel_ax.set_yticks(positions)
    panel_ax.set_yticklabels(ranked['neighborhood'])
    panel_ax.set_xlabel('Composite Score', fontsize=11)
    panel_ax.set_title(heading, fontsize=13, fontweight='bold')

# Only the "best" panel is flipped, so its first row (lowest score) is drawn
# at the top. The "worst" panel keeps the default bottom-to-top order, which
# already puts its last row (highest score) at the top.
ax1.invert_yaxis()

plt.tight_layout()
plt.show()

Loading datasets...


FileNotFoundError: [Errno 2] No such file or directory: 'poverty_data.csv'