In [None]:
# PW_3
import pandas as pd
import sqlite3
import os

# Directory that receives the pipeline's output files; created at import time
# if it does not already exist.
output_dir = "./data"
os.makedirs(output_dir, exist_ok=True)

# Kaggle dataset download endpoints. load_datasets() reads each one as a
# zip-compressed CSV.
url_election_data = "https://www.kaggle.com/api/v1/datasets/download/essarabi/ultimate-us-election-dataset"
url_air_quality_data = "https://www.kaggle.com/api/v1/datasets/download/sumandey/daily-air-quality-dataset-india"

def load_datasets():
    """Download both source datasets and return them as DataFrames.

    Each URL is read with ``pd.read_csv(..., compression='zip')``, election
    data first, then air-quality data.

    Returns:
        A ``(election_data, air_quality_data)`` tuple of DataFrames.
    """
    sources = (url_election_data, url_air_quality_data)
    election_frame, air_quality_frame = (
        pd.read_csv(source, compression='zip') for source in sources
    )
    return election_frame, air_quality_frame

def preprocess_election_data(election_data):
    """Select, rename and clean the election columns used by the pipeline.

    Keeps ``county``, ``state``, ``Density per square km`` and
    ``Median income (dollars)``, renames the last two to
    ``population_density`` and ``median_income``, and drops every row that
    has a missing value in any of the four columns.

    Args:
        election_data: DataFrame containing at least the four source columns.

    Returns:
        A new DataFrame with columns ``county``, ``state``,
        ``population_density`` and ``median_income``. The input frame is
        left untouched.

    Raises:
        KeyError: If any of the required source columns is missing.
    """
    selected = election_data[
        ['county', 'state', 'Density per square km', 'Median income (dollars)']
    ]
    renamed = selected.rename(
        columns={
            'Density per square km': 'population_density',
            'Median income (dollars)': 'median_income',
        }
    )
    # Non-inplace dropna: mutating the column-selection slice in place is
    # what triggered pandas' SettingWithCopyWarning.
    return renamed.dropna()

def preprocess_air_quality_data(air_quality_data):
    """Reduce the air-quality readings to one mean value per city.

    Keeps ``CITY``, ``DATE`` and ``VALUE``, renames them to ``city``,
    ``date`` and ``pm2.5``, drops rows with a missing value in any of the
    three, parses ``date`` as datetimes, and averages ``pm2.5`` per city.

    Args:
        air_quality_data: DataFrame containing at least ``CITY``, ``DATE``
            and ``VALUE`` columns.

    Returns:
        A new DataFrame with columns ``city`` and ``pm2.5`` (the per-city
        mean), one row per city. The input frame is left untouched.

    Raises:
        KeyError: If a required source column is missing.
        ValueError: If a ``DATE`` value cannot be parsed by
            ``pd.to_datetime``.
    """
    cleaned = (
        air_quality_data[['CITY', 'DATE', 'VALUE']]
        .rename(columns={'CITY': 'city', 'DATE': 'date', 'VALUE': 'pm2.5'})
        .dropna()
    )
    # The parsed dates are not used by the aggregation below; parsing is kept
    # so that malformed dates still raise exactly as before. assign() builds
    # a new frame instead of writing into a slice, avoiding the
    # SettingWithCopyWarning.
    cleaned = cleaned.assign(date=pd.to_datetime(cleaned['date']))
    return cleaned.groupby('city', as_index=False)['pm2.5'].mean()

def merge_data(election_data, air_quality_data):
    """Inner-join election rows to air-quality rows on lowercased place name.

    ``county`` (election side) and ``city`` (air-quality side) are lowercased
    and used as the join keys, so matching is case-insensitive.

    Args:
        election_data: DataFrame with a string ``county`` column.
        air_quality_data: DataFrame with a string ``city`` column.

    Returns:
        A new DataFrame holding only rows whose lowercased ``county`` equals
        a lowercased ``city``; both key columns appear lowercased in the
        result.
    """
    # assign() returns new frames, so the caller's DataFrames keep their
    # original casing instead of being overwritten as a side effect.
    left = election_data.assign(county=election_data['county'].str.lower())
    right = air_quality_data.assign(city=air_quality_data['city'].str.lower())
    return pd.merge(left, right, left_on='county', right_on='city', how='inner')

def save_data(merged_data):
    """Write the merged dataset to a CSV file and a SQLite table.

    Both files are created inside ``output_dir``:
    ``merged_population_density_air_quality.csv`` and
    ``population_density_air_quality.db`` (table ``merged_data``, replaced
    if it already exists). The DataFrame index is not written to either.

    Args:
        merged_data: DataFrame to persist.
    """
    csv_path = os.path.join(output_dir, 'merged_population_density_air_quality.csv')
    merged_data.to_csv(csv_path, index=False)

    sqlite_path = os.path.join(output_dir, 'population_density_air_quality.db')
    conn = sqlite3.connect(sqlite_path)
    try:
        merged_data.to_sql('merged_data', conn, if_exists='replace', index=False)
    finally:
        # Close even when to_sql raises so the database handle is not leaked.
        conn.close()

def main():
    """Run the pipeline end to end: load, clean, merge, save, then report."""
    raw_election, raw_air_quality = load_datasets()
    cleaned_election = preprocess_election_data(raw_election)
    cleaned_air_quality = preprocess_air_quality_data(raw_air_quality)
    save_data(merge_data(cleaned_election, cleaned_air_quality))
    print("Data pipeline executed successfully. Merged dataset saved in /data directory.")

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
#done:)

Data pipeline executed successfully. Merged dataset saved in /data directory.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  election_data.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  air_quality_data.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  air_quality_data['date'] = pd.to_datetime(air_quality_data['date'])
