In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
import re

import openpyxl

#from dotenv import load_dotenv
import os


# Precip Data

### Scraping data 

In [6]:
# Source file on the NOAA NCEI server (fixed-width text).
# NOTE(review): the file name contains "pdsi" while the rest of the notebook
# treats the values as precipitation -- confirm this is the intended dataset.
url = "https://www.ncei.noaa.gov/pub/data/cirs/drd/drd964x.pdsi.txt"

# Local output files: the raw download and the parsed (not yet cleaned) CSV.
txt_name = "rain.txt"
csv_name = "rain_dirty.csv"

# Fixed-width layout specific to this file.
# The first 10 characters of each record pack four identifier fields
# (2 + 2 + 2 + 4 characters); twelve 7-character monthly fields follow.
colspecs = [(0, 2), (2, 4), (4, 6), (6, 10)] + [
    (start, start + 7) for start in range(10, 94, 7)
]

# Column names, in the same order as colspecs.
cols = ["state", "division", "element", "year"] + [
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec",
]


def write_txt(r):
    """Save the raw body of a downloaded response to the local text file.

    r: the object returned by requests.get(); its .content bytes are written
       unchanged to the path held in txt_name, replacing any existing file.

    Returns nothing.
    """
    with open(txt_name, mode="wb") as txt_file:
        txt_file.write(r.content)


def txt_to_csv():
    """Parse the downloaded fixed-width text file and save it as a CSV.

    Reads txt_name using the colspecs/cols layout, converts every column to a
    numeric dtype (values that cannot be parsed become NaN), and writes the
    result to csv_name without the index column.

    Takes no arguments and returns nothing.
    """
    parsed = pd.read_fwf(txt_name, names=cols, colspecs=colspecs)
    numeric = parsed.apply(pd.to_numeric, errors="coerce")
    numeric.to_csv(csv_name, index=False)


def read_url_txt(url, timeout=30):
    """Download a text file and convert it to the local txt and CSV files.

    url: address of the file to fetch. It must point at a plain-text file in
         the fixed-width layout that txt_to_csv() parses.
    timeout: seconds to wait on the server before requests gives up and
             raises a timeout error. Without it the call can block forever.

    If the response status is not 200, a message is printed and nothing is
    written. Otherwise the body is saved with write_txt() and then converted
    with txt_to_csv(). Returns None in both cases.
    """
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        print(f"url status code is {r.status_code} not 200. Please check your url")
        return
    write_txt(r)
    txt_to_csv()

# Fetch the file at `url` now; on success this writes the txt_name and
# csv_name files. Requires network access when the cell runs.
read_url_txt(url)

### Cleaning data
Create a new CSV that contains the precipitation data averaged per state and year, together with a standardized (z-score) column.

In [None]:
# Load the parsed download and discard the two identifier columns that the
# per-state aggregation below does not use.
df = pd.read_csv("rain_dirty.csv").drop(columns=["division", "element"])

# Monthly value columns, January through December.
months = [
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec",
]

def normalized_data(df, month_cols=None, out_path="rain_clean.csv"):
    """Average the monthly values per state and year, standardize, and save.

    df: frame with "state" and "year" columns plus one column per month
        (several rows may share a state and year, e.g. one per division).
        The frame is not modified.
    month_cols: names of the monthly columns to average. Defaults to the
        notebook-level `months` list.
    out_path: CSV file to write. Defaults to "rain_clean.csv".

    Returns the per-state, per-year frame with the columns state, year,
    avg_precip and stand_precip; the same frame is written to out_path.
    """
    if month_cols is None:
        month_cols = months

    # NOTE(review): NOAA divisional files commonly mark missing months with a
    # sentinel value (e.g. -99.99) instead of a blank. If this file does, those
    # values are averaged as real data here -- confirm and mask them upstream.

    # Mean across the months of each row: one yearly value per input row.
    yearly_avg = df[month_cols].mean(axis=1)

    # Mean of those yearly values over all rows sharing a state and year.
    # Built on a narrow derived frame so the caller's frame stays untouched.
    state_precip = (
        df[["state", "year"]]
        .assign(yearly_avg=yearly_avg)
        .groupby(["state", "year"])["yearly_avg"]
        .mean()
        .reset_index(name="avg_precip")
    )

    # Standardize across every state-year at once: (x - mean) / std.
    overall_mean = state_precip["avg_precip"].mean()
    overall_std = state_precip["avg_precip"].std()
    state_precip["stand_precip"] = (
        state_precip["avg_precip"] - overall_mean
    ) / overall_std

    # The index column is kept in the file, as before, so existing readers of
    # the CSV see the same layout.
    state_precip.to_csv(out_path)
    return state_precip


# Farm data

In [None]:
# Quick look at the workbook: with no sheet_name given, pd.read_excel loads
# only the first sheet.
farm_data = pd.read_excel("VA_State_US.xlsx")
farm_data.head()

In [18]:
excel_path = "VA_State_US.xlsx"

# Read all sheets
# `all_sheets` is a dictionary: {sheet_name: DataFrame}
all_sheets = pd.read_excel(excel_path, sheet_name=None)

# Starting value for the combined per-state frame that a later cell builds.
all_state = pd.DataFrame()



In [46]:
# Export every per-state sheet to its own CSV and rebuild the combined frame.
# The sheet named "VA_State_US" is skipped (presumably the workbook's summary
# sheet rather than a state -- confirm).
#
# The combined frame is rebuilt from scratch, so re-running this cell does not
# append the same rows twice. Sheets are tagged on a copy, so the frames held
# in all_sheets are left unchanged. The loop variable is not called `df`, so
# the rainfall frame of that name is not overwritten.
os.makedirs("state_farm_data", exist_ok=True)  # to_csv does not create folders

state_frames = []
for sheet_name, sheet in all_sheets.items():
    if sheet_name == "VA_State_US":
        continue
    tagged = sheet.assign(state=sheet_name)
    tagged.to_csv(f"state_farm_data/{sheet_name}.csv")
    state_frames.append(tagged)

# One concat at the end instead of one per sheet; pd.concat rejects an empty
# list, so fall back to an empty frame when no sheet qualified.
all_state = pd.concat(state_frames) if state_frames else pd.DataFrame()

In [48]:
# Save the combined per-state frame, then preview it. The preview must be the
# cell's last expression, otherwise the notebook does not render it.
all_state.to_csv("all_state.csv")
all_state.head()

In [34]:

# Stack every sheet of the workbook into one frame with a fresh 0..n-1 index.
# No sheet is skipped here, unlike the per-state export loop. The first rows
# are printed as plain text and the full frame is saved to test.csv.
combined_df = pd.concat(all_sheets.values(), ignore_index=True)
print(combined_df.head())
combined_df.to_csv("test.csv")


   VA_State_US     Unnamed: 1  \
0          NaN  United States   
1          NaN        Alabama   
2          NaN         Alaska   
3          NaN        Arizona   
4          NaN       Arkansas   

  Value added to the U.S. economy by the agricultural sector, 1910-2025F\nNominal (current dollars)\n  \
0                                                NaN                                                    
1                                                NaN                                                    
2                                                NaN                                                    
3                                                NaN                                                    
4                                                NaN                                                    

  Unnamed: 2 Unnamed: 3  Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7  \
0        NaN        NaN         NaN        NaN        NaN        NaN   
1        NaN        NaN   