In [1]:
import os
import re
import urllib
import urllib.request
from datetime import datetime

import pandas as pd


# Directory that dataframe() scans for the downloaded vhi_id*.csv files.
# NOTE(review): hard-coded absolute local path; presumably needs editing on
# any other machine — confirm.
PATH = r'/Users/uuriyzhuribeda/Documents/ds/AD1'

In [2]:
def get_data(n):
    """Download the NOAA time series for one province and save it as CSV.

    The request asks for ``country=UKR``, ``type=Mean`` and the years
    1981-2021 for the province whose ``provinceID`` is ``n``. The response
    body is written unchanged to ``vhi_id_<n>_<dd-mm-YYYY_HH-MM>.csv`` in the
    current working directory (the file name is relative).

    Parameters
    ----------
    n : int or str
        Province ID substituted into the URL and the output file name.

    Raises
    ------
    urllib.error.URLError
        If the request fails; no output file is created in that case.
    OSError
        If the output file cannot be written.
    """
    # Timestamp keeps repeated downloads of the same province from
    # overwriting each other (minute resolution).
    timestamp = datetime.now().strftime("%d-%m-%Y_%H-%M")
    filename = 'vhi_id_{}_{}.csv'.format(n, timestamp)

    url='https://www.star.nesdis.noaa.gov/smcd/emb/vci/VH/get_TS_admin.php?country=UKR&provinceID={}&year1=1981&year2=2021&type=Mean'.format(n)

    # Context managers close both the HTTP response and the output file even
    # if reading or writing raises part-way through.
    with urllib.request.urlopen(url) as vhi_url, open(filename, 'wb') as out:
        out.write(vhi_url.read())

    print("File {} is ready".format(n))

In [3]:
# Download the series for province ID 1 only.
get_data(1)

File 1 is ready


In [4]:
def get_data_all():
    """Download the data file for every province ID from 1 through 27."""
    # range(1, 28) visits exactly the same IDs as the counter loop did.
    for province_id in range(1, 28):
        get_data(province_id)

In [5]:
import glob

def dataframe(PATH):
    """Load every ``vhi_id*.csv`` file under ``PATH`` into one cleaned frame.

    Each file is read with the column names ``year, week, smn, smt, vci, tci,
    vhi`` (the trailing ``empty`` column is discarded), rows whose ``vhi``
    equals -1 and rows containing NaN are dropped, and an ``area`` column is
    added holding the province ID parsed from the file name
    (``vhi_id_<ID>_<timestamp>.csv``, the pattern written by ``get_data``).

    The ID is taken from the file name rather than from the position in the
    directory listing, because ``glob.glob`` returns files in arbitrary
    order. If several downloads of the same province are present, the rows
    of all of them are included.

    Parameters
    ----------
    PATH : str
        Directory that contains the downloaded files.

    Returns
    -------
    pandas.DataFrame
        All files stacked in ascending province-ID order with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If no matching file exists, or a matching file name does not contain
        a numeric province ID.
    """
    headers = ['year', 'week', 'smn', 'smt', 'vci', 'tci', 'vhi', 'empty']
    # column names that are needed (everything except the trailing 'empty')
    headers_active = headers[:-1]
    id_pattern = re.compile(r'vhi_id_(\d+)_')

    # Pair every file with the province ID embedded in its name.
    tagged_files = []
    for filename in glob.glob(PATH + "/vhi_id*.csv"):
        match = id_pattern.match(os.path.basename(filename))
        if match is None:
            raise ValueError(
                "Cannot read a province ID from file name: {}".format(filename)
            )
        tagged_files.append((int(match.group(1)), filename))

    if not tagged_files:
        raise ValueError("No vhi_id*.csv files found in {}".format(PATH))

    li = []
    # Sorting makes the row order deterministic across runs and platforms.
    for province_id, filename in sorted(tagged_files):
        df = pd.read_csv(filename, index_col = None, header = 1, names = headers, usecols = headers_active)
        # data-cleaning block: drop rows with vhi == -1, then rows with NaN
        df = df[df['vhi'] != -1].dropna()
        # tag the rows so the province can be found in the combined dataset
        li.append(df.assign(area = province_id))

    # stack tables
    return pd.concat(li, axis = 0, ignore_index = True)

In [6]:
def change_index(frame):
    """Remap the ``area`` column of ``frame`` in place and save the result.

    Position ``k`` (1-based) in ``areas_list`` gives the new label for the
    old area index ``k``. The new labels are strings, exactly as listed, so
    after the call the column holds strings for every remapped value; values
    without an entry in the mapping are left untouched.

    The whole frame is then written to ``vhi_full.csv`` in the current
    working directory.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with an ``area`` column; it is modified in place.

    Returns
    -------
    None
    """
    #list of areas from the task
    areas_list = ["22", "24", "23", "25", "3", "4", "8", "19", "20", "21", "9", "26", "10", "11", "12", "13", "14", "15", "16", "27", "17", "18", "6", "1", "2", "7", "5", "eof"]
    # old index (1, 2, 3, ...) -> new label
    mapping = dict(enumerate(areas_list, start = 1))
    # One pass, assigned back to the column. The previous chained
    # `frame["area"].replace(..., inplace=True)` acts on a temporary object
    # and does not update `frame` under pandas Copy-on-Write.
    frame["area"] = frame["area"].map(lambda value: mapping.get(value, value))
    #creating csv file from the current dataframe
    frame.to_csv('vhi_full.csv')
    print("Indexes were changed")

In [7]:
def area_vhi(frame, area, year, show_table=None):
    """Print the VHI values of one area for one year, with their extremes.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data with ``area``, ``year`` and ``vhi`` columns.
    area, year
        Values compared with ``==`` against the ``area`` and ``year``
        columns, so their types must match the column contents.
    show_table : bool or None, optional
        ``True`` prints the selected rows, ``False`` skips them. ``None``
        (the default) asks interactively, as before; pass an explicit value
        so the notebook can run unattended.

    Returns
    -------
    None
        Everything is reported via ``print``. When no row matches, the list
        is empty and max/min are printed as ``nan``.
    """
    #creating new frame with needed data
    frame_area = frame[(frame["area"] == area) & (frame["year"] == year)]
    #list for vhi values
    vhi_list = frame_area["vhi"].tolist()

    if show_table is None:
        answ = input("Do you want to see the whole table for that year?(y)")
        show_table = answ in ("y", "Y")
    if show_table:
        print(frame_area)

    print("Vhi list:", vhi_list)

    print("Max value is: {}".format(frame_area["vhi"].max()))
    print("Min value is: {}".format(frame_area["vhi"].min()))

In [8]:
def drought(area, perc, _type, data=None):
    """Report the years in which an area had too many low-VHI weeks.

    For every year present for ``area`` the function counts the weeks whose
    ``vhi`` is strictly below ``_type``, prints the count next to the number
    of weeks available for that year, and prints the share as a percentage.
    Years whose percentage is strictly greater than ``perc`` are collected
    and printed at the end.

    Parameters
    ----------
    area
        Value compared with ``==`` against the ``area`` column.
    perc : float
        Percentage threshold; a year is "bad" when its share exceeds it.
    _type : float
        VHI threshold; a week is "bad" when its ``vhi`` is below it.
    data : pandas.DataFrame, optional
        Frame with ``area``, ``year`` and ``vhi`` columns. Defaults to the
        notebook-level ``frame`` variable, which is what the function always
        used before this parameter existed.

    Returns
    -------
    None
        Results are reported via ``print``.
    """
    if data is None:
        # Fall back to the global DataFrame created by an earlier cell.
        data = frame

    #creating new frame with needed data
    frame_vhi = data[data["area"] == area]
    #list of years actually used in research, in order of first appearance
    years = list(dict.fromkeys(frame_vhi["year"].tolist()))
    bad_years = []

    #getting info for each of the years from the list above
    for year in years:
        # Build the mask from frame_vhi itself: a mask taken from the full
        # frame has a different index and makes pandas warn and reindex it.
        frame_years = frame_vhi[frame_vhi["year"] == year]
        #getting the quantity of weeks as data-cleaning block may change it
        weeks = len(frame_years)
        #number of weeks whose vhi is below the threshold
        bad_weeks = int((frame_years["vhi"] < _type).sum())
        #steps
        print(year, "-", bad_weeks, "-", weeks)
        #% for each year
        percentage = (bad_weeks/weeks)*100
        print("% of the area:" , percentage)
        if percentage > perc:
            bad_years.append(year)
    print("Bad years are:", bad_years)

In [9]:
# Download the series for every province ID (1 through 27).
get_data_all()

File 1 is ready
File 2 is ready
File 3 is ready
File 4 is ready
File 5 is ready
File 6 is ready
File 7 is ready
File 8 is ready
File 9 is ready
File 10 is ready
File 11 is ready
File 12 is ready
File 13 is ready
File 14 is ready
File 15 is ready
File 16 is ready
File 17 is ready
File 18 is ready
File 19 is ready
File 20 is ready
File 21 is ready
File 22 is ready
File 23 is ready
File 24 is ready
File 25 is ready
File 26 is ready
File 27 is ready


In [26]:
# Build the combined DataFrame from the vhi_id*.csv files found under PATH.
frame = dataframe(PATH)

In [27]:
# Remap the 'area' indexes of `frame`; this call also writes vhi_full.csv.
change_index(frame)

Indexes were changed


In [28]:
# NOTE(review): `testdf` is not defined in any visible cell. The "Unnamed: 0"
# column in the output suggests it was read back from vhi_full.csv — confirm
# and define it explicitly so the notebook runs on a fresh kernel.
area_vhi(testdf, 2, 2002)

Do you want to see the whole table for that year?(y)y
       Unnamed: 0  year  week    smn     smt    vci    tci    vhi  area
49289       49289  2002   1.0  0.074  258.60  28.89  66.40  47.64     2
49290       49290  2002   2.0  0.068  259.67  24.13  64.98  44.55     2
49291       49291  2002   3.0  0.073  262.05  26.40  56.33  41.37     2
49292       49292  2002   4.0  0.081  264.56  30.91  46.36  38.63     2
49293       49293  2002   5.0  0.094  267.20  37.82  37.58  37.70     2
49294       49294  2002   6.0  0.111  269.51  47.00  33.83  40.42     2
49295       49295  2002   7.0  0.134  271.76  59.25  31.47  45.36     2
49296       49296  2002   8.0  0.155  273.91  69.18  28.11  48.65     2
49297       49297  2002   9.0  0.177  276.26  78.21  24.74  51.48     2
49298       49298  2002  10.0  0.197  278.48  82.93  24.44  53.68     2
49299       49299  2002  11.0  0.219  280.69  84.07  25.97  55.02     2
49300       49300  2002  12.0  0.238  282.47  82.25  27.87  55.06     2
49301     