# 2014-2015

In [92]:
import pandas as pd

df_2014 = pd.read_csv('./2014-15.csv')
df_2015 = pd.read_csv('./2015-16.csv')

In [93]:
import pandas as pd


def clean_df(df, columns_to_exclude, name):
    columns_to_convert = [col for col in df.columns if col not in columns_to_exclude]

    for col in columns_to_convert:
        try:
            df[col] = df[col].astype(float)
        except ValueError:
            df[col] = df[col].str.replace(',', '.').astype(float)

    # Save the corrected DataFrame to a new CSV file
    df = df.drop(['SN'], axis=1)
    df.to_csv(name, index=False)

clean_df(df_2014, ['District', 'Location'], "2014-15_corrected.csv")
clean_df(df_2015, ["Well D","District","Taluka","Location","Type"], "2015-16_corrected.csv")

# 2016

In [94]:
import glob

# Get all the CSV files in the current directory
files = glob.glob('table_data/2016/*.csv')
files

['table_data/2016/output_table_Gujarat_State_Year_Book_2016-17_page_7.csv',
 'table_data/2016/output_table_Gujarat_State_Year_Book_2016-17_page_6.csv',
 'table_data/2016/output_table_Gujarat_State_Year_Book_2016-17_page_4.csv',
 'table_data/2016/output_table_Gujarat_State_Year_Book_2016-17_page_5.csv',
 'table_data/2016/output_table_Gujarat_State_Year_Book_2016-17_page_1.csv',
 'table_data/2016/output_table_Gujarat_State_Year_Book_2016-17_page_2.csv',
 'table_data/2016/output_table_Gujarat_State_Year_Book_2016-17_page_3.csv']

In [95]:
df_ls = []

for file in files:
    df_ls.append(pd.read_csv(file))


df_2016 = pd.concat(df_ls)
df_2016

Unnamed: 0,District,Location,pH,EC,TDS,CO3,HCO3,Cl,NO3,SO4,F,Alk,Ca,Mg,TH,Na,K
0,Surat,Malda,7.76,1484.0,994,0.0,622,142,23,5.0,0.53,510,72,92.0,560.0,76.0,18.4
1,Surat,Mandvi2,7.82,1096.0,734,0.0,366,121,55,32.0,0.50,300,80,49.0,400.0,70.0,0.5
2,Surat,Mota,8.01,2812.0,1884,0.0,671,540,16,70.0,1.25,550,48,68.0,400.0,465.0,1.9
3,Surat,Moti Sarkui,7.79,720.0,482,0.0,305,57,26,18.0,0.33,250,24,39.0,220.0,70.0,0.4
4,Surat,Motichher,7.91,512.0,343,0.0,244,28,25,13.0,0.35,200,48,27.0,236.0,18.0,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,Kachchh,Bhadreshwar,8.09,1762.0,1181,0.0,329,425,6,10.0,1.80,270,132,26.0,440.0,210.0,9.2
77,Kachchh,Bhuj1,8.2,743.0,498,0.0,281,106,6,5.0,0.48,230,88,7.0,250.0,66.0,9.0
78,Kachchh,Chandranagar,8.06,3904.0,2616,0.0,366,1170,2,96.0,1.35,300,180,144.0,1050.0,472.0,13.0
79,Kachchh,Deshalpur rapar,8.2,1480.0,992,0.0,146,291,8,217.0,0.05,120,52,62.0,390.0,174.0,16.7


In [96]:
df_2016.to_csv('2016-17_corrected.csv', index=False)

# Merging and creating super data

In [97]:
df_2014 = pd.read_csv('2014-15_corrected.csv')
df_2015 = pd.read_csv('2015-16_corrected.csv')
df_2016 = pd.read_csv('2016-17_corrected.csv')

In [98]:
# rename column names to upper case
df_2014.columns = [col.upper() for col in df_2014.columns]
df_2015.columns = [col.upper() for col in df_2015.columns]
df_2016.columns = [col.upper() for col in df_2016.columns]

In [99]:
df_2014.columns

Index(['DISTRICT', 'LOCATION', 'PH', 'EC', 'TDS', 'CO3', 'HCO3', 'CI', 'NO3',
       'SO4', 'F', 'ALK', 'CA', 'MG', 'TH', 'NA', 'K', 'FE', 'SAR'],
      dtype='object')

In [100]:
df_2015.columns

Index(['WELL D', 'DISTRICT', 'TALUKA', 'LOCATION', 'TYPE', 'PH', 'EC', 'TDS',
       'CO3', 'HCO3', 'CI', 'NO3', 'SO4', 'F', 'ALK', 'CA', 'MG', 'TH', 'NA',
       'K', 'SAR', 'ARSENIC', 'UNNAMED: 23'],
      dtype='object')

In [101]:
df_2016.columns

Index(['DISTRICT', 'LOCATION', 'PH', 'EC', 'TDS', 'CO3', 'HCO3', 'CL', 'NO3',
       'SO4', 'F', 'ALK', 'CA', 'MG', 'TH', 'NA', 'K'],
      dtype='object')

In [102]:
df_2014["YEAR"] = 2014
df_2015["YEAR"] = 2015
df_2016["YEAR"] = 2016

In [103]:
# collect common columns from all three dataframes
common_cols = list(set(df_2014.columns) & set(df_2015.columns) & set(df_2016.columns))
common_cols

['HCO3',
 'PH',
 'YEAR',
 'NO3',
 'F',
 'NA',
 'CO3',
 'LOCATION',
 'ALK',
 'CA',
 'DISTRICT',
 'K',
 'MG',
 'TH',
 'EC',
 'SO4',
 'TDS']

In [104]:
# filter out the common columns from all three dataframes and create a merged dataframe
df_2014 = df_2014[common_cols]
df_2015 = df_2015[common_cols]
df_2016 = df_2016[common_cols]

# merge all three dataframes
df_combined = pd.concat([df_2014, df_2015, df_2016], ignore_index=True)
df_combined

Unnamed: 0,HCO3,PH,YEAR,NO3,F,NA,CO3,LOCATION,ALK,CA,DISTRICT,K,MG,TH,EC,SO4,TDS
0,1342.0,7.69,2014,17.0,3.25,1020.0,0.0,Barvala,1100.0,40.0,Ahmedabad,0.4,48.0,300.0,4781.0,270.0,3203.0
1,354.0,7.56,2014,12.0,2.20,140.0,0.0,Dhandhuka1,290.0,24.0,Ahmedabad,17.2,10.0,100.0,790.0,36.0,529.0
2,781.0,7.9,2014,16.0,0.25,416.0,0.0,Gamph,640.0,28.0,Ahmedabad,37.0,72.0,370.0,2550.0,10.0,1709.0
3,439.0,7.8,2014,17.0,1.75,210.0,0.0,Tagadi1,360.0,32.0,Ahmedabad,5.2,31.0,210.0,1492.0,15.0,1000.0
4,1232.0,7.73,2014,45.0,3.10,2940.0,0.0,Dalod,1010.0,180.0,Ahmedabad,33.0,216.0,1350.0,14520.0,439.0,9728.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1797,329.0,8.09,2016,6.0,1.80,210.0,0.0,Bhadreshwar,270.0,132.0,Kachchh,9.2,26.0,440.0,1762.0,10.0,1181.0
1798,281.0,8.2,2016,6.0,0.48,66.0,0.0,Bhuj1,230.0,88.0,Kachchh,9.0,7.0,250.0,743.0,5.0,498.0
1799,366.0,8.06,2016,2.0,1.35,472.0,0.0,Chandranagar,300.0,180.0,Kachchh,13.0,144.0,1050.0,3904.0,96.0,2616.0
1800,146.0,8.2,2016,8.0,0.05,174.0,0.0,Deshalpur rapar,120.0,52.0,Kachchh,16.7,62.0,390.0,1480.0,217.0,992.0


In [105]:
# columns not found: CL, SAR

In [106]:
eval_data = pd.read_csv("./refined_data/eval_data.csv")
eval_data

Unnamed: 0,DISTRICT,LOCATION,PH,EC,TDS,TH,CA,MG,NA,K,CO3,HCO3,CL,NO3,SO4,F,ALK,YEAR,SAR
0,Ahmedabad,Barvala,8.25,5090.0,3410.0,450.0,80.0,60.0,937.0,0.3,0.0,891.0,959.0,24.00,346.0,2.80,730.0,2017,
1,Ahmedabad,Dhandhuka1,8.23,14210.0,9521.0,1300.0,180.0,207.0,3220.0,9.9,0.0,1013.0,3373.0,1600.00,1290.0,6.64,830.0,2017,
2,Ahmedabad,Endla,8.17,685.0,459.0,220.0,52.0,22.0,49.0,13.6,0.0,305.0,50.0,2.00,2.0,0.44,250.0,2017,
3,Ahmedabad,Kumarkhan,8.19,10660.0,7142.0,800.0,140.0,109.0,1875.0,7.2,0.0,988.0,2485.0,32.00,443.0,11.20,810.0,2017,
4,Ahmedabad,Kundali,8.03,1300.0,871.0,280.0,64.0,29.0,210.0,3.7,0.0,305.0,85.0,230.00,171.0,0.50,250.0,2017,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759,Kutchch,Desalpur,7.42,768.0,515.0,180.0,40.0,19.0,95.0,10.0,0.0,183.0,113.0,3.20,70.0,0.15,150.0,2021,3.08
2760,Kutchch,Ratanpar Khadir,7.73,4419.0,2961.0,701.0,96.0,112.0,712.0,17.0,0.0,403.0,1064.0,13.00,310.0,0.78,330.0,2021,11.70
2761,Kutchch,Rapar,7.60,2942.0,1971.0,400.0,132.0,17.0,552.0,26.0,0.0,281.0,567.0,33.00,538.0,0.93,230.0,2021,12.00
2762,Kutchch,Kuda,7.35,1464.0,981.0,410.0,104.0,36.0,125.0,18.0,0.0,610.0,142.0,0.31,25.0,1.04,500.0,2021,2.68


In [107]:
# common columns between df and eval_data
common_cols = list(set(df_combined.columns) & set(eval_data.columns))
common_cols

# keep district and location columns first in the list
common_cols = ['DISTRICT', 'LOCATION'] + [col for col in common_cols if col not in ['DISTRICT', 'LOCATION']]
common_cols

['DISTRICT',
 'LOCATION',
 'HCO3',
 'PH',
 'YEAR',
 'F',
 'NO3',
 'NA',
 'CO3',
 'ALK',
 'CA',
 'K',
 'MG',
 'TH',
 'EC',
 'SO4',
 'TDS']

In [108]:
# order the columns of both dataframes based on common columns
df_combined = df_combined[common_cols]
eval_data = eval_data[common_cols]

# merge the two dataframes

df = pd.concat([df_combined, eval_data], ignore_index=True)
df

Unnamed: 0,DISTRICT,LOCATION,HCO3,PH,YEAR,F,NO3,NA,CO3,ALK,CA,K,MG,TH,EC,SO4,TDS
0,Ahmedabad,Barvala,1342.0,7.69,2014,3.25,17.00,1020.0,0.0,1100.0,40.0,0.4,48.0,300.0,4781.0,270.0,3203.0
1,Ahmedabad,Dhandhuka1,354.0,7.56,2014,2.20,12.00,140.0,0.0,290.0,24.0,17.2,10.0,100.0,790.0,36.0,529.0
2,Ahmedabad,Gamph,781.0,7.9,2014,0.25,16.00,416.0,0.0,640.0,28.0,37.0,72.0,370.0,2550.0,10.0,1709.0
3,Ahmedabad,Tagadi1,439.0,7.8,2014,1.75,17.00,210.0,0.0,360.0,32.0,5.2,31.0,210.0,1492.0,15.0,1000.0
4,Ahmedabad,Dalod,1232.0,7.73,2014,3.10,45.00,2940.0,0.0,1010.0,180.0,33.0,216.0,1350.0,14520.0,439.0,9728.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4561,Kutchch,Desalpur,183.0,7.42,2021,0.15,3.20,95.0,0.0,150.0,40.0,10.0,19.0,180.0,768.0,70.0,515.0
4562,Kutchch,Ratanpar Khadir,403.0,7.73,2021,0.78,13.00,712.0,0.0,330.0,96.0,17.0,112.0,701.0,4419.0,310.0,2961.0
4563,Kutchch,Rapar,281.0,7.6,2021,0.93,33.00,552.0,0.0,230.0,132.0,26.0,17.0,400.0,2942.0,538.0,1971.0
4564,Kutchch,Kuda,610.0,7.35,2021,1.04,0.31,125.0,0.0,500.0,104.0,18.0,36.0,410.0,1464.0,25.0,981.0


In [109]:
# concat the two data based on common columns
df = pd.concat([df_combined[common_cols], eval_data[common_cols]], ignore_index=True)
df

Unnamed: 0,DISTRICT,LOCATION,HCO3,PH,YEAR,F,NO3,NA,CO3,ALK,CA,K,MG,TH,EC,SO4,TDS
0,Ahmedabad,Barvala,1342.0,7.69,2014,3.25,17.00,1020.0,0.0,1100.0,40.0,0.4,48.0,300.0,4781.0,270.0,3203.0
1,Ahmedabad,Dhandhuka1,354.0,7.56,2014,2.20,12.00,140.0,0.0,290.0,24.0,17.2,10.0,100.0,790.0,36.0,529.0
2,Ahmedabad,Gamph,781.0,7.9,2014,0.25,16.00,416.0,0.0,640.0,28.0,37.0,72.0,370.0,2550.0,10.0,1709.0
3,Ahmedabad,Tagadi1,439.0,7.8,2014,1.75,17.00,210.0,0.0,360.0,32.0,5.2,31.0,210.0,1492.0,15.0,1000.0
4,Ahmedabad,Dalod,1232.0,7.73,2014,3.10,45.00,2940.0,0.0,1010.0,180.0,33.0,216.0,1350.0,14520.0,439.0,9728.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4561,Kutchch,Desalpur,183.0,7.42,2021,0.15,3.20,95.0,0.0,150.0,40.0,10.0,19.0,180.0,768.0,70.0,515.0
4562,Kutchch,Ratanpar Khadir,403.0,7.73,2021,0.78,13.00,712.0,0.0,330.0,96.0,17.0,112.0,701.0,4419.0,310.0,2961.0
4563,Kutchch,Rapar,281.0,7.6,2021,0.93,33.00,552.0,0.0,230.0,132.0,26.0,17.0,400.0,2942.0,538.0,1971.0
4564,Kutchch,Kuda,610.0,7.35,2021,1.04,0.31,125.0,0.0,500.0,104.0,18.0,36.0,410.0,1464.0,25.0,981.0


In [111]:
df['YEAR'].value_counts().sort_index()

YEAR
2014    641
2015    599
2016    562
2017    578
2018    518
2019    529
2020    601
2021    538
Name: count, dtype: int64

In [112]:
df.to_csv("super_data.csv", index=False)