In [1]:
# Importing Pandas for data manipulation operations.
import pandas as pd
# Importing NumPy for numerical operations.
import numpy as np
# Importing GeoPandas for map based operations.
import geopandas as gpd
# Importing Regex to use regular expressions. 
import regex as re
# Importing Warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importing the three survey extracts from CSV files into pandas dataframes.
# Every path uses forward slashes: a backslash before 'F' is an invalid escape
# sequence in a normal string literal and is only a path separator on Windows.
fisher_df_1 = pd.read_csv("DATASETS/Fisher_slno.1-101.csv",     low_memory = False)
fisher_df_2 = pd.read_csv("DATASETS/Fisher_slno.102-4291.csv",  low_memory = False)
fisher_df_3 = pd.read_csv("DATASETS/Fisher_slno.4292-7217.csv", low_memory = False)

# Importing the fish species labels and the district relabelling table for later use.
fish_labels = pd.read_csv("DATASETS/fish_species.csv",            low_memory = False)
district_labels = pd.read_csv("DATASETS/new_district_labels.csv", low_memory = False)

# Importing the shape file for districts.
GDF = gpd.read_file('DATASETS/shape_files/shape.shp')

In [3]:
# Dictionary mapping the calendar month number (1 = January ... 12 = December)
# to the combined month label (e.g. 'January--Magh').
MONTH_MAPPING = {
    1: 'January--Magh', 2: 'February--Falgun', 3: 'March--Chaitra', 4: 'April--Boishakh', 
    5: 'May--Jeystho', 6: 'June--Asharh', 7: 'July--Srabon', 
    8: 'August--Bhadro', 9: 'September--Ashwin', 10: 'October--Kartik', 
    11: 'November--Aghrahan', 12: 'December--Poush'
}

# List of month labels in calendar order (January -> December). It is derived from
# MONTH_MAPPING (dicts preserve insertion order) so the two can never drift apart.
MONTHS = list(MONTH_MAPPING.values())

# Dictionary mapping numeric keys to various fishing sources from the survey.
SOURCE = {
    1:  "Marsh", 2:  "Haor", 3:  "Canal", 4:  "River", 
    5:  "Mohona", 6:  "River (Cultivation)", 7:  "Pond",
    8:  "Seasonal Cultivation", 9:  "Fish Farming in Cages",
    10: "Pen Culture (Net)", 11: "Flooded Reservoirs", 99: "Others"
}

# Dictionary mapping numeric keys to reasons for fish loss from the survey.
REASONS = {
    1: "Damage During Harvesting", 2: "Too Long In Nets (Physical Damage)",
    3: "High Temperature, Delay In Taking To Market", 4: "Not Enough Ice or Insulated Containers",
    5: "Inadequacy of Fish Preservation Materials", 6: "Inadequate Cold Storage Facilities",
    7: "Inadequacy of Communication Systems", 8: "Spoilage of Fish During Transportation",
    9: "Loss of Fish From Unloading & Loading", 10: "Result of Medication Used On Fish",
    99: "Other Unrecorded Reason For Loss"
}

In [4]:
# Lookup dictionary: fish species serial number -> species name, built from 'fish_labels'.
FISH_LABELS = (
    fish_labels
    .set_index('Fish_Species_Serial_Number')['Species_Name']
    .to_dict()
)

# Lookup dictionary: old district label -> new district label, built from 'district_labels'.
DISTRICT_LABELS = (
    district_labels
    .set_index('Old_Labels')['New_Labels']
    .to_dict()
)

In [5]:
# Replacing the original 'q1_d_zila' labels with the new district labels in each survey extract.
# Labels without an entry in DISTRICT_LABELS become NaN.
for survey_frame in (fisher_df_1, fisher_df_2, fisher_df_3):
    survey_frame['q1_d_zila'] = survey_frame['q1_d_zila'].map(DISTRICT_LABELS)

# Renaming the shape file's 'ADM2_EN' column to 'q1_d_zila' so it shares the survey's district key.
GDF.rename(columns = {'ADM2_EN': 'q1_d_zila'}, inplace = True)

#### <b>Overview of Fishing Techniques Used to Harvest Fish - (Question 3)</b>

In [6]:
# Column positions shared by the three survey extracts: position 8 (the district,
# 'q1_d_zila') plus positions 22 -> 26 (the source of fishing answers).
source_column_positions = [8, *range(22, 27)]

# Taking the same column slice from each of the three DataFrames.
source_of_fishing1, source_of_fishing2, source_of_fishing3 = (
    survey_frame.iloc[:, source_column_positions]
    for survey_frame in (fisher_df_1, fisher_df_2, fisher_df_3)
)

# Stacking the three slices into one DataFrame with a fresh 0..n-1 index.
source_of_fishing = pd.concat(
    [source_of_fishing1, source_of_fishing2, source_of_fishing3],
    ignore_index = True
)

In [7]:
# Showing the (rows, columns) shape followed by the first five rows.
display(source_of_fishing.shape, source_of_fishing.head())

(7217, 6)

Unnamed: 0,q1_d_zila,q3_1,q3_2,q3_3,q3_4,q3_5
0,Rajshahi,6.0,,,,
1,Rajshahi,6.0,,,,
2,Rajshahi,6.0,,,,
3,Rajshahi,6.0,,,,
4,Rajshahi,6.0,,,,


In [8]:
# The five answer columns that hold the source of fishing codes.
source_columns = ['q3_1', 'q3_2', 'q3_3', 'q3_4', 'q3_5']

# Translating the numeric codes into source names (codes missing from SOURCE become NaN).
for source_column in source_columns:
    source_of_fishing[source_column] = source_of_fishing[source_column].map(SOURCE)

# Reshaping to one row per (district, answer) and discarding the empty answers.
df_melted = (
    source_of_fishing
    .melt(
        id_vars = ['q1_d_zila'],
        value_vars = source_columns,
        var_name = 'source_type',
        value_name = 'Source'
    )
    .dropna(subset = ['Source'])
)

# Counting the answers per district and source: one row per district, one column per source.
answers_per_district_source = df_melted.groupby(['q1_d_zila', 'Source']).size()
SOURCE_OF_FISHING_GEO_DF = answers_per_district_source.unstack(fill_value = 0)

In [9]:
# Showing the (rows, columns) shape followed by the first five rows.
display(SOURCE_OF_FISHING_GEO_DF.shape, SOURCE_OF_FISHING_GEO_DF.head())

(63, 12)

Source,Canal,Fish Farming in Cages,Flooded Reservoirs,Haor,Marsh,Mohona,Others,Pen Culture (Net),Pond,River,River (Cultivation),Seasonal Cultivation
q1_d_zila,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Bagerhat,7,0,0,0,15,2,24,0,0,83,62,0
Barguna,0,0,0,0,0,0,0,0,1,5,10,0
Barisal,1,0,0,0,3,0,5,0,1,80,56,0
Bhola,0,0,0,0,0,0,0,0,0,231,9,0
Bogra,0,0,0,0,2,0,2,0,5,93,79,0


In [10]:
# Writing the district-by-source counts to CSV (the district index is kept as the first column).
SOURCE_OF_FISHING_GEO_DF.to_csv('DATASETS/Cleaned_Data/GEO_DATA/Q3_SOURCE_OF_FISHING.csv')

#### <b>Analysing Annual Catch Volumes and Species-Specific Harvest Data - (Question 4)</b>

In [11]:
# Column positions shared by the three survey extracts: position 8 (the district,
# 'q1_d_zila') plus positions 41 -> 180 (the Question 4 catch columns).
catch_column_positions = [8, *range(41, 181)]

# Taking the same column slice from each of the three DataFrames.
annual_catch_totals1, annual_catch_totals2, annual_catch_totals3 = (
    survey_frame.iloc[:, catch_column_positions]
    for survey_frame in (fisher_df_1, fisher_df_2, fisher_df_3)
)

# Stacking the three slices into one DataFrame with a fresh 0..n-1 index.
annual_catch_totals = pd.concat(
    [annual_catch_totals1, annual_catch_totals2, annual_catch_totals3],
    ignore_index = True
)

In [12]:
# Showing the (rows, columns) shape followed by the first five rows.
display(annual_catch_totals.shape, annual_catch_totals.head())

(7217, 141)

Unnamed: 0,q1_d_zila,q4_1_n,q4_f_1_1,q4_f_1_2,q4_f_1_3,q4_f_1_4,q4_f_1_5,q4_f_1_6,q4_f_1_7,q4_f_1_8,...,q4_f_10_4,q4_f_10_5,q4_f_10_6,q4_f_10_7,q4_f_10_8,q4_f_10_9,q4_f_10_10,q4_f_10_11,q4_f_10_12,q4_f_10_t
0,Rajshahi,1,5000.0,5000.0,5000.0,500.0,500.0,500.0,500.0,800.0,...,0,0,0,0,0,0,0,0,0,0
1,Rajshahi,1,1200.0,1200.0,1200.0,500.0,500.0,500.0,500.0,500.0,...,0,0,0,0,0,0,0,0,0,0
2,Rajshahi,1,2000.0,2000.0,2000.0,500.0,500.0,500.0,500.0,500.0,...,0,0,0,0,0,0,0,0,0,0
3,Rajshahi,1,2400.0,2400.0,2400.0,800.0,800.0,800.0,800.0,800.0,...,0,0,0,0,0,0,0,0,0,0
4,Rajshahi,1,1500.0,0.0,0.0,0.0,300.0,0.0,300.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Translating the species codes in each 'q4_<x>_n' column (x = 1 -> 10) into species names.
for x in range(1, 11):
    column_name = f'q4_{x}_n'
    
    # Skipping any 'q4_<x>_n' column that is absent from the dataframe.
    if column_name in annual_catch_totals.columns:
        # Mapping the values in the column to their corresponding labels in FISH_LABELS.
        annual_catch_totals[column_name] = annual_catch_totals[column_name].map(FISH_LABELS)

# Codes without an entry in FISH_LABELS are NaN after the mapping; label them 'Other Species'.
# The filled column is assigned back to the dataframe: calling fillna(inplace = True) on a
# column selection acts on a temporary object under pandas Copy-on-Write and would leave
# 'annual_catch_totals' unchanged.
for z in annual_catch_totals.columns:
    # Checking if the column name contains '_n'.
    if '_n' in z:
        annual_catch_totals[z] = annual_catch_totals[z].fillna('Other Species')

In [14]:
# Showing the (rows, columns) shape followed by the first five rows.
display(annual_catch_totals.shape, annual_catch_totals.head())

(7217, 141)

Unnamed: 0,q1_d_zila,q4_1_n,q4_f_1_1,q4_f_1_2,q4_f_1_3,q4_f_1_4,q4_f_1_5,q4_f_1_6,q4_f_1_7,q4_f_1_8,...,q4_f_10_4,q4_f_10_5,q4_f_10_6,q4_f_10_7,q4_f_10_8,q4_f_10_9,q4_f_10_10,q4_f_10_11,q4_f_10_12,q4_f_10_t
0,Rajshahi,Rui,5000.0,5000.0,5000.0,500.0,500.0,500.0,500.0,800.0,...,0,0,0,0,0,0,0,0,0,0
1,Rajshahi,Rui,1200.0,1200.0,1200.0,500.0,500.0,500.0,500.0,500.0,...,0,0,0,0,0,0,0,0,0,0
2,Rajshahi,Rui,2000.0,2000.0,2000.0,500.0,500.0,500.0,500.0,500.0,...,0,0,0,0,0,0,0,0,0,0
3,Rajshahi,Rui,2400.0,2400.0,2400.0,800.0,800.0,800.0,800.0,800.0,...,0,0,0,0,0,0,0,0,0,0
4,Rajshahi,Rui,1500.0,0.0,0.0,0.0,300.0,0.0,300.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Extracting the monthly catch columns, named 'q4_f_<number>_<month_number>'.
# The pattern has exactly two numeric groups and is anchored at the end, so the
# 'q4_f_<number>_t' total columns are excluded.
fish_columns = [
    col for col in annual_catch_totals.columns if re.match(r'q4_f_\d+_\d+$', col)
]

# Extracting the year total columns (names ending in '_t').
year_total_columns = [
    col for col in annual_catch_totals.columns if col.endswith('_t')
]

# Initialising the data structure for the final dataframe with the district of each row.
data_structure = {
    'District': annual_catch_totals['q1_d_zila']
}

In [16]:
# Adding month columns initialised with zeros. Float zeros are used because the
# columns are later filled with fractional totals (values divided by 1000); writing
# floats into integer columns is an incompatible-dtype assignment that pandas deprecates.
for month in MONTH_MAPPING.values():
    data_structure[month] = [0.0] * len(annual_catch_totals)

# Adding a 'Year Total' column initialised with (float) zeros.
data_structure['Year Total'] = [0.0] * len(annual_catch_totals)

# Converting the data structure to a dataframe.
final_df = pd.DataFrame(data_structure)

In [17]:
# Populate the dataframe with the monthly and yearly totals.
# Which month a column belongs to depends only on the column name, so the lookup is
# resolved once here instead of being repeated for every row.
month_columns = {month: [] for month in MONTH_MAPPING.values()}
for col in annual_catch_totals.columns:
    # Using regex to match column names with pattern 'q4_f_<number>_<month_number>'.
    match = re.match(r'q4_f_\d+_(\d+)', col)
    if match:
        # Getting the month name from the MONTH_MAPPING dictionary (None when the
        # captured number is not a key of the mapping).
        month_name = MONTH_MAPPING.get(int(match.group(1)))
        if month_name:
            month_columns[month_name].append(col)

# Row-wise sum of each month's columns, divided by 1000 (conversion to metric tonnes).
# skipna = False keeps the behaviour of adding the cells one by one: a missing value
# in any contributing cell makes that row's total NaN.
for month, columns in month_columns.items():
    final_df[month] = annual_catch_totals[columns].sum(axis = 1, skipna = False) / 1000

# Row-wise sum of the year total columns, divided by 1000 in the same way.
final_df['Year Total'] = (
    annual_catch_totals[year_total_columns].sum(axis = 1, skipna = False) / 1000
)

In [18]:
# Showing the (rows, columns) shape followed by the first five rows.
display(final_df.shape, final_df.head())

(7217, 14)

Unnamed: 0,District,January--Magh,February--Falgun,March--Chaitra,April--Boishakh,May--Jeystho,June--Asharh,July--Srabon,August--Bhadro,September--Ashwin,October--Kartik,November--Aghrahan,December--Poush,Year Total
0,Rajshahi,16.0,16.0,16.0,1.6,1.6,1.6,1.7,2.4,2.4,2.4,1.6,1.6,64.8
1,Rajshahi,4.0,4.0,4.0,1.6,1.6,1.6,1.6,1.6,1.6,1.6,1.6,1.6,26.4
2,Rajshahi,8.0,8.0,8.0,1.6,1.6,1.6,1.6,1.6,1.6,1.6,1.6,1.6,38.4
3,Rajshahi,9.0,9.0,9.0,2.7,2.7,2.7,2.7,2.7,2.7,2.7,2.7,1.8,50.4
4,Rajshahi,3.0,0.0,0.0,0.0,0.6,0.0,0.6,0.0,0.0,2.0,2.0,3.0,11.2


In [19]:
# Summing each month column of 'final_df' over all rows to get one overall total per month.
monthly_totals = {month: final_df[month].sum() for month in MONTH_MAPPING.values()}

# Create a DataFrame from the monthly totals.
monthly_totals_df = pd.DataFrame(
    list(monthly_totals.items()), 
    columns = ['Month', 'Total (in thousands)']
).round(2)

# Grouping by 'District' and calculating the sum and count of values for each district.
# agg() is given a list, so each result has two-level column labels (<column>, 'sum' / 'count').
grouped_df = final_df.groupby('District').agg(['sum']).round(2).reset_index()
grouped_df_count = final_df.groupby('District').agg(['count']).reset_index()
# The last column of the count frame is the 'Year Total' count for each district.
instances = grouped_df_count.iloc[:, -1]

# Appending the per-district count as an extra column next to the sums.
grouped_df = pd.concat([grouped_df, instances], axis = 1)
# Writing the frame to CSV and reading it back flattens the column labels; the appended
# count column is read back as 'Year Total.1', which later cells refer to by that name.
grouped_df.to_csv("DATASETS/Cleaned_Data/GEO_DATA/grouped_df.csv", index = False)
grouped_df = pd.read_csv("DATASETS/Cleaned_Data/GEO_DATA/grouped_df.csv")
# NOTE(review): row 0 of the re-read frame appears to be the second header level
# ('sum' / 'count') parsed as data, which is why it is dropped here - confirm.
grouped_df = grouped_df.drop(0).reset_index(drop = True)

In [20]:
# Showing the (rows, columns) shape followed by the first five rows.
display(monthly_totals_df.shape, monthly_totals_df.head())

(12, 2)

Unnamed: 0,Month,Total (in thousands)
0,January--Magh,6669.97
1,February--Falgun,5987.6
2,March--Chaitra,5083.48
3,April--Boishakh,5326.34
4,May--Jeystho,6931.92


In [21]:
# Showing the (rows, columns) shape followed by the first five rows.
display(grouped_df.shape, grouped_df.head())

(63, 15)

Unnamed: 0,District,January--Magh,February--Falgun,March--Chaitra,April--Boishakh,May--Jeystho,June--Asharh,July--Srabon,August--Bhadro,September--Ashwin,October--Kartik,November--Aghrahan,December--Poush,Year Total,Year Total.1
0,Bagerhat,60.14,59.46,134.28,117.31,145.64,148.92,300.15,304.54,348.2,453.93,232.39,86.02,2328.28,193
1,Barguna,19.65,1.84,3.37,2.94,16.19,4.09,3.12,25.72,3.28,2.09,13.78,8.94,105.72,16
2,Barisal,55.49,20.01,17.61,31.02,44.15,40.49,29.71,90.31,35.82,29.63,52.95,47.09,493.79,143
3,Bhola,19.61,137.1,249.24,237.54,246.23,65.15,163.63,163.73,185.27,184.99,199.02,10.9,1852.89,240
4,Bogra,134.77,128.93,101.1,98.49,82.94,47.67,105.86,124.61,52.39,103.36,173.94,69.96,1253.05,181


In [22]:
# Converting the district totals to per capita values:
# catch (mt) divided by the number of instances held in 'Year Total.1'.
catch_columns = MONTHS + ['Year Total']
numeric_columns = catch_columns + ['Year Total.1']

# Making sure the totals and the instance count are numeric before dividing.
grouped_df[numeric_columns] = grouped_df[numeric_columns].apply(pd.to_numeric)

# Dividing every catch column by the district's instance count, row by row.
grouped_df[catch_columns] = grouped_df[catch_columns].div(
    grouped_df['Year Total.1'], axis = 0
)

# The instance count is no longer needed once the division is done.
grouped_df = grouped_df.drop(columns = 'Year Total.1')

MONTHLY_CATCH_DF = grouped_df.round(2)

In [23]:
# Showing the (rows, columns) shape followed by the first five rows.
display(MONTHLY_CATCH_DF.shape, MONTHLY_CATCH_DF.head())

(63, 14)

Unnamed: 0,District,January--Magh,February--Falgun,March--Chaitra,April--Boishakh,May--Jeystho,June--Asharh,July--Srabon,August--Bhadro,September--Ashwin,October--Kartik,November--Aghrahan,December--Poush,Year Total
0,Bagerhat,0.31,0.31,0.7,0.61,0.75,0.77,1.56,1.58,1.8,2.35,1.2,0.45,12.06
1,Barguna,1.23,0.12,0.21,0.18,1.01,0.26,0.2,1.61,0.2,0.13,0.86,0.56,6.61
2,Barisal,0.39,0.14,0.12,0.22,0.31,0.28,0.21,0.63,0.25,0.21,0.37,0.33,3.45
3,Bhola,0.08,0.57,1.04,0.99,1.03,0.27,0.68,0.68,0.77,0.77,0.83,0.05,7.72
4,Bogra,0.74,0.71,0.56,0.54,0.46,0.26,0.58,0.69,0.29,0.57,0.96,0.39,6.92


In [24]:
# Writing the per capita monthly catch table to CSV, without the row index.
MONTHLY_CATCH_DF.to_csv('DATASETS/Cleaned_Data/GEO_DATA/Q4_MONTHLY_CATCH.csv', index = False)