In [None]:
import pandas as pd
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

# Load the Stata file 1_Identification_ano.dta into the raw DataFrame `df`.
df = pd.read_stata("../Datasets/Anonymized data/1_Identification_ano.dta")

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# List the distinct COMID values observed within each state_id.
df.groupby('state_id')['COMID'].unique()

In [None]:
# Raw column code -> readable column name. Only the columns listed here are
# carried over into the analysis frame; commented-out entries are excluded.
rename_dict = {
    "state_id": "state",
    "COMID": "comid",
    # c2q4 / c2q5 / c2q6a columns
    "c2q4__1": "agro_processing",
    "c2q5__1": "milling_facility",
    "c2q5__2": "feed_mill",
    "c2q5__3": "corn_husker",
    "c2q5__4": "cocoa_mill",
    "c2q5__5": "palm_oil_mill",
    "c2q5__6": "rice_husker",
    # "c2q5__8": "crop_conservation_facility",
    "c2q5__11": "crop_processing_facility",
    "c2q6a": "electricity_available",
    # c2b columns
    "c2bq1": "land_degradation_pct",
    "c2bq1b": "increase_land_degradation",
    "c2bq3a": "abandoned_farm_pct",
    "c2bq4__6": "land_issue_soil_erosion",
    "c2bq4__9": "land_issue_poor_roads",
    "c2bq4__12": "land_issue_other",
    # c2c / c2d columns
    "c2cq2": "produce_transport_method",
    "c2dq1__4": "extension_services_present",
    "c2dq1__8": "storage_facility_available",
    # c2f columns
    "c2fq1__5": "challenge_crop_diseases",
    # "c2fq1__7": "challenge_flood",
    "c2fq1__9": "challenge_animal_damage",
    "c2fq1__13": "sales_difficulties",
    "c2fq1__19": "poor_transport",
    # c10 column
    "c10q3m": "main_crop_planted",
}

# Select the mapped columns (in mapping order) and apply the readable names.
selected_columns = list(rename_dict)
new_df = df[selected_columns].rename(columns=rename_dict)


In [None]:
# (rows, columns) of the renamed subset.
new_df.shape

In [None]:
# Column dtypes and non-null counts of the renamed subset.
new_df.info()

In [None]:
# Names of every column that contains at least one missing value.
has_nan = [col for col in new_df.columns if new_df[col].isna().any()]

print(len(has_nan),'\n',has_nan)

In [None]:
# Print the distinct values of each column to inspect the answer categories.
for col, values in new_df.items():
    print(f' {col} : {values.unique()} \n')

# Processing & Storage availability

In [None]:
# Write the renamed subset to CSV.
new_df.to_csv('../Datasets/full_survey.csv')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Identifier columns plus the electricity / processing-facility answers.
proc_columns = [
    "state",
    'comid',
    "electricity_available",
    "agro_processing",
    "milling_facility",
    "feed_mill",
    "corn_husker",
    "cocoa_mill",
    "palm_oil_mill",
    "rice_husker",
]
df_proc = new_df[proc_columns]

df_proc.head()

In [None]:
# Missing-value count per column of the processing subset.
df_proc.isna().sum()

In [None]:
# Share of survey rows contributed by each state, expressed as a percentage
# of all rows in df_proc. The column keeps the name 'count' for the plot.
state_counts = (
    df_proc['state']
    .value_counts()
    .div(len(df_proc))
    .mul(100)
    .reset_index()
)
state_counts.columns = ['state', 'count']

# Treemap: tile size and colour both encode the state's share.
fig = px.treemap(
    state_counts,
    path=['state'],
    values='count',
    color='count',
    color_continuous_scale='YlOrRd',
    title='Distribution of States in the Data',
)
fig.show()




In [None]:
# Respondent counts per state, split by the Yes/No storage answer.
counts = (
    new_df
    .groupby(['state', 'storage_facility_available'])
    .size()
    .unstack(fill_value=0)
)

# Row total first (only the answer columns exist at this point), then shares.
counts = counts.assign(total=counts.sum(axis=1))
counts = counts.assign(
    yes_pct=(counts['Yes'] / counts['total']) * 100,
    no_pct=(counts['No'] / counts['total']) * 100,
).reset_index()

# Treemap sized and coloured by the share answering "No".
fig = px.treemap(
    counts,
    path=['state'],
    values='no_pct',
    color='no_pct',
    color_continuous_scale='YlOrRd',
    title='Percentage of Farmers with no acess to storage by State',
)
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    margin={'t': 50, 'l': 50, 'r': 0, 'b': 0},
)
fig.show()


In [None]:
# Rank states by share answering "Yes" to storage access, highest first.
counts[['state','yes_pct']].sort_values('yes_pct',ascending=False)

**Storage is a major issue: even in Sokoto, which has the highest share of farmers reporting access, about 79% say they don't have access.**

In [None]:
# Respondent counts per state, split by the Yes/No agro-processing answer.
counts = (
    df_proc
    .groupby(['state', 'agro_processing'])
    .size()
    .unstack(fill_value=0)
)

# Row total first (only the answer columns exist at this point), then shares.
counts = counts.assign(total=counts.sum(axis=1))
counts = counts.assign(
    yes_pct=(counts['Yes'] / counts['total']) * 100,
    no_pct=(counts['No'] / counts['total']) * 100,
).reset_index()

# Treemap sized and coloured by the share answering "No".
fig = px.treemap(
    counts,
    path=['state'],
    values='no_pct',
    color='no_pct',
    color_continuous_scale='YlOrRd',
    title='Percentage of Farmers with no acess to AgroProcessing by State',
)
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    margin={'t': 50, 'l': 50, 'r': 0, 'b': 0},
)
fig.show()


In [None]:
# Order states by share answering "No" to agro-processing, lowest first.
counts.sort_values('no_pct').reset_index()

In [None]:
import plotly.graph_objects as go

# Top five states by share of "Yes" and by share of "No" answers.
# Only top_no is plotted below; top_yes is kept available for inspection.
top_yes = counts.sort_values('yes_pct', ascending=False).head(5)
top_no = counts.sort_values('no_pct', ascending=False).head(5)

fig = go.Figure()

fig.add_trace(go.Bar(
    x=top_no['state'],
    y=top_no['no_pct'],
    name='No Acess',
    marker_color='red'
))

# The figure holds a single "No" trace, so the title describes exactly that
# (the previous title promised both "with and without" access).
fig.update_layout(
    title='Top 5 States without access to Agro Processing facilities',
    xaxis_title='State',
    yaxis_title='Percentage'
)
fig.show()


**Note: the report includes a drill-down that lets users select a state and see which facilities are available within that state.**

For now, here is a national-level overview of which facilities are lacking.

In [None]:
facilities = ['milling_facility', 'feed_mill', 'corn_husker', 'cocoa_mill', 'palm_oil_mill', 'rice_husker']

# Long format: one row per (state, facility, response).
df_long = df_proc.melt(
    id_vars='state',
    value_vars=facilities,
    var_name='facility',
    value_name='response',
)

# Yes/No tallies per facility, then "No" as a share of Yes + No.
national_no_pct = (
    df_long
    .groupby(['facility', 'response'])
    .size()
    .unstack(fill_value=0)
    .assign(no_pct=lambda tally: (tally['No'] / (tally['No'] + tally['Yes'])) * 100)
    .reset_index()
)

fig_national = px.bar(
    national_no_pct,
    x='facility',
    y='no_pct',
    title='National-Level Average % of "No" Responses per Facility',
    labels={'no_pct': '% No Responses', 'facility': 'Facility'},
    color_discrete_sequence=['red'],
)
fig_national.update_layout(xaxis_tickangle=-45)
fig_national.show()



In [None]:
# Yes/No tallies per (state, facility), then "No" as a share of Yes + No,
# flattened back to ordinary columns for the heatmap pivot.
state_no_pct = (
    df_long
    .groupby(['state', 'facility', 'response'])
    .size()
    .unstack(fill_value=0)
    .assign(no_pct=lambda tally: (tally['No'] / (tally['No'] + tally['Yes'])) * 100)
    .reset_index()
)


In [None]:
import seaborn as sns

# Wide table: one row per state, one column per facility, cells = % "No".
heatmap_data = state_no_pct.pivot(index='state', columns='facility', values='no_pct')

# Explicit figure/axes pair instead of the pyplot state machine.
heat_fig, heat_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(heatmap_data, cmap='Reds', annot=True, fmt=".1f", ax=heat_ax)
heat_ax.set_title('% of "No" Responses by State and Facility')
heat_ax.set_ylabel('State')
heat_ax.set_xlabel('Facility')
heat_fig.tight_layout()
plt.show()


In [None]:
# Tally of each answer in the cocoa_mill column.
df_proc['cocoa_mill'].value_counts()

## Disease and Animal damage

In [None]:
# Respondent counts per state, split by the Yes/No crop-disease answer.
counts = (
    new_df
    .groupby(['state', 'challenge_crop_diseases'])
    .size()
    .unstack(fill_value=0)
)

# Row total first (only the answer columns exist at this point), then shares.
counts = counts.assign(total=counts.sum(axis=1))
counts = counts.assign(
    yes_pct=(counts['Yes'] / counts['total']) * 100,
    no_pct=(counts['No'] / counts['total']) * 100,
).reset_index()

# Treemap sized and coloured by the share answering "Yes".
fig = px.treemap(
    counts,
    path=['state'],
    values='yes_pct',
    color='yes_pct',
    color_continuous_scale='YlOrRd',
    title='Percentage of Farmers with complaints about crop diseases by State',
)
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    margin={'t': 50, 'l': 50, 'r': 0, 'b': 0},
)
fig.show()


In [None]:
# Order states by share answering "Yes" to crop diseases, lowest first.
counts.sort_values(by='yes_pct').reset_index()

In [None]:
# Respondent counts per state, split by the Yes/No animal-damage answer.
counts = (
    new_df
    .groupby(['state', 'challenge_animal_damage'])
    .size()
    .unstack(fill_value=0)
)

# Row total first (only the answer columns exist at this point), then shares.
counts = counts.assign(total=counts.sum(axis=1))
counts = counts.assign(
    yes_pct=(counts['Yes'] / counts['total']) * 100,
    no_pct=(counts['No'] / counts['total']) * 100,
).reset_index()

# Treemap sized and coloured by the share answering "Yes".
fig = px.treemap(
    counts,
    path=['state'],
    values='yes_pct',
    color='yes_pct',
    color_continuous_scale='YlOrRd',
    title='Percentage of Farmers with complaints about animal damage by State',
)
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    margin={'t': 50, 'l': 50, 'r': 0, 'b': 0},
)
fig.show()


# Land quality 

In [None]:
# National distribution of land-degradation categories.
#
# The frame is built straight from new_df: land_df is only created in a later
# cell, so referencing it here fails with NameError on Restart & Run All.
# Working on a copy also keeps the relabelling below out of any shared frame.
subset = new_df[['state', 'land_degradation_pct', 'abandoned_farm_pct', 'produce_transport_method']].copy()

state = 'nigeria'
# Relabel so the category sorts first under 'category ascending' on the x axis.
subset['land_degradation_pct'] = subset['land_degradation_pct'].replace('None (0%)', '0%')

# Share of all rows (missing answers included in the denominator) per category.
deg_counts = ((subset['land_degradation_pct'].value_counts() / subset['land_degradation_pct'].shape[0]) * 100)

# Convert to DataFrame for plotting; 'count' holds percentages.
deg_df = deg_counts.reset_index()
deg_df.columns = ['land_degradation_pct', 'count']

fig = px.bar(
    deg_df,
    x='land_degradation_pct',
    y='count',
    title=f"Land Degradation Levels in {state.capitalize()}",
    # The y values are percentages, so the axis label says so.
    labels={'land_degradation_pct': 'Degradation Level (%)', 'count': 'Percentage of Responses'},
    color='land_degradation_pct',
    color_discrete_sequence=px.colors.sequential.YlOrRd
)
fig.update_layout(xaxis_categoryorder='category ascending')
fig.show()

In [None]:
# Columns used by the land-quality analysis.
land_feats = ['state','land_degradation_pct','abandoned_farm_pct','produce_transport_method']
# Take an explicit copy: later cells add and overwrite columns on land_df, and
# doing that on a bare slice of new_df is chained assignment.
land_df = new_df[land_feats].copy()
land_df

In [None]:
# Ordered answer categories; the list position is used as an ordinal score (0-4).
degradation_order = ['None (0%)', '1%-25%', '26%-50%', '51%-75%', '76%-100%']
category_to_score = {cat: i for i, cat in enumerate(degradation_order)}

# Build the score columns with assign (returns a new frame) instead of
# assigning into a slice of new_df. astype(int) raises if a value is missing
# or not one of the categories above.
land_df = land_df.assign(
    land_degradation_score=land_df["land_degradation_pct"].map(category_to_score).astype(int),
    abandoned_farm_score=land_df["abandoned_farm_pct"].map(category_to_score).astype(int),
)

# Group by state to get mean score
state_scores = land_df.groupby("state")[["land_degradation_score", "abandoned_farm_score"]].mean().reset_index()


In [None]:
# Treemap of the mean degradation score per state (size and colour).
degradation_treemap_opts = dict(
    path=['state'],
    values='land_degradation_score',
    color='land_degradation_score',
    color_continuous_scale='YlOrRd',
    title="Land Degradation Score by State (Treemap)",
    labels={'land_degradation_score': 'Degradation Score'},
)
fig = px.treemap(state_scores, **degradation_treemap_opts)

fig.show()

In [None]:
# Work on a copy: `subset = land_df` is only an alias, so relabelling through
# it rewrote land_df['land_degradation_pct'] and made the scoring cell fail
# when re-run (the relabelled '0%' has no entry in the score mapping).
subset = land_df.copy()
# Relabel so the category sorts first under 'category ascending' on the x axis.
subset['land_degradation_pct'] = subset['land_degradation_pct'].replace('None (0%)','0%')

# Share of all rows (missing answers included in the denominator) per category.
deg_counts = ((subset['land_degradation_pct'].value_counts()/subset['land_degradation_pct'].shape[0])*100).sort_index()

# Convert to DataFrame for plotting; 'count' holds percentages.
deg_df = deg_counts.reset_index()
deg_df.columns = ['land_degradation_pct', 'count']

# Plot with Plotly
fig = px.bar(
    deg_df,
    x='land_degradation_pct',
    y='count',
    title="Land Degradation Levels in Nigeria",
    # The y values are percentages, so the axis label says so.
    labels={'land_degradation_pct': 'Degradation Level (%)', 'count': 'Percentage of Responses'},
    color='land_degradation_pct',
    color_discrete_sequence=px.colors.sequential.YlOrRd
)
fig.update_layout(xaxis_categoryorder='category ascending')
fig.show()


In [None]:
# Answer tallies for the degradation-increase question, largest first,
# converted to a percentage of all counted answers.
count = (
    new_df["increase_land_degradation"]
    .value_counts()
    .sort_values(ascending=False)
    .reset_index()
)
count['count'] = count['count'].div(count['count'].sum()).mul(100)

# Horizontal bars: answer category on y, percentage on x.
fig = px.bar(
    count,
    x='count',
    y="increase_land_degradation",
    color='increase_land_degradation',
    color_discrete_sequence=px.colors.sequential.YlOrRd,
    labels={'increase_land_degradation': ' ', 'count': 'Percentage of response'},
    title="Land Degradation Increase in Nigeria",
)

fig.show()

In [None]:
import plotly.express as px

# Filter by state. The filtered rows are copied so the relabelling below is a
# plain column assignment rather than an assignment into a slice of land_df.
state_choice = 'TARABA'
subset = land_df[land_df['state'] == state_choice].copy()
# Relabel so the category sorts first under 'category ascending' on the x axis.
subset['land_degradation_pct'] = subset['land_degradation_pct'].replace('None (0%)','0%')

# Count land degradation categories (raw counts, not percentages)
deg_counts = subset['land_degradation_pct'].value_counts().sort_index()

# Convert to DataFrame for plotting
deg_df = deg_counts.reset_index()
deg_df.columns = ['land_degradation_pct', 'count']

# Plot with Plotly
fig = px.bar(
    deg_df,
    x='land_degradation_pct',
    y='count',
    title=f"Land Degradation Levels in {state_choice}",
    labels={'land_degradation_pct': 'Degradation Level (%)', 'count': 'Number of Responses'},
    color='land_degradation_pct',
    color_discrete_sequence=px.colors.sequential.YlOrRd
)
fig.update_layout(xaxis_categoryorder='category ascending')
fig.show()


## Land abandonment

In [None]:
# Treemap of the mean farm-abandonment score per state (size and colour).
fig = px.treemap(
    state_scores,
    path=['state'],
    values='abandoned_farm_score',
    color='abandoned_farm_score',
    color_continuous_scale='YlOrRd',
    title="Farm abandonment Score by State (Treemap)",
    # The label must be keyed on the plotted column; the previous key
    # ('land_degradation_score') matched nothing in this figure.
    labels={'abandoned_farm_score': 'Abandonment Score'}
)

fig.show()

In [None]:

# Distribution of farm-abandonment categories within one state.
state_choice = 'BORNO'
subset = land_df[land_df['state'] == state_choice].copy()

# Use the categorical answer column: the string replace below was previously
# applied to the integer score column, where it could never match.
subset['abandoned_farm_pct'] = subset['abandoned_farm_pct'].replace('None (0%)','0%')

# Share of the state's rows per abandonment category
deg_counts = ((subset['abandoned_farm_pct'].value_counts()/subset.shape[0])*100).sort_index()

# Convert to DataFrame for plotting; 'count' holds percentages.
deg_df = deg_counts.reset_index()
deg_df.columns = ['abandoned_farm_pct', 'count']

# Plot with Plotly
fig = px.bar(
    deg_df,
    x='abandoned_farm_pct',
    y='count',
    # Title now matches the data shown (abandonment, for the selected state).
    title=f"Farm Abandonment Levels in {state_choice}",
    labels={'abandoned_farm_pct': 'Abandonment level (%)', 'count': 'Percentage of Responses'},
    color='abandoned_farm_pct',
    color_discrete_sequence=px.colors.sequential.YlOrRd
)
fig.update_layout(xaxis_categoryorder='category ascending')
fig.show()


# Transportation methods

In [None]:
# Transport-method tallies for LAGOS rows, largest first, as a percentage
# of all counted answers.
lagos_methods = new_df.loc[new_df['state'] == 'LAGOS', 'produce_transport_method']
counts = lagos_methods.value_counts().sort_values(ascending=False).reset_index()
counts['count'] = counts['count'].div(counts['count'].sum()).mul(100)

counts

In [None]:
# Horizontal bar chart of transport-method shares. `counts` comes from the
# preceding cell, which filters to LAGOS, so the title names that state
# rather than claiming national coverage.
fig = px.bar(
    counts,
    y='produce_transport_method',
    x='count',
    orientation='h',
    title="Transport Methods Used by Farmers in Lagos",
    labels={'produce_transport_method': 'Transport Method', 'count': 'Percentage of farmers who use the method'},
    color='produce_transport_method',
    color_discrete_sequence=px.colors.sequential.YlOrRd
)
fig.show()


In [None]:
# Respondent counts per state, split by the Yes/No poor-transport answer.
counts = (
    new_df
    .groupby(['state', 'poor_transport'])
    .size()
    .unstack(fill_value=0)
)

# Row total first (only the answer columns exist at this point), then shares.
counts = counts.assign(total=counts.sum(axis=1))
counts = counts.assign(
    yes_pct=(counts['Yes'] / counts['total']) * 100,
    no_pct=(counts['No'] / counts['total']) * 100,
).reset_index()

# Treemap sized and coloured by the share answering "Yes".
fig = px.treemap(
    counts,
    path=['state'],
    values='yes_pct',
    color='yes_pct',
    color_continuous_scale='YlOrRd',
    title='Percentage of Farmers with complaints about poor transport by State',
)
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    margin={'t': 50, 'l': 50, 'r': 0, 'b': 0},
)
fig.show()


In [None]:
# Rank states by share answering "Yes" to poor transport, highest first.
counts.sort_values(by='yes_pct',ascending=False)

# Sales issues

In [None]:
# Respondent counts per state, split by the Yes/No sales-difficulties answer.
counts = new_df.groupby(['state', 'sales_difficulties']).size().unstack(fill_value=0)

# Row total must be computed before the percentage columns are added.
counts['total'] = counts.sum(axis=1)
counts['yes_pct'] = (counts['Yes'] / counts['total']) * 100
counts['no_pct'] = (counts['No'] / counts['total']) * 100

counts = counts.reset_index()

fig = px.treemap(
    counts,
    path=['state'],
    values='yes_pct',
    color='yes_pct',
    color_continuous_scale='YlOrRd',
    # Title corrected: this figure plots sales_difficulties, not poor transport
    # (the old title was copied from the transport cell).
    title='Percentage of Farmers reporting sales difficulties by State'
)

fig.update_layout(
    margin=dict(t=50,l=50,r=0,b=0),
    autosize=False,
    width=800,
    height=600
)
fig.show()


# Market Availability

In [None]:
# Raw column code -> readable name for the market-access file.
rename_dict = {
    "state_id": "state",
    "c2c__id": "market_type",
    "c2cq1a": "market_location",
    "c2cq1b": "market_distance",
}

# Read the Stata file, keep only the mapped columns, and rename them.
market_raw = pd.read_stata('../Datasets/Anonymized data/4_c2c_Market_Access_ano.dta')
market_df = market_raw[list(rename_dict)].rename(columns=rename_dict)

display(market_df)
market_df.to_csv("../Datasets/market_df.csv")

In [None]:
# Number of market rows recorded per state.
market_df['state'].value_counts()

In [None]:
# Rows whose market is located in another LGA.
in_other_lga = market_df['market_location'].isin(['In another LGA'])
market_issues = market_df[in_other_lga]

# Per state: such rows as a percentage of all market rows for that state.
state_market_issue_pct = (
    market_issues.groupby('state').size()
    .div(market_df.groupby('state').size())
    .mul(100)
    .reset_index(name='pct_outside_market')
)

# Treemap (or choropleth later)
fig = px.treemap(
    state_market_issue_pct,
    path=['state'],
    values='pct_outside_market',
    color='pct_outside_market',
    color_continuous_scale='Reds',
    labels={'pct_outside_market': 'Percent (%)'},
    title='Percentage of Respondents with Market Outside LGA',
)
fig.update_traces(hovertemplate='<b>%{label}</b><br>Outside Market: %{value:.1f}%<extra></extra>')

fig.show()


In [None]:
# Rows whose market is located in another state.
in_other_state = market_df['market_location'].isin(['In another state'])
market_issues = market_df[in_other_state]

# Per state: such rows as a percentage of all market rows for that state.
state_market_issue_pct = (
    market_issues.groupby('state').size()
    .div(market_df.groupby('state').size())
    .mul(100)
    .reset_index(name='pct_outside_market')
)

# Treemap (or choropleth later)
fig = px.treemap(
    state_market_issue_pct,
    path=['state'],
    values='pct_outside_market',
    color='pct_outside_market',
    color_continuous_scale='Reds',
    labels={'pct_outside_market': 'Percent (%)'},
    title='Percentage of Respondents with Market Outside State',
)
fig.update_traces(hovertemplate='<b>%{label}</b><br>Outside Market: %{value:.1f}%<extra></extra>')

fig.show()


In [None]:
# Market-location tallies, largest first, as a percentage of all counted rows.
counts = (
    market_df['market_location']
    .value_counts()
    .sort_values(ascending=False)
    .reset_index()
)
counts['count'] = counts['count'].div(counts['count'].sum()).mul(100)

# Horizontal bars: location category on y, percentage on x.
fig = px.bar(
    counts,
    x='count',
    y='market_location',
    orientation='h',
    color='market_location',
    color_discrete_sequence=px.colors.sequential.YlOrRd,
    labels={'market_location': 'Market Location', 'count': 'Percentage of markets located within this range'},
    title="Market Locations",
)
fig.show()

# Disasters

In [None]:
# Raw column code -> readable name for the disaster file.
rename_dict = {
    "state_id": "state",
    "c4b__id": "disaster",
    "c4bq1b": "disaster_frequency_3_years",
    "c4bq2__4": "abandoned_area",
    "c4bq3__2": "production_distruption_loss",
    "c4bq4": "loss_severity_economic",
    "c4bq5__1": "land_lost",
    "c4bq5__2": "Crop losses",
    "c4bq6": "severity_of_physical_loss",
}

# Read the Stata file, keep only the mapped columns, and rename them.
disaster_raw = pd.read_stata('../Datasets/Anonymized data/7_c4b_Disaster_ano.dta')
disaster_df = disaster_raw[list(rename_dict)].rename(columns=rename_dict)
display(disaster_df)

In [None]:
# Mean of disaster_frequency_3_years across all rows.
disaster_df['disaster_frequency_3_years'].mean()

In [None]:
# Mean reported frequency per disaster type, sorted ascending for the plot.
counts = disaster_df.groupby(['disaster'])['disaster_frequency_3_years'].mean().sort_values().reset_index()

fig = px.bar(
    counts,
    y='disaster',
    x='disaster_frequency_3_years',
    orientation='h',
    title="Disasters ranked by frequency (3 years)",
    labels={'disaster_frequency_3_years': 'Disaster Frequency', 'disaster': 'Disaster'},
    color='disaster_frequency_3_years',
    # The colour column is numeric (a mean), so Plotly applies a continuous
    # scale; a discrete sequence has no effect here. Request YlOrRd as a
    # continuous scale to match the palette used elsewhere.
    color_continuous_scale='YlOrRd'
)
fig.show()


In [None]:
# Mean reported disaster frequency per state.
counts = (
    disaster_df
    .groupby(['state'])['disaster_frequency_3_years']
    .mean()
    .reset_index()
)

# Treemap sized and coloured by that mean.
fig = px.treemap(
    counts,
    path=['state'],
    values='disaster_frequency_3_years',
    color='disaster_frequency_3_years',
    color_continuous_scale='YlOrRd',
    title='DIsaster Frequency by State (3 Years)',
)
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    margin={'t': 50, 'l': 50, 'r': 0, 'b': 0},
)
fig.show()


In [None]:
# Per disaster type: percentage of rows answering 'Yes' in abandoned_area.
# Falls back to an empty Series if no 'Yes' answers exist at all.
abandon_pct = (
    disaster_df.groupby('disaster')['abandoned_area']
    .value_counts(normalize=True)
    .unstack()
    .get('Yes', pd.Series(dtype=float)) * 100
).sort_values()

# Plot from a tidy frame with explicit x/y. Passing the bare Series relied on
# wide-form defaults, where the 'index' label key never matched (the index is
# named 'disaster') and the Series name showed up as a legend entry.
abandon_plot_df = abandon_pct.rename_axis('disaster').rename('pct_fled').reset_index()

fig = px.bar(
    abandon_plot_df,
    x='pct_fled',
    y='disaster',
    orientation='h',
    labels={'pct_fled': '% Fled', 'disaster': 'Disaster'},
    title='Disasters Ranked by % of Respondents Who Fled',
    color_discrete_sequence=px.colors.sequential.YlOrRd
)

fig.show()

In [None]:
# Per disaster type: percentage of rows answering 'Yes' in
# production_distruption_loss. Falls back to an empty Series if no 'Yes'
# answers exist at all.
prod_loss_pct = (
    disaster_df.groupby('disaster')['production_distruption_loss']
    .value_counts(normalize=True)
    .unstack()
    .get('Yes', pd.Series(dtype=float)) * 100
).sort_values()

# Plot from a tidy frame with explicit x/y. Passing the bare Series relied on
# wide-form defaults, where the 'index' label key never matched (the index is
# named 'disaster') and the Series name showed up as a legend entry.
prod_loss_plot_df = prod_loss_pct.rename_axis('disaster').rename('pct_with_loss').reset_index()

fig = px.bar(
    prod_loss_plot_df,
    x='pct_with_loss',
    y='disaster',
    orientation='h',
    labels={'pct_with_loss': '% With Loss', 'disaster': 'Disaster'},
    title='Disasters Ranked by % Reporting Economic Loss due to Production Distruption',
    color_discrete_sequence=px.colors.sequential.YlOrRd
)

fig.show()

In [None]:
# Rows per (disaster, economic-loss severity), ignoring missing severities.
answered = disaster_df.dropna(subset=['loss_severity_economic'])
severity_counts = (
    answered
    .groupby(['disaster', 'loss_severity_economic'])
    .size()
    .reset_index(name='count')
)

# Total answered rows per disaster, kept as its own table.
total_per_disaster = (
    severity_counts
    .groupby('disaster')['count']
    .sum()
    .reset_index(name='total')
)

# Attach each disaster's total to its severity rows and derive the share.
severity_pct = severity_counts.assign(
    total=severity_counts.groupby('disaster')['count'].transform('sum')
)
severity_pct['percentage'] = (severity_pct['count'] / severity_pct['total']) * 100

# Fixed colour per severity level.
color_map = dict([
    ('Small losses', 'yellow'),
    ('Significant losses', 'orange'),
    ('Almost total or total losses', 'red'),
])

# Grouped bars: one cluster per disaster, one bar per severity level.
fig = px.bar(
    severity_pct,
    x='disaster',
    y='percentage',
    color='loss_severity_economic',
    color_discrete_map=color_map,
    barmode='group',
    labels={'percentage': 'Percentage'},
    title='Disasters by Economic Loss Severity (%)',
)
fig.show()


In [None]:
# Show the per-disaster severity table behind the chart above.
severity_pct

In [None]:
# Per disaster type: percentage of rows answering 'Yes' in 'Crop losses'.
# Falls back to an empty Series if no 'Yes' answers exist at all.
crop_loss_disaster_pct = (
    disaster_df.groupby('disaster')['Crop losses']
    .value_counts(normalize=True)
    .unstack()
    .get('Yes', pd.Series(dtype=float)) * 100
).sort_values()

# Plot from a tidy frame with explicit x/y. Passing the bare Series relied on
# wide-form defaults, where the 'index' label key never matched (the index is
# named 'disaster') and the Series name showed up as a legend entry.
crop_loss_plot_df = crop_loss_disaster_pct.rename_axis('disaster').rename('pct_crop_loss').reset_index()

fig = px.bar(
    crop_loss_plot_df,
    x='pct_crop_loss',
    y='disaster',
    orientation='h',
    labels={'pct_crop_loss': '% Crop Loss', 'disaster': 'Disaster'},
    title='Disasters by % Crop Loss'
)

fig.show()

In [None]:
# Show the per-disaster crop-loss percentages as a table.
crop_loss_disaster_pct.reset_index()

In [None]:
# Per state: percentage of rows answering 'Yes' in 'Crop losses'.
# Falls back to an empty Series if no 'Yes' answers exist at all.
crop_loss_shares = (
    disaster_df
    .groupby('state')['Crop losses']
    .value_counts(normalize=True)
    .unstack()
)
crop_loss_pct = (
    crop_loss_shares.get('Yes', pd.Series(dtype=float)) * 100
).reset_index(name='pct_crop_loss')

# Treemap sized and coloured by that percentage.
fig = px.treemap(
    crop_loss_pct,
    path=['state'],
    values='pct_crop_loss',
    color='pct_crop_loss',
    color_continuous_scale='Reds',
    title='States by % Crop Loss Post-Disaster',
)
fig.show()