In [17]:
import pandas as pd
import spacy
import os
import sqlite3
from fuzzywuzzy import process
import matplotlib.pyplot as plt

In [2]:
from mappings import *

In [3]:
iacp = "database/Irrigated_Area_and_Crop_Price.db"
sh = "database/soil_health.db"
wb = "database/weather_data.db"

tiacp = "Transformed_database/Irrigated_Area_and_Crop_Price.db"
tsh = "Transformed_database/soil_health.db"
twb = "Transformed_database/weather_data.db"

In [18]:
def load_db(path, name):
    with sqlite3.connect(path) as conn:
        db = pd.read_sql_query(f"SELECT * FROM {name}", conn)
    return db

def save_db(path, name, df):
    if not os.path.exists(path):
        open(path, 'w').close()
    with sqlite3.connect(path) as conn:
        df.to_sql(name, conn, if_exists='replace', index=False)

In [5]:
micro_soil_data = load_db(sh, "micro_nutrients")
macro_soil_data = load_db(sh, "macro_nutrients")
init_irrigated_area_data = load_db(iacp, "irrigated_area")
init_price_data = load_db(iacp, "crop_prices")
current_weather_data = load_db(wb, "current_weather")
daily_weather_data = load_db(wb, "daily_weather")
houry_weather_data = load_db(wb, "hourly_weather")

# Weather

In [20]:
def transform_weather_data(current_data, hourly_data, daily_data):
    daily_data['temperature_avg'] = (daily_data['temperature_2m_max'] + daily_data['temperature_2m_min']) / 2
    daily_data['temperature_range'] = daily_data['temperature_2m_max'] - daily_data['temperature_2m_min']

    hourly_data['date'] = pd.to_datetime(hourly_data['timestamp']).dt.date
    hourly_temp_avg = hourly_data.groupby(['location_id', 'date'])['temperature_2m'].mean().reset_index()
    hourly_temp_avg.rename(columns={'temperature_2m': 'hourly_temperature_avg'}, inplace=True)

    combined_data = pd.merge(daily_data, hourly_temp_avg, on=['location_id', 'date'], how='left')

    current_data['timestamp'] = pd.to_datetime(current_data['timestamp'])
    current_summary = current_data.groupby('location_id').agg(
        latest_temperature=('temperature_2m', 'last'),
        latest_humidity=('relative_humidity_2m', 'last'),
        latest_wind_speed=('wind_speed_10m', 'last')
    ).reset_index()

    final_data = pd.merge(combined_data, current_summary, on='location_id', how='left')
    save_db(twb, "Weather_data", final_data)

    print("Tranformation of Weather Data Completed.")

def plot_temperature_trends(daily_data, location_id, save_path="temperature_plot.png"):
    location_data = daily_data[daily_data['location_id'] == location_id]
    location_data['date'] = pd.to_datetime(location_data['date'])

    plt.figure(figsize=(12, 6))
    plt.plot(location_data['date'], location_data['temperature_2m_max'], label='Max Temp (°C)', color='red', marker='o')
    plt.plot(location_data['date'], location_data['temperature_2m_min'], label='Min Temp (°C)', color='blue', marker='o')
    plt.fill_between(location_data['date'], 
                     location_data['temperature_2m_min'], 
                     location_data['temperature_2m_max'], 
                     color='orange', alpha=0.3, label='Temperature Range')
    plt.title(f"Temperature Trends for Location ID: {location_id}")
    plt.xlabel("Date")
    plt.ylabel("Temperature (°C)")
    plt.legend()
    plt.grid(True)
    plt.savefig(save_path)
    print(f"Temperature plot saved as {save_path}")
    plt.close()

# Irrigated Area

In [21]:
def transform_irrigated_area_data(df):
    df_2020 = df[df['Year'] == 2020]
    df_2020['Address'] = str(df_2020['Dist_Name']) + ', ' + str(df_2020['State_Name']) + ', ' + str(df_2020['State_Code'])

    # df_2020 = df_2020[['Dddist Code', 'Address'] + [col for col in df.columns if 'IRRIGATED AREA' in col]]
    df_2020['Total_Area'] = df_2020[[col for col in df_2020.columns if 'IRRIGATED_AREA' in col]].sum(axis=1)

    df_final = df_2020[['Dist_Code', 'Address', 'Total_Area']]

    save_db(tiacp, "irriagated_area", df_final)
    print("Transformation for irrigated area complete.")

# Soil nutrients

In [22]:
def transform_soil_nutrient_levels(macro_df, micro_df):
    macronutrient_categories = ["Nitrogen", "Phosphorous", "Potassium", "OC", "EC", "pH"]
    micronutrient_categories = ["Copper", "Boron", "S", "Fe", "Zn", "Mn"]
    
    macro_results = {}
    micro_results = {}

    for nutrient in macronutrient_categories:
        columns = [col for col in macro_df.columns if col.startswith(nutrient)]
        def determine_macronutrient_level(row):
            values = row[columns]
            max_index = values.idxmax()
            return max_index.split(" - ")[1]
        macro_results[nutrient] = macro_df.apply(determine_macronutrient_level, axis=1)

    for nutrient in micronutrient_categories:
        columns = [col for col in micro_df.columns if col.startswith(nutrient)]
        def determine_micronutrient_level(row):
            values = row[columns]
            max_index = values.idxmax()
            return max_index.split(" - ")[1]
        micro_results[nutrient] = micro_df.apply(determine_micronutrient_level, axis=1)

    macro_df_tran = pd.DataFrame(macro_results)
    micro_df_tran = pd.DataFrame(micro_results)

    other_columns = ['District'] 
    macro_df_final = pd.concat([macro_df[other_columns], macro_df_tran], axis=1)
    micro_df_final = pd.concat([micro_df[other_columns], micro_df_tran], axis=1)

    save_db(tsh, "Macro_nutrients", macro_df_final)
    save_db(tsh, "Micro_nutrients", micro_df_final)
    print("Transformation for Soil Data complete.")



# Prices

In [13]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [23]:
nlp = spacy.load("en_core_web_sm")

standard_commodities = [
    'RICE', 'WHEAT', 'KHARIF SORGHUM', 'RABI SORGHUM', 'SORGHUM', 
    'PEARL MILLET', 'MAIZE', 'FINGER MILLET', 'BARLEY', 'CHICKPEA', 
    'PIGEONPEA', 'MINOR PULSES', 'PULSES', 'GROUNDNUT', 'SESAMUM', 
    'LINSEED', 'SUGARCANE', 'COTTON', 'FRUITS AND VEGETABLES', 'FODDER'
]

def match_commodity_entity(commodity, variety):
    combined_text = f"{commodity} {variety}"
    doc = nlp(combined_text)
    entities = [ent.text.upper() for ent in doc.ents]
    if entities:
        best_match = process.extractOne(entities[0], standard_commodities)
        return best_match[0]
    
    return process.extractOne(combined_text.upper(), standard_commodities)[0]

def transform_crop_price(df):
    df = df.drop(columns=['Market', 'Grade', 'Arrival_Date', 'Modal_Price'])
    df['Commodity'] = df.apply(lambda row: match_commodity_entity(row['Commodity'], row['Variety']), axis=1)
    save_db(tiacp, "crop_prices", df)
    print("Transformation for Soil Data complete.")

# TRANFORMM!!!!

In [25]:
transform_weather_data(current_weather_data, houry_weather_data, daily_weather_data)
plot_temperature_trends(daily_weather_data, location_id=1, save_path="temperature_plot.png")
transform_crop_price(init_price_data)

Tranformation of Weather Data Completed.
Temperature plot saved as temperature_plot.png
Transformation for Soil Data complete.


In [26]:
transform_soil_nutrient_levels(macro_soil_data, micro_soil_data)

ValueError: attempt to get argmax of an empty sequence

In [None]:
transform_irrigated_area_data(init_irrigated_area_data)