In [None]:
import pandas as pa
import spacy
from fuzzywuzzy import process



In [8]:
from mappings import *

In [9]:
# Directory the raw input CSV files are read from.
open_f = "datasets"
# Directory the transform functions write their output CSV files into.
save_f = "transformed_data"

In [None]:
# Load the four raw datasets from the `open_f` directory; each name holds the
# untransformed frame for one source file.
init_soil_data = pa.read_csv(f"{open_f}/soil.csv")
init_irrigated_area_data = pa.read_csv(f"{open_f}/irrigated_area.csv")
init_price_data = pa.read_csv(f"{open_f}/crop_prices.csv")
init_weather_data = pa.read_csv(f"{open_f}/weather.csv")

# Irrigated Area

In [4]:
def transform_irrigated_area_data(df, output_dir=None):
    """Summarise irrigated area per district for 2020 and write it to CSV.

    Keeps only rows whose ``Year`` equals 2020, builds an ``Address`` string
    from the state code, state name and district name, sums every column whose
    name contains ``'IRRIGATED AREA'`` into ``Total Area``, and writes the
    ``Dist Code``, ``Address`` and ``Total Area`` columns to
    ``irrigated_area.csv``.

    Args:
        df: Frame containing ``Year``, ``State Code``, ``State Name``,
            ``Dist Name`` and ``Dist Code`` columns plus the columns whose
            names contain ``'IRRIGATED AREA'``.
        output_dir: Directory to write the CSV into. Defaults to the
            notebook-level ``save_f`` when omitted.

    Returns:
        None. The result is written to disk and a completion message printed.
    """
    target_dir = save_f if output_dir is None else output_dir

    # Copy the filtered rows so the column assignments below operate on an
    # independent frame rather than on a slice of the caller's data.
    df_2020 = df[df['Year'] == 2020].copy()
    df_2020['Address'] = (
        df_2020['State Code'].astype(str)
        + ', ' + df_2020['State Name']
        + ', ' + df_2020['Dist Name']
    )

    area_columns = [col for col in df_2020.columns if 'IRRIGATED AREA' in col]
    df_2020['Total Area'] = df_2020[area_columns].sum(axis=1)

    df_final = df_2020[['Dist Code', 'Address', 'Total Area']]
    # The original file name lacked the dot before the extension.
    df_final.to_csv(f'{target_dir}/irrigated_area.csv', index=False)
    print("Transformation for irrigated area complete.")

# Soil nutrients

In [5]:
def transform_soil_nutrient_levels(df, output_dir=None):
    """Reduce per-level nutrient columns to one dominant level per nutrient.

    For every nutrient, the columns whose names start with the nutrient name
    are compared row by row; the column holding the largest value wins, and
    the text after ``" - "`` in that column's name becomes the nutrient's
    level for the row. The result, prefixed by the ``District`` column, is
    written to ``soil.csv``.

    Args:
        df: Frame with a ``District`` column and, per nutrient, columns named
            ``"<nutrient> - <level>"`` holding comparable numeric values.
        output_dir: Directory to write the CSV into. Defaults to the
            notebook-level ``save_f`` when omitted.

    Returns:
        None. The result is written to disk and a completion message printed.

    Raises:
        ValueError: If no column in ``df`` starts with one of the nutrient
            names.
    """
    target_dir = save_f if output_dir is None else output_dir

    macronutrient_categories = ["Nitrogen", "Phosphorous", "Potassium", "OC", "EC", "pH"]
    micronutrient_categories = ["Copper", "Boron", "S", "Fe", "Zn", "Mn"]

    combined_results = {}

    # Both groups are processed identically, so a single loop covers them;
    # the macro-then-micro order fixes the output column order.
    for nutrient in macronutrient_categories + micronutrient_categories:
        # NOTE(review): prefix matching means a short name such as "S" also
        # picks up any unrelated column starting with that letter - confirm
        # the input schema has none.
        columns = [col for col in df.columns if col.startswith(nutrient)]
        if not columns:
            raise ValueError(f"No columns found for nutrient {nutrient!r}")

        # Column-wise idxmax returns, per row, the label of the largest value.
        dominant_column = df[columns].idxmax(axis=1)
        # Rows with no usable value stay missing instead of failing on split.
        combined_results[nutrient] = dominant_column.map(
            lambda label: label.split(" - ")[1], na_action="ignore"
        )

    combined_df = pa.DataFrame(combined_results)

    other_columns = ['District']
    df_final = pa.concat([df[other_columns], combined_df], axis=1)

    df_final.to_csv(f'{target_dir}/soil.csv', index=False)
    print("Transformation for Soil Data complete.")

# Prices

In [13]:
# Download the small English spaCy model that the next cell loads with spacy.load.
# NOTE(review): `python` here is whichever interpreter is first on PATH, which may
# not be the kernel's interpreter - confirm the model lands in the right environment.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [14]:
# spaCy pipeline used by match_commodity_entity to extract named entities.
nlp = spacy.load("en_core_web_sm")

# Canonical commodity names; fuzzy matching maps free-text commodity/variety
# strings onto one of these entries.
standard_commodities = [
    'RICE', 'WHEAT', 'KHARIF SORGHUM', 'RABI SORGHUM', 'SORGHUM', 
    'PEARL MILLET', 'MAIZE', 'FINGER MILLET', 'BARLEY', 'CHICKPEA', 
    'PIGEONPEA', 'MINOR PULSES', 'PULSES', 'GROUNDNUT', 'SESAMUM', 
    'LINSEED', 'SUGARCANE', 'COTTON', 'FRUITS AND VEGETABLES', 'FODDER'
]

def match_commodity_entity(commodity, variety):
    """Map a commodity/variety pair onto one of the standard commodity names.

    The two values are joined with a space and run through the spaCy
    pipeline. If the pipeline reports any entities, the upper-cased text of
    the first one is fuzzy-matched against ``standard_commodities``;
    otherwise the whole upper-cased string is matched instead.

    Args:
        commodity: Commodity label, formatted into the query text.
        variety: Variety label, formatted into the query text.

    Returns:
        The entry of ``standard_commodities`` that scored best.
    """
    text = f"{commodity} {variety}"
    recognised = [span.text.upper() for span in nlp(text).ents]

    # Prefer the first recognised entity; fall back to the full text.
    query = recognised[0] if recognised else text.upper()
    return process.extractOne(query, standard_commodities)[0]

def transform_crop_price(df):
    """Drop unused price columns and standardise the ``Commodity`` column.

    Removes ``Market``, ``Grade``, ``Arrival_Date`` and ``Modal_x0020_Price``,
    then replaces each row's ``Commodity`` with the standard name returned by
    ``match_commodity_entity`` for that row's commodity and variety.

    Args:
        df: Price frame containing the dropped columns plus ``Commodity`` and
            ``Variety``.

    Returns:
        A new frame; the frame passed in is not modified.
    """
    unused_columns = ['Market', 'Grade', 'Arrival_Date', 'Modal_x0020_Price']

    def _standard_name(record):
        return match_commodity_entity(record['Commodity'], record['Variety'])

    trimmed = df.drop(columns=unused_columns)
    return trimmed.assign(Commodity=lambda frame: frame.apply(_standard_name, axis=1))

# Transform