In [2]:

# 1. Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [3]:
# 2. Load Datasets
# NOTE(review): both paths are absolute Windows paths on one user's machine;
# the notebook will not run elsewhere until they are adjusted.
location_csv_path = r"C:\Users\Atharv\Documents\Capstone Project\location_table_maharashtra_full_updated.csv"
crop_csv_path = r"C:\Users\Atharv\Documents\Capstone Project\crop_master_fullset.csv"

# Read each CSV into its own DataFrame using pandas' default parsing options.
location_df = pd.read_csv(location_csv_path)
crop_df = pd.read_csv(crop_csv_path)


In [4]:
# 3. Inspect the Data
# For each table, print its (rows, columns) shape followed by its first five rows.
for _heading, _frame in (
    ("Location Data Shape:", location_df),
    ("\nCrop Data Shape:", crop_df),
):
    print(_heading, _frame.shape)
    print(_frame.head())


Location Data Shape: (5143, 3)
        State    District                  City
0   Rajasthan  Ganganagar                 1 Sgm
1   Rajasthan  Ganganagar                 3 Str
2  Tamil Nadu  Coimbatore  A.Thirumuruganpoondi
3  Tamil Nadu     Madurai        A.Vellalapatti
4     Gujarat   Porbandar             Aadityana

Crop Data Shape: (105, 8)
  Crop_Name  Crop_Type Ideal_Soil Sowing_Season  Temperature_Min_C  \
0    Potato  Vegetable      Loamy     June–July                 18   
1     Onion  Vegetable      Loamy     June–July                 18   
2    Tomato  Vegetable      Loamy     June–July                 18   
3  Eggplant  Vegetable      Loamy     June–July                 18   
4      Okra  Vegetable      Loamy     June–July                 18   

   Temperature_Max_C  Harvest_Days                          Growth_Stages  
0                 30            90  Seedling–Vegetative–Flowering–Harvest  
1                 30            90  Seedling–Vegetative–Flowering–Harvest  
2    

In [5]:
# 4. Check for Missing Values and Drop if Found
# Count the nulls in every column of each table and report them.
location_null_counts = location_df.isnull().sum()
crop_null_counts = crop_df.isnull().sum()
print("\nMissing in Location:", location_null_counts)
print("Missing in Crop:", crop_null_counts)

# Discard any row that contains at least one null, modifying each frame in place.
for _frame in (location_df, crop_df):
    _frame.dropna(inplace=True)



Missing in Location: State       0
District    0
City        0
dtype: int64
Missing in Crop: Crop_Name            0
Crop_Type            0
Ideal_Soil           0
Sowing_Season        0
Temperature_Min_C    0
Temperature_Max_C    0
Harvest_Days         0
Growth_Stages        0
dtype: int64


In [6]:
# 5. Remove Duplicates
# Drop rows that are exact repeats across all columns (the first occurrence is
# kept), modifying each frame in place.
for _frame in (location_df, crop_df):
    _frame.drop_duplicates(inplace=True)


In [7]:
# List the crop table's column labels so the names used by later cells can be checked.
crop_column_labels = crop_df.columns
print(crop_column_labels)

Index(['Crop_Name', 'Crop_Type', 'Ideal_Soil', 'Sowing_Season',
       'Temperature_Min_C', 'Temperature_Max_C', 'Harvest_Days',
       'Growth_Stages'],
      dtype='object')


In [8]:
# 6. Clean & Standardize Text Columns
# Trim surrounding whitespace and normalise letter case so that values which
# differ only in spacing or capitalisation end up identical.

# Location table: every text column is title-cased.
for _col in ('State', 'District', 'City'):
    location_df[_col] = location_df[_col].str.strip().str.title()

# Crop table: names, types and seasons are title-cased ...
for _col in ('Crop_Name', 'Crop_Type', 'Sowing_Season'):
    crop_df[_col] = crop_df[_col].str.strip().str.title()
# ... while Ideal_Soil only has its first letter upper-cased (rest lower-cased).
crop_df['Ideal_Soil'] = crop_df['Ideal_Soil'].str.strip().str.capitalize()

# Rows that differed only by whitespace or letter case are exact duplicates
# after the normalisation above, so de-duplicate again here; a de-duplication
# performed before this cell cannot catch them.
location_df.drop_duplicates(inplace=True)
crop_df.drop_duplicates(inplace=True)


In [9]:
# 7. (Optional) Encode Text Columns for ML
# Use one LabelEncoder per column. Fitting an encoder replaces its learned
# classes, so sharing a single instance would keep only the mapping of the
# last column fitted and the State codes could no longer be decoded with
# inverse_transform().
state_encoder = LabelEncoder()
crop_encoder = LabelEncoder()

location_df['State_Encoded'] = state_encoder.fit_transform(location_df['State'])
crop_df['Crop_Encoded'] = crop_encoder.fit_transform(crop_df['Crop_Name'])

# Backward compatibility: `le` previously ended up fitted on Crop_Name, so keep
# that name bound to the crop encoder for any later code that still uses it.
le = crop_encoder


In [10]:

# 8. Check Data Types
# Print the dtype of every column in each table under its own heading.
for _heading, _frame in (
    ("\nLocation Data Types:\n", location_df),
    ("\nCrop Data Types:\n", crop_df),
):
    print(_heading, _frame.dtypes)



Location Data Types:
 State            object
District         object
City             object
State_Encoded     int64
dtype: object

Crop Data Types:
 Crop_Name            object
Crop_Type            object
Ideal_Soil           object
Sowing_Season        object
Temperature_Min_C     int64
Temperature_Max_C     int64
Harvest_Days          int64
Growth_Stages        object
Crop_Encoded          int64
dtype: object


In [11]:
# 9. Save Cleaned Files (Optional)
# Write each table to a CSV at the given relative path, leaving out the
# DataFrame index so only the data columns are stored.
for _filename, _frame in (
    ("cleaned_location_table.csv", location_df),
    ("cleaned_crop_table.csv", crop_df),
):
    _frame.to_csv(_filename, index=False)
