In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


# Location of the crop-yield CSV. Set the CROP_YIELD_CSV environment variable to
# point at a different copy of the file; when it is unset, the original local
# path below is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "CROP_YIELD_CSV",
    r"C:\Users\Banesta\Downloads\archive\crop_yield.csv",
)

# Load the dataset into the DataFrame used by the rest of the notebook
data = pd.read_csv(DATA_PATH)

In [2]:
# Preview the loaded DataFrame: a bare name as the last expression of a cell is
# rendered as the cell's output.
data

Unnamed: 0,Region,Soil_Type,Crop,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Weather_Condition,Days_to_Harvest,Yield_tons_per_hectare
0,West,Sandy,Cotton,897.077239,27.676966,False,True,Cloudy,122,6.555816
1,South,Clay,Rice,992.673282,18.026142,True,True,Rainy,140,8.527341
2,North,Loam,Barley,147.998025,29.794042,False,False,Sunny,106,1.127443
3,North,Sandy,Soybean,986.866331,16.644190,False,True,Rainy,146,6.517573
4,South,Silt,Wheat,730.379174,31.620687,True,True,Cloudy,110,7.248251
...,...,...,...,...,...,...,...,...,...,...
999995,West,Silt,Rice,302.805345,27.987428,False,False,Sunny,76,1.347586
999996,South,Chalky,Barley,932.991383,39.661039,True,False,Rainy,93,7.311594
999997,North,Peaty,Cotton,867.362046,24.370042,True,False,Cloudy,108,5.763182
999998,West,Silt,Wheat,492.812857,33.045505,False,False,Sunny,102,2.070159


In [3]:
# Count the missing entries in every column and print the per-column totals
print(data.isna().sum())

Region                    0
Soil_Type                 0
Crop                      0
Rainfall_mm               0
Temperature_Celsius       0
Fertilizer_Used           0
Irrigation_Used           0
Weather_Condition         0
Days_to_Harvest           0
Yield_tons_per_hectare    0
dtype: int64


In [4]:
# Print the Index that holds the DataFrame's column labels
column_labels = data.columns
print(column_labels)

Index(['Region', 'Soil_Type', 'Crop', 'Rainfall_mm', 'Temperature_Celsius',
       'Fertilizer_Used', 'Irrigation_Used', 'Weather_Condition',
       'Days_to_Harvest', 'Yield_tons_per_hectare'],
      dtype='object')


In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Impute missing values column by column. Each result is assigned back to the
# frame (data[col] = ...) instead of calling fillna(inplace=True) on the
# data[col] selection: that form is chained assignment, which pandas 2.x reports
# with a FutureWarning and which does not update `data` under Copy-on-Write.

# Numerical columns: replace missing entries with the column mean
for column in ['Rainfall_mm', 'Temperature_Celsius', 'Yield_tons_per_hectare']:
    data[column] = data[column].fillna(data[column].mean())

# Categorical columns: replace missing entries with the mode (most frequent value)
for column in ['Soil_Type', 'Weather_Condition']:
    most_frequent = data[column].mode()
    # mode() returns an empty Series when every entry is missing; in that case
    # there is nothing to impute with, so the column is left untouched.
    if not most_frequent.empty:
        data[column] = data[column].fillna(most_frequent.iloc[0])

In [7]:
# Re-check the per-column count of missing entries after the imputation step
print(data.isna().sum())

Region                    0
Soil_Type                 0
Crop                      0
Rainfall_mm               0
Temperature_Celsius       0
Fertilizer_Used           0
Irrigation_Used           0
Weather_Condition         0
Days_to_Harvest           0
Yield_tons_per_hectare    0
dtype: int64


In [8]:
# Pull the 'Crop' column out of the frame as a plain Python list
crop_list = data.loc[:, 'Crop'].to_list()

# Show the first 5 crops
print(crop_list[0:5])

['Cotton', 'Rice', 'Barley', 'Soybean', 'Wheat']


In [9]:
# Pull the 'Region' column out of the frame as a plain Python list
region_list = data.loc[:, 'Region'].to_list()

# Show the first 5 regions
print(region_list[0:5])

['West', 'South', 'North', 'North', 'South']


In [10]:
# Mean yield per region, stored as a {region name: mean yield} dictionary
region_yield_dict = (
    data
    .groupby('Region')['Yield_tons_per_hectare']
    .mean()
    .to_dict()
)

# Show the resulting mapping
print(region_yield_dict)


{'East': 4.645594432991786, 'North': 4.654113791721815, 'South': 4.648843499024244, 'West': 4.649330523836857}


In [11]:
# Collect the distinct values of the 'Crop' column into a set
unique_crops = {crop for crop in data['Crop']}

# Show the distinct crops (set ordering is arbitrary)
print(unique_crops)

{'Barley', 'Soybean', 'Wheat', 'Rice', 'Cotton', 'Maize'}


In [12]:
# Collect the distinct values of the 'Soil_Type' column into a set
unique_soil_types = {soil_type for soil_type in data['Soil_Type']}

# Show the distinct soil types (set ordering is arbitrary)
print(unique_soil_types)

{'Peaty', 'Sandy', 'Silt', 'Chalky', 'Clay', 'Loam'}


In [13]:
# Keep only the region, crop and yield columns
selected_columns = data[
    ['Region', 'Crop', 'Yield_tons_per_hectare']
]

# Show the first five rows of the selection
print(selected_columns.head(5))


  Region     Crop  Yield_tons_per_hectare
0   West   Cotton                6.555816
1  South     Rice                8.527341
2  North   Barley                1.127443
3  North  Soybean                6.517573
4  South    Wheat                7.248251


In [14]:
# Take the rows at positions 10 through 20 inclusive (the iloc end bound, 21, is exclusive)
rows_10_to_20 = data.iloc[10:21, :]

# Show the selected rows
print(rows_10_to_20)

   Region Soil_Type    Crop  Rainfall_mm  Temperature_Celsius  \
10  North     Peaty   Wheat   385.135314            21.656192   
11   East     Sandy  Cotton   145.300681            19.755535   
12  South     Peaty  Cotton   607.150252            15.562163   
13   East      Clay  Barley   929.123735            29.677303   
14  North     Peaty  Barley   621.778388            26.843173   
15   East    Chalky    Rice   874.456744            27.256869   
16   East     Peaty   Wheat   787.084306            25.672920   
17  North      Clay  Cotton   416.898632            23.190810   
18  North     Sandy  Barley   977.259083            17.604100   
19  South      Clay   Maize   888.207630            39.945509   
20   East      Clay  Cotton   990.267439            24.072052   

    Fertilizer_Used  Irrigation_Used Weather_Condition  Days_to_Harvest  \
10            False            False             Sunny               73   
11             True             True            Cloudy              1

In [15]:
# Keep only the rows whose yield is strictly greater than 5 tons per hectare
high_yield = data.loc[data['Yield_tons_per_hectare'].gt(5)]

# Show the first five high-yield rows
print(high_yield.head(5))


  Region Soil_Type     Crop  Rainfall_mm  Temperature_Celsius  \
0   West     Sandy   Cotton   897.077239            27.676966   
1  South      Clay     Rice   992.673282            18.026142   
3  North     Sandy  Soybean   986.866331            16.644190   
4  South      Silt    Wheat   730.379174            31.620687   
5  South      Silt  Soybean   797.471182            37.704974   

   Fertilizer_Used  Irrigation_Used Weather_Condition  Days_to_Harvest  \
0            False             True            Cloudy              122   
1             True             True             Rainy              140   
3            False             True             Rainy              146   
4             True             True            Cloudy              110   
5            False             True             Rainy               74   

   Yield_tons_per_hectare  
0                6.555816  
1                8.527341  
3                6.517573  
4                7.248251  
5                5.89841

In [16]:
# Label-based selection with loc: rows whose Region equals 'South',
# restricted to the crop and yield columns
south_region_data = data.loc[
    data['Region'].eq('South'),
    ['Crop', 'Yield_tons_per_hectare'],
]

# Show the first five rows for the South region
print(south_region_data.head(5))


       Crop  Yield_tons_per_hectare
1      Rice                8.527341
4     Wheat                7.248251
5   Soybean                5.898416
7      Rice                5.829542
12   Cotton                6.525186


In [17]:
# Position-based selection with iloc: the first 5 rows of the first 3 columns
first_five_rows_columns = data.iloc[:5, :3]

# Show the resulting 5 x 3 slice
print(first_five_rows_columns)


  Region Soil_Type     Crop
0   West     Sandy   Cotton
1  South      Clay     Rice
2  North      Loam   Barley
3  North     Sandy  Soybean
4  South      Silt    Wheat


In [18]:
# The filtered frame keeps the row labels of the frame it was taken from;
# replace them with a fresh 0..n-1 index and discard the old labels (drop=True)
reset_index_data = high_yield.reset_index(drop=True)

# Show the first five rows under the new index
print(reset_index_data.head(5))


  Region Soil_Type     Crop  Rainfall_mm  Temperature_Celsius  \
0   West     Sandy   Cotton   897.077239            27.676966   
1  South      Clay     Rice   992.673282            18.026142   
2  North     Sandy  Soybean   986.866331            16.644190   
3  South      Silt    Wheat   730.379174            31.620687   
4  South      Silt  Soybean   797.471182            37.704974   

   Fertilizer_Used  Irrigation_Used Weather_Condition  Days_to_Harvest  \
0            False             True            Cloudy              122   
1             True             True             Rainy              140   
2            False             True             Rainy              146   
3             True             True            Cloudy              110   
4            False             True             Rainy               74   

   Yield_tons_per_hectare  
0                6.555816  
1                8.527341  
2                6.517573  
3                7.248251  
4                5.89841

In [19]:
# Columns treated as numerical features in the cells that use this list
numerical_cols = [
    'Rainfall_mm',
    'Temperature_Celsius',
    'Yield_tons_per_hectare',
]

# Summary statistics (count, mean, std, min, quartiles, max) for those columns
print(data.loc[:, numerical_cols].describe())


          Rainfall_mm  Temperature_Celsius  Yield_tons_per_hectare
count  1000000.000000       1000000.000000          1000000.000000
mean       549.981901            27.504965                4.649472
std        259.851320             7.220608                1.696572
min        100.000896            15.000034               -1.147613
25%        324.891090            21.254502                3.417637
50%        550.124061            27.507365                4.651808
75%        774.738520            33.753267                5.879200
max        999.998098            39.999997                9.963372


In [20]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler
scaler = MinMaxScaler()

# Min-max normalize the numerical columns into their own frame. The result is
# deliberately not written back into `data`: this cell leaves `data` as it found
# it, and the normalized values stay available under their own name instead of
# being lost when `data` is modified again.
data_normalized = pd.DataFrame(
    scaler.fit_transform(data[numerical_cols]),
    columns=numerical_cols,
    index=data.index,
)

# Display the normalized data
print(data_normalized.head())


   Rainfall_mm  Temperature_Celsius  Yield_tons_per_hectare
0     0.885643             0.507078                0.693316
1     0.991861             0.121045                0.870756
2     0.053330             0.591761                0.204757
3     0.985409             0.065766                0.689875
4     0.700422             0.664827                0.755636


In [21]:
from sklearn.preprocessing import StandardScaler

# Build the scaler and fit it on the numerical columns
standard_scaler = StandardScaler().fit(data[numerical_cols])

# Replace the numerical columns with their standardized values.
# NOTE(review): this overwrites the columns of `data` in place, so the values
# they held before this cell are no longer available from `data` afterwards.
data[numerical_cols] = standard_scaler.transform(data[numerical_cols])

# Show the first five standardized rows
print(data[numerical_cols].head(5))


   Rainfall_mm  Temperature_Celsius  Yield_tons_per_hectare
0     1.335747             0.023821                1.123645
1     1.703634            -1.312747                2.285709
2    -1.546977             0.317020               -2.075968
3     1.681287            -1.504137                1.101103
4     0.694233             0.569997                1.531783


In [22]:
# Summary statistics of the numerical columns as they currently stand in `data`
# (i.e. after the scaling cells have run)
print(data.loc[:, numerical_cols].describe())


        Rainfall_mm  Temperature_Celsius  Yield_tons_per_hectare
count  1.000000e+06         1.000000e+06            1.000000e+06
mean   1.408296e-16        -2.939657e-16            1.388731e-15
std    1.000001e+00         1.000001e+00            1.000001e+00
min   -1.731687e+00        -1.731840e+00           -3.416941e+00
25%   -8.662297e-01        -8.656427e-01           -7.260731e-01
50%    5.470826e-04         3.324143e-04            1.376680e-03
75%    8.649436e-01         8.653434e-01            7.248310e-01
max    1.731823e+00         1.730469e+00            3.132140e+00
