In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px 

In [None]:
# Load the raw crop-production records. na_filter=True is spelled out so it is
# obvious that empty/NA-like cells are parsed as NaN.
df = pd.read_csv(
    'crop_production.csv',
    na_filter=True,
)
# Preview the first five rows.
df.head(5)

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [None]:
# Show the loaded frame; the rendered output is truncated to the first and
# last rows, which is enough to eyeball the column layout.
df

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0
...,...,...,...,...,...,...,...
246086,West Bengal,PURULIA,2014,Summer,Rice,306.0,801.0
246087,West Bengal,PURULIA,2014,Summer,Sesamum,627.0,463.0
246088,West Bengal,PURULIA,2014,Whole Year,Sugarcane,324.0,16250.0
246089,West Bengal,PURULIA,2014,Winter,Rice,279151.0,597899.0


In [None]:
# Distinct-value count per column. `nunique` has to be *called*: printing the
# bare attribute only shows the bound-method repr (a dump of the frame).
print(df.nunique())
# Missing-value count per column (last expression, so it is rendered).
df.isna().sum()

<bound method DataFrame.nunique of                          State_Name District_Name  ...      Area Production
0       Andaman and Nicobar Islands      NICOBARS  ...    1254.0     2000.0
1       Andaman and Nicobar Islands      NICOBARS  ...       2.0        1.0
2       Andaman and Nicobar Islands      NICOBARS  ...     102.0      321.0
3       Andaman and Nicobar Islands      NICOBARS  ...     176.0      641.0
4       Andaman and Nicobar Islands      NICOBARS  ...     720.0      165.0
...                             ...           ...  ...       ...        ...
246086                  West Bengal       PURULIA  ...     306.0      801.0
246087                  West Bengal       PURULIA  ...     627.0      463.0
246088                  West Bengal       PURULIA  ...     324.0    16250.0
246089                  West Bengal       PURULIA  ...  279151.0   597899.0
246090                  West Bengal       PURULIA  ...     175.0       88.0

[246091 rows x 7 columns]>


State_Name          0
District_Name       0
Crop_Year           0
Season              0
Crop                0
Area                0
Production       3730
dtype: int64

In [None]:
# Median-impute the numeric columns.
df_numeric = df.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.values
# Only columns that actually contain at least one NaN need to be touched.
cols_with_gaps = [name for name in numeric_cols if df[name].isnull().sum() > 0]
for name in cols_with_gaps:
    # Replace each gap with the median of that same column.
    df[name] = df[name].fillna(df[name].median())
# Confirm that no missing values remain.
df.isna().sum()

State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

In [None]:
# info() prints its report directly, so it can run first.
df.info()
# describe() only *returns* a frame; it must be the last expression of the
# cell to be rendered, otherwise its result is silently discarded.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246091 entries, 0 to 246090
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State_Name     246091 non-null  object 
 1   District_Name  246091 non-null  object 
 2   Crop_Year      246091 non-null  int64  
 3   Season         246091 non-null  object 
 4   Crop           246091 non-null  object 
 5   Area           246091 non-null  float64
 6   Production     246091 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.1+ MB


In [None]:
# Box plot of Production to look for extreme values.
px.box(data_frame=df, y="Production")

In [None]:
# Remove the few extreme Production outliers.
# NOTE(review): the cap looks like it was picked by eye from the box plot of
# Production; confirm before reusing it on other data.
PRODUCTION_OUTLIER_CAP = 1001000000

# Keep only rows strictly below the cap. The explicit .copy() makes `df` an
# independent frame rather than a slice of the original, so later column
# assignments and in-place operations do not raise SettingWithCopyWarning.
df = df.loc[df.Production < PRODUCTION_OUTLIER_CAP].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246085 entries, 0 to 246090
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State_Name     246085 non-null  object 
 1   District_Name  246085 non-null  object 
 2   Crop_Year      246085 non-null  int64  
 3   Season         246085 non-null  object 
 4   Crop           246085 non-null  object 
 5   Area           246085 non-null  float64
 6   Production     246085 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 15.0+ MB


In [None]:
# Drop exact duplicate rows.
# The frame has no id column (State_Name is the first column), so the former
# `list(df.columns)[1:]` subset left State_Name out of the comparison and could
# discard rows that differ only by state. Comparing on every column avoids that.
# Re-binding `df` instead of using inplace=True also avoids mutating a slice
# (the source of the SettingWithCopyWarning).
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246076 entries, 0 to 246090
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State_Name     246076 non-null  int64  
 1   District_Name  246076 non-null  int64  
 2   Crop_Year      246076 non-null  int64  
 3   Season         246076 non-null  int64  
 4   Crop           246076 non-null  int64  
 5   Area           246076 non-null  float64
 6   Production     246076 non-null  float64
dtypes: float64(2), int64(5)
memory usage: 15.0 MB




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Column dtypes ahead of the label-encoding step.
# NOTE(review): the saved output already lists the text columns as int64,
# which suggests the cells were executed out of order; re-run from a fresh
# kernel to confirm the dtypes shown here.
df.dtypes

State_Name         int64
District_Name      int64
Crop_Year          int64
Season             int64
Crop               int64
Area             float64
Production       float64
dtype: object

In [None]:
from sklearn import preprocessing

# Text columns that are converted to integer codes.
categorical_cols = ['State_Name', 'District_Name', 'Season', 'Crop']

# One fitted encoder per column. Re-fitting a single shared encoder would keep
# only the last column's mapping, making the other codes impossible to decode
# with inverse_transform later.
label_encoders = {}
encoded_cols = {}
for col in categorical_cols:
    label_encoders[col] = preprocessing.LabelEncoder()
    encoded_cols[col] = label_encoders[col].fit_transform(df[col])

# assign() returns a new frame, so nothing is written into a possible slice
# of another DataFrame (no SettingWithCopyWarning).
df = df.assign(**encoded_cols)

# Backward compatibility: `label_encoder` used to end up fitted on 'Crop'.
label_encoder = label_encoders['Crop']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [None]:
# Show the frame after encoding; the categorical columns now hold integer
# codes. The rendered output is truncated to the first and last rows.
df

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,0,427,2000,1,2,1254.0,2000.0
1,0,427,2000,1,74,2.0,1.0
2,0,427,2000,1,95,102.0,321.0
3,0,427,2000,4,7,176.0,641.0
4,0,427,2000,4,22,720.0,165.0
...,...,...,...,...,...,...,...
246086,32,471,2014,3,95,306.0,801.0
246087,32,471,2014,3,102,627.0,463.0
246088,32,471,2014,4,106,324.0,16250.0
246089,32,471,2014,5,95,279151.0,597899.0


In [None]:
# Persist the cleaned frame. index=True (the default) is spelled out: the row
# index is written as the first, unnamed column of the file.
df.to_csv('crop_prod_clean.csv', index=True)

In [None]:
# Colab-only convenience: trigger a browser download of the cleaned CSV.
# The import is guarded so that "Run All" still completes when the notebook is
# executed outside Google Colab, where the google.colab package is missing.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of 'crop_prod_clean.csv'.")
else:
    files.download('crop_prod_clean.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>