# Preprocessing

Importing the required libraries

In [9]:
import numpy as np
import pandas as pd

The first step of preprocessing: **Data Cleaning**. Missing values are handled by dropping the tuples that have no production value and filling in the missing rainfall values with the mean.

In [10]:
def clean_crop_data(raw_df):
    """Drop rows without a usable Production value and impute missing Rainfall.

    Steps, in order:
      1. Keep only rows whose 'Production' is finite (NaN / inf rows are dropped).
      2. Replace every remaining NaN in the frame with 0.
      3. Strip surrounding whitespace from 'Season'.
      4. Treat a 'Rainfall' of 0 as missing and replace it with the mean of the
         non-zero rainfall of the same (State_Name, Crop_Year, Season) group.
      5. Rows whose group has no non-zero rainfall at all receive the mean of
         every rainfall value available after step 4.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain the columns 'Production', 'Rainfall', 'State_Name',
        'Crop_Year' and 'Season'.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; ``raw_df`` itself is not modified.
    """
    # Rows with a missing / non-finite production value carry no target, drop them.
    cleaned = raw_df[np.isfinite(raw_df['Production'])].fillna(0)

    # Season labels come with stray whitespace (e.g. 'Kharif     ').
    cleaned['Season'] = cleaned['Season'].astype(str).str.strip()

    # After fillna(0) a rainfall of 0 marks a missing reading; turn it back
    # into NaN so that the means below only use observed values.
    observed = cleaned['Rainfall'].where(cleaned['Rainfall'] != 0)

    # One pass over the data instead of re-scanning the whole frame for every
    # state x year x season combination.
    group_keys = [cleaned['State_Name'], cleaned['Crop_Year'], cleaned['Season']]
    group_mean = observed.groupby(group_keys).transform('mean')
    filled = observed.fillna(group_mean)

    # Fallback for groups without a single observation.
    overall_mean = filled.mean()
    if np.isnan(overall_mean):
        # No rainfall observed anywhere: nothing to impute from, keep 0.
        overall_mean = 0.0
    cleaned['Rainfall'] = filled.fillna(overall_mean)
    return cleaned


# A notebook kernel runs cells with __name__ == "__main__", so this always
# executes here; the guard only keeps the file I/O from firing when the helper
# above is imported from elsewhere.
if __name__ == "__main__":
    # Forward slashes work on Windows as well as on Linux / macOS.
    df = pd.read_csv("../data/0_Final_Merged.csv")
    df = clean_crop_data(df)

    # Storing the output of the first step of data preprocessing to csv file
    df.to_csv('../data/1_Preprocess_Data_Cleaning.csv', index=False)

KeyboardInterrupt: 

**Data Reduction**: Stratified Sampling used for Numerosity Reduction

In [3]:
# Share of every crop's rows that is kept.
SAMPLE_FRACTION = 0.7
# Fixed seed so that re-running the notebook yields the same sample.
RANDOM_SEED = 42


def stratified_sample(frame, strata_column, frac, seed=None):
    """Draw the same fraction of rows from every stratum, then shuffle.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to sample from.
    strata_column : str
        Column whose distinct values define the strata.
    frac : float
        Fraction of each stratum to keep (sampling without replacement).
    seed : int or None
        Seed for the random generator; ``None`` gives a different sample on
        every call.

    Returns
    -------
    pandas.DataFrame
        The sampled rows in random order, with the columns of ``frame``.
        Empty when ``frame`` has no rows.
    """
    rng = np.random.RandomState(seed)
    # Collect the per-stratum samples and concatenate once: DataFrame.append
    # was removed in pandas 2.0 and growing a frame inside a loop is quadratic.
    parts = [
        stratum.sample(frac=frac, random_state=rng)
        for _, stratum in frame.groupby(strata_column, sort=False)
    ]
    if not parts:
        return frame.iloc[0:0].copy()
    sampled = pd.concat(parts, ignore_index=True)
    # Shuffling the dataframe to ensure randomness
    return sampled.sample(frac=1, random_state=rng)


# A notebook kernel runs cells with __name__ == "__main__", so this always
# executes here; the guard only keeps the file I/O from firing when the helper
# above is imported from elsewhere.
if __name__ == "__main__":
    # Forward slashes work on Windows as well as on Linux / macOS.
    df = pd.read_csv("../data/1_Preprocess_Data_Cleaning.csv")

    # Stratified sampling based on distinct crop names (numerosity reduction)
    my_df = stratified_sample(df, 'Crop', SAMPLE_FRACTION, seed=RANDOM_SEED)

    # Storing the output in csv file
    my_df.to_csv('../data/2_Preprocess_Numerosity_Reduction.csv', index=False)

**Feature Construction**

In [4]:
def compute_yield(production, area):
    """Return the yield, i.e. ``production / area``, element by element.

    Parameters
    ----------
    production, area : pandas.Series
        Aligned series of equal length (any length, no hard-coded row count).

    Returns
    -------
    pandas.Series
        ``production / area`` for every row.

    Raises
    ------
    ZeroDivisionError
        If any area is 0, because the yield is undefined for such a row.
    """
    if (area == 0).any():
        raise ZeroDivisionError(
            "Area contains zeros; yield (Production / Area) is undefined for those rows"
        )
    return production / area


# A notebook kernel runs cells with __name__ == "__main__", so this always
# executes here; the guard only keeps the file I/O from firing when the helper
# above is imported from elsewhere.
if __name__ == "__main__":
    # Forward slashes work on Windows as well as on Linux / macOS.
    df = pd.read_csv("../data/2_Preprocess_Numerosity_Reduction.csv")

    # Creating a new feature that will be used for all the further analysis
    df['Yield'] = compute_yield(df['Production'], df['Area'])

    # Storing the output in csv file
    df.to_csv('../data/3_Preprocess_Feature_Construction.csv', index=False)

**Normalization**: Z-Score normalisation used to normalise 'Rainfall' and 'Yield'

In [5]:
def z_score(values):
    """Return the Z-scores of ``values`` using the population standard deviation.

    Parameters
    ----------
    values : pandas.Series
        Numeric series of any length.

    Returns
    -------
    pandas.Series
        ``(values - mean) / std`` with ``std`` computed with ``ddof=0``.
        When every value is identical (``std == 0``) the result is 0 for
        each row instead of a division by zero.
    """
    mean = values.mean()
    std = values.std(ddof=0)
    centered = values - mean
    if std == 0:
        # A constant column has no spread; its centred values are already 0.
        return centered
    return centered / std


# A notebook kernel runs cells with __name__ == "__main__", so this always
# executes here; the guard only keeps the file I/O from firing when the helper
# above is imported from elsewhere.
if __name__ == "__main__":
    # Forward slashes work on Windows as well as on Linux / macOS.
    df = pd.read_csv("../data/3_Preprocess_Feature_Construction.csv")

    # Normalizing Rainfall using Z-Score
    df['Rainfall_ZScore'] = z_score(df['Rainfall'])

    # Normalizing Yield using Z-Score
    df['Yield_ZScore'] = z_score(df['Yield'])

    # Storing the output in csv file
    df.to_csv('../data/4_Preprocess_Normalization_Z_Score.csv', index=False)

**Discretization**: 'Rainfall' and 'Yield' are discretized into five categories - _very low, low, medium, high, very high_

In [6]:
# Different categories for discretizing, ordered from lowest to highest
classes = ['Very_Low', 'Low', 'Medium', 'High', 'Very_High']

# Rainfall Z-score boundaries between consecutive categories. A value equal to
# a boundary belongs to the higher category.
RAINFALL_Z_EDGES = [-1.22, -0.5, 0.5, 2.5]

# The range [min yield Z-score, this anchor] is cut into equal-width bins and
# everything above the anchor lands in the highest category. The value is
# hand-picked; its rationale is not recorded in this notebook.
YIELD_Z_ANCHOR = -0.047


def discretize_rainfall(z_scores):
    """Map rainfall Z-scores to category labels using ``RAINFALL_Z_EDGES``.

    Parameters
    ----------
    z_scores : sequence of float (list, array or pandas.Series), any length.

    Returns
    -------
    list of str
        One label from ``classes`` per input value.
    """
    codes = np.digitize(np.asarray(z_scores, dtype=float), RAINFALL_Z_EDGES)
    return [classes[code] for code in codes]


def discretize_yield(z_scores):
    """Map yield Z-scores to category labels with equal-width bins.

    The bin width is ``(YIELD_Z_ANCHOR - min(z_scores)) / len(classes)``; values
    beyond the last bin are assigned to the highest category.

    Parameters
    ----------
    z_scores : sequence of float (list, array or pandas.Series), any length.

    Returns
    -------
    list of str
        One label from ``classes`` per input value; empty for empty input.

    Raises
    ------
    ValueError
        If the bin width is not positive (the smallest Z-score is not below
        ``YIELD_Z_ANCHOR``, or the data contains NaN), because the bins would
        be meaningless.
    """
    values = np.asarray(z_scores, dtype=float)
    if values.size == 0:
        return []
    lowest = values.min()
    width = (YIELD_Z_ANCHOR - lowest) / len(classes)
    if not width > 0:
        raise ValueError(
            "Cannot bin yield Z-scores: minimum %r is not below the anchor %r"
            % (float(lowest), YIELD_Z_ANCHOR)
        )
    top = len(classes) - 1
    codes = np.minimum((values - lowest) // width, top).astype(int)
    return [classes[code] for code in codes]


# A notebook kernel runs cells with __name__ == "__main__", so this always
# executes here; the guard only keeps the file I/O from firing when the helpers
# above are imported from elsewhere.
if __name__ == "__main__":
    # Forward slashes work on Windows as well as on Linux / macOS.
    df = pd.read_csv("../data/4_Preprocess_Normalization_Z_Score.csv")

    df['Rainfall_Disc'] = discretize_rainfall(df['Rainfall_ZScore'])
    df['Yield_Disc'] = discretize_yield(df['Yield_ZScore'])

    # Storing the output in csv file
    df.to_csv('../data/5_Preprocess_Discretization.csv', index=False)

**Binarization**: Discretized 'Rainfall' and 'Yield' are mapped to five binary variables corresponding to five categories. A binary variable represents the presence or absence of a record in that category

In [7]:
import pandas as pd

# Categories produced by the discretization step, lowest to highest.
CATEGORY_LABELS = ['Very_Low', 'Low', 'Medium', 'High', 'Very_High']


def add_indicator_columns(frame, source_column, prefix):
    """Add one 0/1 column per category of ``source_column`` to ``frame``.

    The new columns are named ``<prefix>_<category>`` and are appended in the
    order of ``CATEGORY_LABELS``. A row gets 1 in the column of its category
    and 0 elsewhere. Any value that is not one of the first four categories is
    counted as the last one ('Very_High').

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to extend; it is modified in place.
    source_column : str
        Column holding the category labels.
    prefix : str
        Prefix for the names of the new columns.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenience.
    """
    labels = frame[source_column]
    explicit = CATEGORY_LABELS[:-1]
    fallback = CATEGORY_LABELS[-1]
    for category in explicit:
        frame[prefix + '_' + category] = (labels == category).astype(int)
    frame[prefix + '_' + fallback] = (~labels.isin(explicit)).astype(int)
    return frame


# A notebook kernel runs cells with __name__ == "__main__", so this always
# executes here; the guard only keeps the file I/O from firing when the helper
# above is imported from elsewhere.
if __name__ == "__main__":
    # Forward slashes work on Windows as well as on Linux / macOS.
    df = pd.read_csv('../data/5_Preprocess_Discretization.csv')

    # Five binary columns for rainfall, then five for yield
    add_indicator_columns(df, 'Rainfall_Disc', 'Rainfall')
    add_indicator_columns(df, 'Yield_Disc', 'Yield')

    # Storing the output in csv file
    df.to_csv('../data/6_Preprocess_Binarization.csv', index=False)

**Final Preprocessed Data**: All the columns that are not important for our further analysis (Association Rule Mining, Clustering and Classification) are now removed.

In [8]:
# Columns that are not needed for association rule mining, clustering and
# classification.
UNUSED_COLUMNS = [
    'State_Name', 'District_Name', 'Crop_Year', 'Area', 'Production',
    'Rainfall', 'Yield', 'Rainfall_ZScore', 'Yield_ZScore',
]


def drop_unused_columns(frame):
    """Return a copy of ``frame`` without the columns in ``UNUSED_COLUMNS``.

    Raises ``KeyError`` if one of those columns is absent; ``frame`` itself is
    left untouched.
    """
    return frame.drop(columns=UNUSED_COLUMNS)


# A notebook kernel runs cells with __name__ == "__main__", so this always
# executes here; the guard only keeps the file I/O from firing when the helper
# above is imported from elsewhere.
if __name__ == "__main__":
    # Load the previous stage from disk, like every other stage does, instead
    # of depending on a `df` variable left in memory by an earlier cell.
    # Forward slashes work on Windows as well as on Linux / macOS.
    df = pd.read_csv('../data/6_Preprocess_Binarization.csv')
    df = drop_unused_columns(df)
    df.to_csv('../data/7_Preprocess_Final.csv', index=False)