### 2.1) Import pandas module for working with tables of data

In [None]:
import pandas as pd

### 2.2) Load data from "fruit.csv"

In [None]:
# Read the raw fruit table; the first CSV column is used as the row index.
fruit_original = pd.read_csv(filepath_or_buffer='./DATA/fruit.csv', index_col=0)

### 2.3) Inspect the data and check for missing values

In [None]:
# Quick look at the raw table: its size, a sample of rows, how many values are
# missing per column (absolute and relative to the row count), and the
# summary statistics reported by describe().
missing_per_column = fruit_original.isnull().sum()

print('\nData shape: ', fruit_original.shape)

print('\nFirst ten rows of data: \n', fruit_original.head(10))

print('\nNumber of missing values: \n', missing_per_column)

print('\nProportion of missing values: \n', missing_per_column / len(fruit_original))

print('\nStatistics: ', fruit_original.describe())

### 2.4) Try removing all instances with missing values, then replacing missing values with a constant

In [None]:
# Option 1: discard every row that has at least one missing entry and report
# how much of the data that costs.
fruit_without_NaN = fruit_original.dropna(axis=0, how='any')
print('\nNew data shape: \n', fruit_without_NaN.shape)
print(
    '\nProportion of data lost: \n',
    (len(fruit_original) - len(fruit_without_NaN)) / len(fruit_original),
)


# Option 2: keep every row and substitute the constant 0 for missing entries.
fruit_NaN_for_0 = fruit_original.fillna(value=0)
print('\nFirst ten rows of data: \n', fruit_NaN_for_0.head(10))

### 2.5) Encode "Fruit" column with numerical labels

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the raw table keeps its original "Fruit" values.
fruit_fruit_encoded = fruit_original.copy()

# Fit the encoder on the "Fruit" column and swap the column for its codes.
encoder = LabelEncoder()
fruit_labels = encoder.fit_transform(fruit_fruit_encoded['Fruit'])
fruit_fruit_encoded['Fruit'] = fruit_labels

print(fruit_fruit_encoded.head())
# classes_ lists the original category values known to the fitted encoder.
print('\nFruit categories: ', encoder.classes_)

### 2.6) Try using a simple imputer to impute missing values

In [None]:
from sklearn.impute import SimpleImputer

# Fill the gaps in each feature on a copy of the label-encoded table:
# numeric columns get the column mean, every other column gets its most
# frequent value.
fruit_simply_imputed = fruit_fruit_encoded.copy()

for feature in ['Color', 'Weight', 'Diameter']:

    column = fruit_simply_imputed[feature]

    # Pick the strategy from "is the column numeric?" rather than "is it
    # object dtype?": a text column is not guaranteed to have dtype 'O'
    # (pandas also has dedicated string and categorical dtypes), and a mean
    # cannot be computed for such columns.
    if pd.api.types.is_numeric_dtype(column):
        strategy = 'mean'
    else:
        strategy = 'most_frequent'

    imputer = SimpleImputer(strategy=strategy)

    # The imputer is fed a 2-D (n_rows, 1) array and hands back a 2-D result;
    # flatten it to 1-D before storing it as a single column instead of
    # relying on pandas to accept a 2-D value for one column.
    imputed = imputer.fit_transform(column.to_numpy().reshape(-1, 1))
    fruit_simply_imputed[feature] = imputed.ravel()

print(fruit_simply_imputed.head())

### 2.7) Encode "Color" attribute

In [None]:
# Replace the imputed "Color" values with label codes, again on a fresh copy
# so the imputed table stays available unchanged.
fruit_color_encoded = fruit_simply_imputed.copy()

# A new encoder is created here, so `encoder` now describes "Color", not "Fruit".
encoder = LabelEncoder()
color_labels = encoder.fit_transform(fruit_color_encoded['Color'])
fruit_color_encoded['Color'] = color_labels

print(fruit_color_encoded.head())
print('\nColor categories: ', encoder.classes_)

### 2.8) Plot histograms and boxplots of data

In [None]:
# Histograms of the encoded table, laid out side by side in a single row.
hist = fruit_color_encoded.hist(figsize=(12, 2.5), layout=(1, 4))

In [None]:
# Box plot of the encoded table; no column subset is selected.
box = fruit_color_encoded.boxplot(column=None, figsize=(12, 2.5))

### 2.9) Standardize data and re-plot histograms and box-plots

In [None]:
def standardize(data):
    """Return *data* shifted to zero mean and scaled by its standard deviation.

    Parameters
    ----------
    data
        Any object that provides ``mean()`` and ``std()`` and supports
        arithmetic with their results (e.g. a pandas Series or DataFrame).

    Returns
    -------
    The centred and scaled data, of the same kind as ``data - data.mean()``.
    Where the standard deviation is exactly zero (constant data) the centred
    values are returned unscaled, i.e. all zeros, instead of the NaN that a
    division by zero would produce.
    """
    centered = data - data.mean()
    spread = data.std()

    # Scalar spread: a single column / 1-D input.
    if getattr(spread, 'ndim', 0) == 0:
        if spread == 0:
            return centered
        return centered / spread

    # Per-column spread (e.g. DataFrame input): divide constant columns by 1
    # so they stay at zero rather than turning into NaN.
    return centered / spread.where(spread != 0, 1)


# Overwrite each listed column of fruit_color_encoded with its standardized
# version; only these three columns are touched.
for column_name in ('Color', 'Weight', 'Diameter'):
    fruit_color_encoded[column_name] = fruit_color_encoded[column_name].pipe(standardize)

In [None]:
# Same histogram view as before, now on the standardized columns.
hist = fruit_color_encoded.hist(figsize=(12, 2.5), layout=(1, 4))

In [None]:
# Same box-plot view as before, now on the standardized columns.
box = fruit_color_encoded.boxplot(column=None, figsize=(12, 2.5))

### 2.10) View a scatter-plot of each fruit with weight vs diameter

In [None]:
import matplotlib.pyplot as plt

# Split the table by encoded fruit label.
# NOTE(review): codes 0/1/2 are assumed to mean Apple/Banana/Orange - confirm
# against the "Fruit categories" printed when the labels were encoded.
fruit_codes = fruit_color_encoded['Fruit']
apples = fruit_color_encoded.loc[fruit_codes == 0]
bananas = fruit_color_encoded.loc[fruit_codes == 1]
oranges = fruit_color_encoded.loc[fruit_codes == 2]

# One scatter series per fruit, weight on the x-axis and diameter on the y-axis.
plt.figure()
for subset, marker_color, legend_name in (
    (apples, 'green', 'Apple'),
    (bananas, 'yellow', 'Banana'),
    (oranges, 'orange', 'Orange'),
):
    plt.scatter(subset['Weight'], subset['Diameter'], color=marker_color, label=legend_name)
plt.xlabel('Weight')
plt.ylabel('Diameter')
plt.legend()
plt.show()

### 2.11) Save encoded dataset

In [None]:
# Write the processed table to disk. At this point the Color, Weight and
# Diameter columns hold the standardized values assigned earlier.
fruit_color_encoded.to_csv(path_or_buf='./DATA/fruit_encoded.csv')