# GA-ANN house price - Octiba Nima Group


This notebook runs the main function and visualizes the results for presentation purposes. A general description of the main function is given below. More detailed descriptions/comments are available in the .py files.

In [None]:
from runGA import run
import pandas as pd
import numpy as np
from data_module.data import data_cleaning
from ga_module.ga import GA
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import seaborn as sns
from matplotlib.legend_handler import HandlerLine2D
import time

### Run the Data Cleaning and calling function run from runGA
- Calling the function data_cleaning from data.py will return a dataframe which has been cleaned.
    - Input Argument : file name, and normalize = True / False
    - Output : if normalize set to True, then return the normalized cleaned Dataframe, otherwise, the cleaned dataframe without normalization.
- Calling function run from runGA.py
    - input Argument : 
         - **data :** cleaned dataframe
         - **target :** target column name, in this case 'SalePrice'
         - **init_ratio_ :** the probability of activating a feature in the initial population: 1 activates all features, 0.5 about half, etc.
         - **cross_rate :** cross ratio
         - **mutate_rate :** mutation ratio
         - **pop_size :** Population size
         - **n_generations :** number of generations
         - **elitism :** used only when ga_evolve is set to 'elitism': the percentage of best individuals in the mating pool that are not crossed over (calculated against the population size)
         - **ga_ann_iterations :** number of iterations, for ANN regressor which will be called within the GA
         - **ga_ann_layers :** number of hidden layers, for ANN regressor which will be called within the GA
         - **mape_ann_iterations :** number of iterations, for ANN regressor outside of the GA
         - **mape_ann_layers :** number of hidden layers, for ANN regressor outside the GA
         - **ga_score :** 'default' or 'score'(the method to be used for calculating fitness score)
         - **ga_evolve :** 'default','evolve2','elitism'(to determine which alternative of evolve function used)
         - **final_mape_idx :** 'default' or 'best'
         
    - output : 
         - **evolution :** Best Mean Squared Error (MSE) from each generation
         - **bestFeatures :** List of features from best individual from each generation(best individual ranked by the fitness score)
         - **bestPredictions :** List of predictions from the best individual from each generation
         - **initial_mape :** Mean absolute percentage error (MAPE) without the Feature Selection process (ANN only)
         - **final_mape :** Mean absolute percentage error (MAPE) with the Feature Selection process (GA_ANN)
         - **mape_y_test :** y_test values
         - **final_mape_prediction :** the predictions for SalePrice from GA_ANN (with features selection)
         - **initial_mape_prediction:** the predictions for SalePrice from ANN only (without feature selection)


In [None]:
# Unique tag for this run: the current Unix time rounded to whole seconds, as a
# string. The later cells prefix every file they write with it.
unique_no = str(int(round(time.time())))

# Cleaned (non-normalized) housing data and the name of the column to predict
data = data_cleaning('data_module/AmesHousing.csv', normalize=False)
target = 'SalePrice'

# Genetic-algorithm settings (see the argument list in the markdown cell above)
init_Ratio, cross_rate, mutate_rate = 0.5, 0.5, 0.002
pop_size, n_generations = 120, 100
elitism = 0.05

# ANN settings: (iterations, hidden layers) for the regressor used inside the GA
# and for the regressor used outside of it for the MAPE computation
ga_ann_iterations, ga_ann_layers = 100, 2
mape_ann_iterations, mape_ann_layers = 1000, 4

# Run the GA-ANN. `run` hands back nine values; the last one, train_y_test,
# is only stored to disk with np.save in the final cell.
(evolution, bestFeatures, bestPredictions,
 initial_mape, final_mape, mape_y_test,
 final_mape_prediction, initial_mape_prediction,
 train_y_test) = run(
    data, target,
    init_Ratio, cross_rate, mutate_rate, pop_size, n_generations, elitism,
    ga_ann_iterations, ga_ann_layers, mape_ann_iterations, mape_ann_layers,
    ga_score='score', ga_evolve='elitism', final_mape_idx='best',
)



### Visualization of prediction result without feature selection approach (ANN only)
    - Print the Mean Absolute Percentage Error
    - Plot the predictions vs real price (y test)

In [None]:
## Report the MAPE of the plain ANN and plot its predictions against the real prices
print('Mape for prediction result without feature selection approach (ANN only) = ',initial_mape,'%')

# Global font sizes; these rc settings stay in effect for the plots in later cells too
plt.rc('axes', titlesize=14, labelsize=13)   # axes title; x and y labels
plt.rc('xtick', labelsize=11)                # x tick labels
plt.rc('ytick', labelsize=11)                # y tick labels
plt.rc('legend', fontsize=13)                # legend text

## Data for the scatter plot (x, y and idx are reused by the GA-ANN plotting cell)
x = np.arange(len(mape_y_test), dtype=float)   # one x position per test sample
y = np.array(list(mape_y_test))                # real prices
pi = np.array(list(initial_mape_prediction))   # ANN-only predictions
idx = np.argsort(y)                            # order that sorts the real prices ascending

# Single axes placed in the bottom slot of a 3-row grid on a tall figure
fig2 = plt.figure(figsize=(15,25))
ax4 = fig2.add_subplot(3, 1, 3)
ax4.plot(x, y[idx], 'ro', label='Real Price (Y test)')
ax4.plot(x, pi[idx], 'bo', label='Predictions')
ax4.set_title('Plot of Prediction Vs Ytest without feature selection approach (ANN Only)',
              fontweight="bold")
ax4.set_ylabel('SalePrice')
ax4.legend()
plt.savefig('ga_module/results/'+unique_no+'predictions_ANN.png')


### Visualization of prediction result with feature selection approach (GA-ANN)
    - Print the Mean Absolute Percentage Error 
    - Plot the predictions vs real price (y test)
    - Barplot the occurrence of best features (from the best individual of each generation)
    - Illustrate the frequency of two features occurring together using a heatmap

In [None]:
## Report the MAPE of the GA-ANN and draw three stacked plots:
##   1. predictions vs. real prices, 2. lowest RMSE per generation,
##   3. number of features of the best individual per generation
print('Mape for prediction result with feature selection approach (GA-ANN):',final_mape,'%')

# Rebuild the x positions, the real prices and their sort order here instead of
# relying on variables left behind by the ANN-only plotting cell, so this cell
# gives the same result regardless of whether that cell has been executed.
x = np.arange(0., len(mape_y_test), 1)
y = np.array(list(mape_y_test))
idx = y.argsort()
p = np.array(list(final_mape_prediction))

fig = plt.figure(figsize=(15,25))

# 1) Real prices (sorted ascending) against the GA-ANN predictions in the same order
ax = fig.add_subplot(3,1,1)
ax.plot(x, y[idx], 'ro',label='Real Price (Y test)')
ax.plot(x, p[idx], 'bo',label='Predictions')
ax.set_title('Plot of Prediction Vs Ytest with feature selection approach (GA-ANN)',fontweight="bold")
ax.set_ylabel('SalePrice')
ax.legend()

# 2) Lowest RMSE per generation. Note: evolution holds the best MSE of each
#    generation, hence the square root.
ax2 = fig.add_subplot(3,1,2)
ax2.plot(np.sqrt(evolution), color='b',linestyle='dashed',marker='o',label='test')
ax2.set_title('Plot of the lowest RMSE for each generation',fontweight="bold")
ax2.set_ylabel('Root Mean Square Error(RMSE)')
ax2.set_xlabel('Generation no.')
ax2.grid()

# 3) Size of the feature set of the best individual in each generation
num_features = [len(features) for features in bestFeatures]
ax3 = fig.add_subplot(3,1,3)
ax3.plot(num_features, color='r',linestyle='dashed',marker='o',label='test')
ax3.set_title('Plot of number of features for each generation from the best individual',fontweight="bold")
ax3.set_ylabel('Number of Features')
ax3.set_xlabel('Generation no.')
ax3.grid()

fig.savefig('ga_module/results/'+unique_no+'predictions_GA_ANN.png')

# Trailing print so the notebook does not echo the repr of the last expression
print('')

In [None]:
## Horizontal bar plot of how often each feature appears among the best
## individuals (one best individual per generation)

# Distinct feature names over all generations, and how many times each one appears
unique, counts = np.unique(np.concatenate(bestFeatures), return_counts=True)

# Feature name -> number of occurrences
features_dict = {name: count for name, count in zip(unique, counts)}

# Single-row dataframe with one column per feature
features_dataframe = pd.DataFrame([features_dict])

sns.set(font_scale=1)

# Collapse the single row into a Series indexed by feature, sorted ascending by count
features_dataframe_plot = features_dataframe.sum(axis=0).sort_values(ascending=True)

feature_plot = features_dataframe_plot.plot.barh(
    title='Features occurences', alpha=1, figsize=(10,20))
feature_fig = feature_plot.get_figure()
feature_fig.savefig('ga_module/results/'+unique_no+'featuresocc.png')

In [None]:
# Heatmap of how often two features occur together in the best individuals

# Membership table: one row per generation, one 0/1 column per feature
# (1 = the feature is used by that generation's best individual),
# plus a 'GenerationNo' column holding the row's generation number.
df_feat = pd.DataFrame(0, index = np.arange(len(bestFeatures)), columns = unique)
df_feat['GenerationNo']=0
for generation, selected in enumerate(bestFeatures):
    df_feat.loc[generation, selected] = 1
    df_feat.loc[generation, 'GenerationNo'] = generation

# Co-occurrence counts for every pair of features in one step: entry (j, k) of
# M^T . M is the number of generations in which features j and k are both
# active. This replaces a nested loop of scalar .loc writes that computed the
# same sum twice per pair.
membership = df_feat[unique].values
co_occurrence = membership.T.dot(membership)

# A feature paired with itself is not a co-occurrence; keep the diagonal at 0
np.fill_diagonal(co_occurrence, 0)

heatmap_df = pd.DataFrame(co_occurrence, index = unique, columns = unique)

# Draw the co-occurrence intensities; the very large font scale matches the
# very large figure size
sns.set(font_scale=6)
colormap = sns.diverging_palette(220, 10, as_cmap=True)
plt.figure(figsize=(120,100))
sns.heatmap(heatmap_df,cmap=colormap,cbar=True)
plt.savefig('ga_module/results/'+unique_no+'heatmap.png')


### Saving parameters and results to dataframes and to Excel / .npy files

In [None]:

# Column layouts of the two result tables. This cell only creates zero-filled
# placeholders; the following cells fill them in and write them to Excel.
list_columns = [
    'unique', 'generation_no',
    'init_Ratio', 'cross_rate', 'mutate_rate', 'pop_size', 'n_generations', 'elitism',
    'ga_ann_iterations', 'ga_ann_layers', 'mape_ann_iterations', 'mape_ann_layers',
    'evolution', 'bestFeatures', 'bestPredictions',
    'initial_mape', 'final_mape',
]
list_columns_2 = [
    'unique',
    'mape_y_test',
    'final_mape_prediction',
    'initial_mape_prediction',
]

# df_result has one row per entry of bestFeatures (i.e. per generation);
# df_predictions has one row per entry of mape_y_test.
df_result = pd.DataFrame(data=0, index=np.arange(len(bestFeatures)), columns=list_columns)
df_predictions = pd.DataFrame(data=0, index=np.arange(len(mape_y_test)), columns=list_columns_2)


In [None]:
# Fill the result tables created in the previous cell.
# Columns are addressed with bracket indexing rather than attribute access:
# `df.name = value` only writes a column when `name` is already a column and
# does not collide with a DataFrame attribute; otherwise it silently sets a
# plain Python attribute and the data never reaches the exported file.

# Run identifier and the run parameters: scalars, repeated on every row
df_result['unique'] = unique_no
df_result['init_Ratio'] = init_Ratio
df_result['cross_rate'] = cross_rate
df_result['mutate_rate'] = mutate_rate
df_result['pop_size'] = pop_size
df_result['n_generations'] = n_generations
df_result['elitism'] = elitism
df_result['ga_ann_iterations'] = ga_ann_iterations
df_result['ga_ann_layers'] = ga_ann_layers
df_result['mape_ann_iterations'] = mape_ann_iterations
df_result['mape_ann_layers'] = mape_ann_layers

# Per-generation results: one entry per row
df_result['evolution'] = evolution
df_result['bestFeatures'] = bestFeatures
df_result['bestPredictions'] = bestPredictions

# Overall MAPE values, assigned as given by `run`
df_result['initial_mape'] = initial_mape
df_result['final_mape'] = final_mape

# Generation counter 0 .. len(bestFeatures) - 1
df_result['generation_no'] = np.arange(len(bestFeatures))

# Prediction table: run identifier, real y_test values and both sets of predictions
df_predictions['unique'] = unique_no
df_predictions['mape_y_test'] = mape_y_test
df_predictions['final_mape_prediction'] = final_mape_prediction
df_predictions['initial_mape_prediction'] = initial_mape_prediction


In [None]:
# Persist the outcome of this run; every file name starts with the run's unique tag.
results_prefix = 'ga_module/results/' + unique_no

# Tables -> Excel workbooks in ga_module/results/
df_predictions.to_excel(results_prefix + 'predictions.xlsx')
df_result.to_excel(results_prefix + 'results.xlsx')

# Raw arrays -> NumPy binary files (np.save appends the .npy extension)
np.save('ga_module/y_tests/' + unique_no, train_y_test)
np.save('ga_module/predictions/' + unique_no, bestPredictions)