# Plotly Notebook:
###### The goal is to display the **MSE** and an animated graph of **prediction** with Plotly for each $k$.
`pip install plotly`

`conda install -c plotly plotly=5.13.1`

`-m pip install statsmodels`

In [15]:
import pandas as pd

import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

import plotly.express as px

import seaborn as sns

Open the **.xlsx**

In [16]:
file = "SURF_data.xlsx"
data = pd.read_excel(file , decimal = ',')
data.drop('refroid', axis = 1, inplace=True)

In [17]:
data.head(3)

Unnamed: 0,compacité,Surface,mur,toit,hauteur,orientation,vitrage,or_vitre,chauff
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55



#### Standardisation et One Hot encoding

In [18]:
var_quanti = ['compacité', 'Surface', 'mur', 'toit', 'hauteur', 'vitrage', 'chauff']
data[var_quanti] = StandardScaler().fit_transform(data[var_quanti])
#
var_quali = ['orientation','or_vitre']
data_quali_encoded = pd.get_dummies(data[var_quali])
#
df = pd.concat([data[var_quanti], data_quali_encoded], axis = 1)
#
X_num = df.drop('chauff', axis=1)
Y_num = df['chauff']

#### **KPPV** et regression :
(70% TRAIN - 30% VALIDATION)

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X_num, Y_num, test_size= 0.30)

In [20]:
def regression_simple(X_train : pd.Series, X_test : pd.Series, Y_train : pd.Series, k_neighbors : int = 20) -> np.array([]):
	KNN_Reg = KNeighborsRegressor(n_neighbors=k_neighbors)
	KNN_Reg.fit(X_train, Y_train)
	Y_pred = KNN_Reg.predict(X_test)

	return Y_pred

In [21]:
def MSE(Y_pred : pd.Series, Y_test : pd.Series) -> float:
	return np.sum(np.power(Y_pred - Y_test, 2)).sum()

***

In [22]:
Y_pred = regression_simple(X_train, X_test, Y_train)
df_result_scatter = pd.DataFrame({'obs' : Y_test, 'pred': Y_pred})

In [23]:
px.scatter(df_result_scatter,title="estimation Y par rapport à la valeur réel",template="plotly_dark")

`px.scatter(df_result_scatter)` donne un scatter plot avec les valeurs observées et obtenues.
Chaque point 'obs' à un point 'pred' relié : *ctd* qu'ils sont sur la <u>même abscisse, mais ont des ordonnées différentes dû à l'erreur d'estimation<u>.

***
### Création du DataFrame $\forall k$:
Le df doit contenir la valeur du **MSE** et la **régression** pour chaque $k$.
Le but est de vor l'évolution des predictions et de leurs précisions en fonction du nombre d'itérations $k$ : [exemple](#Plotly-:-Gapminder-example)


##### Faire un df pandas avec comme colonne :
1. index
2. $k_{n}$
3. MSE
4. valeur réelle
5. valeur prédite

In [51]:
def regression_k(kmax : int = 80) -> pd.DataFrame :
	"""
	regression_k renvoie une Série avec toutes les erreurs pour chaque k et renvoie une liste de np.array() qui contiennent les approximations de Y avec regression_simple
	:param kmax: le nombre d'itérations
	:return: mse, res
	"""
	# Création du dataframe
	columns_mane	= ['k_n', 'MSE', 'val_obs_test', 'val_pred']
	types 			= {'k_n': int, 'MSE': float, 'val_obs_test': float,'val_pred' : float}


	df = pd.DataFrame(columns=columns_mane) #, dtype=types, index=range(1, kmax+1))

	for k in range(1, kmax+1):
		mse = MSE(Y_pred, Y_test)
		prediction = regression_simple(X_train, X_test, Y_train, k)

		df.loc[k] = [k, mse, Y_test, prediction]

	return df

In [52]:
test = regression_k()
test

Unnamed: 0,k_n,MSE,val_obs_test,val_pred
1,1,20.25621,234 -0.692924 347 0.604227 544 1.89344...,"[-1.0886151502170536, 0.26109684566797015, 0.6..."
2,2,20.25621,234 -0.692924 347 0.604227 544 1.89344...,"[-1.0058076912483545, 0.5943100937336335, 1.34..."
3,3,20.25621,234 -0.692924 347 0.604227 544 1.89344...,"[-0.9081246408481526, 0.6062105668788358, 1.53..."
4,4,20.25621,234 -0.692924 347 0.604227 544 1.89344...,"[-0.8592831156480516, 0.627284321406798, 1.266..."
5,5,20.25621,234 -0.692924 347 0.604227 544 1.89344...,"[-0.8014170649795055, 0.5623771574606741, 1.20..."
...,...,...,...,...
76,76,20.25621,234 -0.692924 347 0.604227 544 1.89344...,"[-0.9001648945558574, 0.8780685246717569, 1.18..."
77,77,20.25621,234 -0.692924 347 0.604227 544 1.89344...,"[-0.8938157386140405, 0.8792388463216309, 1.19..."
78,78,20.25621,234 -0.692924 347 0.604227 544 1.89344...,"[-0.8951434623425422, 0.8709198092753221, 1.18..."
79,79,20.25621,234 -0.692924 347 0.604227 544 1.89344...,"[-0.8963999130869513, 0.8723518444647358, 1.18..."


***
### Plots :
Linegraph de l'erreur en fonction du $k$

In [56]:
import plotly.graph_objs as go

df = regression_k()

# Create the figure object
fig = go.Figure()

# Add a trace for the actual values
fig.add_trace(go.Scatter(x=df['k_n'], y=df['val_obs_test'], name='Actual'))

# Add a trace for the predicted values
fig.add_trace(go.Scatter(x=df['k_n'], y=df['val_pred'], name='Predicted'))

# Set the layout properties
fig.update_layout(
    title='K-Nearest Neighbors Regression',
    xaxis_title='Number of Neighbors (k)',
    yaxis_title='Value',
    xaxis=dict(range=[1, 80], autorange=False),
    yaxis=dict(range=[0, 100], autorange=False),
    updatemenus=[dict(
        type='buttons',
        showactive=False,
        buttons=[dict(
            label='Play',
            method='animate',
            args=[None, dict(frame=dict(duration=500, redraw=True), fromcurrent=True)]
        ), dict(
            label='Pause',
            method='animate',
            args=[[None], dict(frame=dict(duration=0, redraw=False), mode='immediate', transition=dict(duration=0))]
        )]
    )]
)

# Create the frames for the animation
frames = [go.Frame(data=[go.Scatter(x=df['k_n'][:i], y=df['val_obs_test'][:i], name='Actual'),
                         go.Scatter(x=df['k_n'][:i], y=df['val_pred'][:i], name='Predicted')]) for i in range(1, len(df))]

# Add the frames to the figure
fig.frames = frames

# Add a slider to control the animation
fig.update_layout(
    updatemenus=[dict(
        type='buttons',
        showactive=False,
        buttons=[dict(
            label='Play',
            method='animate',
            args=[None, dict(frame=dict(duration=500, redraw=True), fromcurrent=True)]
        ), dict(
            label='Pause',
            method='animate',
            args=[[None], dict(frame=dict(duration=0, redraw=False), mode='immediate', transition=dict(duration=0))]
        )]
    ), dict(
        type='slider',
        active=len(df)-1,
        steps=[dict(
            label=str(k),
            method='animate',
            args=[[f.name] for f in fig.frames[:i+1]]
        ) for i, k in enumerate(df['k_n'])]
    )]
)

# Show the figure
fig.show()

ValueError: 
    Invalid value of type 'builtins.str' received for the 'type' property of layout.updatemenu
        Received value: 'slider'

    The 'type' property is an enumeration that may be specified as:
      - One of the following enumeration values:
            ['dropdown', 'buttons']

<br/>
<br/>
<br/>
<br/>

***
### Plotly : Gapminder example
[plotly demo](https://plotly.com/python/animations/)
***

In [14]:
df = px.data.gapminder()
print("df.size \t : {}".format(df.size))
print("df.shape \t : {}".format(df.shape))
print("df.columns \t : {} ".format(df.columns.to_list()))

df.size 	 : 13632
df.shape 	 : (1704, 8)
df.columns 	 : ['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap', 'iso_alpha', 'iso_num'] 


In [15]:
df.tail()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306,ZWE,716
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786,ZWE,716
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996,ZWE,716
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623,ZWE,716
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298,ZWE,716


In [28]:
df = px.data.gapminder()
px.scatter(df, x="gdpPercap", y="lifeExp", animation_frame="year",
           #animation_group="country",
           #size="pop",
           #color="continent", hover_name="country",
           log_x=True,
           size_max=55, range_x=[100,100000], range_y=[25,90]
           )

***


# tests

In [58]:
nb_records = 3
nb_k = 8

pattern_k = [i for i in range(1, nb_k+1) for j in range(nb_records)]
k = np.repeat(pattern_k,1)

pattern_mse = [np.abs(np.random.randn(1)) for i in range(1, nb_k+1, nb_k) for j in range(nb_k)]
mse = np.repeat(pattern_mse,nb_records)


obs = np.sort(np.abs(np.random.randn(nb_records*nb_k)*10))
pred = np.abs(np.random.randn(nb_records*nb_k)*10)

data = {
	'k_n'	: k,
	'mse'	: mse,
	'obs'	: obs,
	'pred'	: pred
}

"""
print(k.shape)
print(mse.shape)
print(pred.shape)
print(obs.shape)
print(df_test.shape)
"""

df_test = pd.DataFrame(data)
df_test.head(200)

Unnamed: 0,k_n,mse,obs,pred
0,1,0.390154,0.192464,1.693597
1,1,0.390154,1.1878,4.737395
2,1,0.390154,1.448105,13.683579
3,2,1.173389,3.025048,12.469756
4,2,1.173389,3.109643,0.07359
5,2,1.173389,3.315368,19.935769
6,3,1.513044,3.415407,15.100026
7,3,1.513044,3.796096,5.833252
8,3,1.513044,5.295615,12.625375
9,4,1.017492,6.514429,19.640705


In [9]:
import pandas as pd
import plotly.express as px
import statsmodels as sm


px.scatter(df_test, x="obs", y="pred", animation_frame="k_n",
           #animation_group="mse",
           size="obs",
           color="mse", hover_name="mse",
           log_x=False,
           size_max=55,range_x=[0.1,30], range_y=[-0.5,30],
           trendline="ols"
           )