### Machine Learning for Engineers: [VisualizeData](https://www.apmonitor.com/pds/index.php/Main/VisualizeData)
- [Data Visualization and Exploration](https://www.apmonitor.com/pds/index.php/Main/VisualizeData)
 - Source Blocks: 15
 - Description: Data visualization and exploration is one of the first steps in machine learning after the data is gathered and summarized with statistics. It is used to graphically represent data to qualitatively understand relationships and data quality.
- [Course Overview](https://apmonitor.com/pds)
- [Course Schedule](https://apmonitor.com/pds/index.php/Main/CourseSchedule)


In [None]:
pip install pandas matplotlib plotly seaborn pandas-profiling

In [None]:
%%javascript
// Raise the notebook output area's auto-scroll threshold to 9999.
// NOTE(review): presumably this keeps long cell outputs fully expanded
// instead of collapsing them into a scroll box - confirm for the front-end in use.
IPython.OutputArea.auto_scroll_threshold = 9999

In [None]:
import numpy as np
import tclab
import time

filename = 'TCLab_ss1.txt'
n_points = 100   # number of steady state points to collect
wait_sec = 300   # seconds to wait after each heater change before sampling

# start a fresh data file containing only the column header
with open(filename, 'w') as fid:
    fid.write('Q1,Q2,T1,T2\n')

# Connect to the lab.
# NOTE: tclab.TCLabModel() is the simulated device from the tclab package;
# use tclab.TCLab() instead to collect from a physical Arduino.
a = tclab.TCLabModel()

# random heater values (the randint upper bound is exclusive)
Q1d = np.random.randint(0, 70, size=n_points)
Q2d = np.random.randint(0, 80, size=n_points)

# The messages are derived from n_points and wait_sec so that they always
# agree with the time actually spent in the loop below.
print('Wait {} seconds between heater points'.format(wait_sec))
print('Full data generation requires {:.1f} hrs!'.format(
    n_points * wait_sec / 3600.0))
try:
    for i, (q1, q2) in enumerate(zip(Q1d, Q2d)):
        # set heater values
        a.Q1(q1)
        a.Q2(q2)
        # wait for the temperatures to settle
        time.sleep(wait_sec)
        # read each sensor once so the printed and logged values are identical
        T1 = a.T1
        T2 = a.T2
        print('Set: {} Q1: {} Q2: {} T1: {} T2: {}'.format(i, q1, q2, T1, T2))
        # append one row per point so that completed points survive an interruption
        with open(filename, 'a') as fid:
            fid.write('{},{},{},{}\n'.format(q1, q2, T1, T2))
finally:
    # close the connection even if the run is interrupted or fails
    a.close()

In [None]:
import pandas as pd

# Read the TCLab_ss1.txt data file from the course site into a DataFrame
# and print its first rows as a quick sanity check.
data = pd.read_csv(
    'http://apmonitor.com/pds/uploads/Main/TCLab_ss1.txt'
)
preview = data.head()
print(preview)

In [None]:
# Print the summary statistics that DataFrame.describe() computes for `data`
# (`data` must already have been loaded by an earlier cell).
print(data.describe())

In [None]:
# Default pandas plot of `data` (all columns drawn on one set of axes).
data.plot()

In [None]:
# Density plot of each column in its own subplot, arranged on a 2x2 grid
# in a 10x6 figure.
data.plot(kind='density',subplots=True,layout=(2,2),figsize=(10,6))

In [None]:
# Box plot of each column in its own subplot, in a 12x3 figure.
data.plot(kind='box', subplots=True, figsize=(12,3))

In [None]:
# NOTE(review): pandas-profiling has been renamed ydata-profiling upstream;
# confirm which package is installed in the target environment.
from pandas_profiling import ProfileReport

# Build the explorative, non-minimal profile report for `data`.
profile = ProfileReport(data, explorative=True, minimal=False)
try:
    profile.to_widgets()          # view as widget in Notebook
except Exception:
    # Best-effort fallback when the widget view cannot be produced.
    # Catch Exception rather than using a bare except so that a kernel
    # interrupt (KeyboardInterrupt) or SystemExit still propagates.
    profile.to_file('data.html')  # save as html file

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of T1 against Q1. The label gives plt.legend() an entry to
# display; without a labelled artist the legend is empty and matplotlib warns.
plt.scatter(data['Q1'], data['T1'], label='T1')

# add axis labels and legend
plt.xlabel('Heater (%)')
# \u00b0 is the degree sign; the escape keeps the label correct even if the
# file is re-saved with a different encoding.
plt.ylabel('Temperature (\u00b0C)')
plt.legend()
plt.show()

In [None]:
import plotly.express as px

# Plotly scatter of column T1 (y axis) against column Q1 (x axis) of `data`.
fig = px.scatter(
    data,
    x='Q1',
    y='T1',
)
fig.show()

In [None]:
import seaborn as sns
# Grid of pairwise plots between the columns of `data`.
sns.pairplot(data)

In [None]:
import seaborn as sns
# Heat map of the column-to-column correlation matrix returned by data.corr().
sns.heatmap(data.corr())

In [None]:
import pandas as pd

# Read the PV_BYU_South.txt data file from the course site.
df = pd.read_csv(
    'http://apmonitor.com/pds/uploads/Main/PV_BYU_South.txt'
)

# Column names of interest for the analysis.
factors = [
    'Ambient Temperature (C)',
    'Wind Speed (m/s)',
    'Plane of Array Irradiance (W/m^2)',
    'Cell Temperature (C)',
    'DC Array Output (W)',
]

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
import plotly.express as px

# Read the PV_BYU_South.txt data file from the course site.
df = pd.read_csv('http://apmonitor.com/pds/uploads/Main/PV_BYU_South.txt')
factors = ['Ambient Temperature (C)',
           'Wind Speed (m/s)',
           'Plane of Array Irradiance (W/m^2)',
           'Cell Temperature (C)',
           'DC Array Output (W)']
print(df.columns)

irradiance = 'Plane of Array Irradiance (W/m^2)'
# Take only the subset of columns and drop rows where there is no sunlight.
# Selecting and filtering in a single .loc and copying afterwards makes
# `data` an independent frame, so the column added below is written to it
# directly rather than to a filtered view of another frame.
data = df.loc[df[irradiance] > 0.01, factors].copy()
# Ratio of DC output (W) to irradiance (W/m^2); divide by the PV cell area
# in m^2 to get a true (dimensionless) efficiency.
data['efficiency'] = data['DC Array Output (W)'] / data[irradiance]
print(data.head())
print(data.describe())
data.plot()

# Build the explorative, non-minimal profile report for the filtered data.
profile = ProfileReport(data, explorative=True, minimal=False)
try:
    profile.to_widgets()             # view as widget in Notebook
except Exception:
    # Best-effort fallback when the widget view is not available.
    # Catch Exception rather than using a bare except so that a kernel
    # interrupt (KeyboardInterrupt) or SystemExit still propagates.
    profile.to_file('PV_Data.html')  # html file if widget not available

fig = px.scatter(data, x="Ambient Temperature (C)",
                 y="Cell Temperature (C)")
fig.show()
sns.pairplot(data)

plt.figure()
x = data['Ambient Temperature (C)']
y = data['Cell Temperature (C)']
plt.scatter(x, y)
# \u00b0 is the degree sign; the escape keeps the labels correct even if the
# file is re-saved with a different encoding.
plt.xlabel('Ambient Temperature (\u00b0C)')
plt.ylabel('Cell Temperature (\u00b0C)')
plt.show()