In [14]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go

406 observations on 8 features:

1. MPG (miles per gallon)
2. cylinders
3. engine displacement (cu. inches)
4. horsepower
5. vehicle weight (lbs.)
6. time to accelerate from 0 to 60 mph (sec.)
7. model year (modulo 100)
8. origin of car (
          1. American,
          2. European,
          3. Japanese
        ).

Feature labels: mpg cys dis hor wei acc myr ori

In [15]:
# Module-level state shared by the loader functions below.
# Column names, in the order the fields appear in each data row.
labels = ['mpg', 'cys', 'dis', 'hor', 'wei', 'acc', 'myr', 'ori']
# Parsed rows (lists of strings) and per-column distinct-value counts.
data, dim = [], []
# Row / column counts of `data`; filled in by init().
row = col = 0
# Placeholder frame; replaced by init().
df = pd.DataFrame()

## 数据读入模块

In [16]:
# Load the car data set
def read_data(mode: str = 'r'):
    """Parse ``dataset/car.txt`` into the module-level ``data`` list.

    Every line up to and including the header line (the one whose first
    token is ``mpg``) is descriptive text and is skipped. Each following
    non-blank line is split on whitespace into string fields, ``NA`` is
    replaced by the sentinel string ``'-1'``, and the resulting list is
    appended to ``data`` as one row.

    ``data`` is emptied first, so calling this function again (for example
    by re-running a notebook cell) reloads the file instead of duplicating
    its rows.

    Args:
        mode: File mode handed to ``open``; defaults to text read mode.
    """
    data.clear()
    in_table = False  # becomes True once the header line has been seen

    # The context manager guarantees the file handle is released.
    with open('dataset/car.txt', mode) as data_set:
        for line_raw in data_set:
            # split() without arguments collapses runs of whitespace and
            # drops the trailing newline, so no positional trimming of the
            # last field is needed.
            fields = line_raw.split()
            if not fields:
                continue  # blank line

            # Discard all descriptive text; keep only the data section.
            if fields[0] == 'mpg':
                in_table = True
                continue

            if in_table:
                data.append(['-1' if item == 'NA' else item for item in fields])
    # At this point every data row has been read.

In [17]:
# Count the number of distinct values of each attribute
def dim_num():
    """Store the per-column distinct-value counts in the module-level ``dim``.

    Reads the globals ``data``, ``row`` and ``col``: for each of the ``col``
    columns, the values of the first ``row`` rows are collected into a set
    and the set's size is recorded.

    ``dim`` is overwritten in place rather than appended to, so repeated
    calls do not leave stale or duplicated counts behind.
    """
    dim[:] = [
        len({data[i][j] for i in range(row)})
        for j in range(col)
    ]

In [18]:
# Assemble the parsed rows into a DataFrame
def make_dt():
    """Return the rows in the global ``data`` as a float-typed DataFrame.

    Columns are named after the global ``labels`` list.
    """
    frame = pd.DataFrame(data=data, dtype=float, columns=labels)
    return frame

In [19]:
# One-shot initialisation entry point
def init():
    """Load the data file and populate the globals ``row``, ``col``, ``dim`` and ``df``.

    Steps, in order: read the file into ``data``, record its row and column
    counts, compute the per-column distinct-value counts, then build the
    DataFrame.
    """
    global row, col, df

    read_data()
    row = len(data)
    col = len(data[0])  # width is taken from the first row
    dim_num()
    df = make_dt()

In [20]:
# Compute the maximum of each column.
# NOTE(review): init() is first called in a later cell, so when the notebook
# is run top to bottom `df` is still the empty DataFrame created in the setup
# cell and this expression yields an empty Series.
df.max()

Series([], dtype: float64)

In [21]:
# Compute the minimum of each column.
# NOTE(review): `df` has not been populated by init() at this point when the
# notebook is run in order, so the result is an empty Series.
df.min()

Series([], dtype: float64)

## 绘图部分

In [22]:
# Run the loader (fills the globals data, row, col, dim and df) and print the
# resulting DataFrame.
if __name__ == '__main__':
    init()
    print(df)

      mpg  cys    dis    hor     wei   acc   myr  ori
0    18.0  8.0  307.0  130.0  3504.0  12.0  70.0  1.0
1    15.0  8.0  350.0  165.0  3693.0  11.5  70.0  1.0
2    18.0  8.0  318.0  150.0  3436.0  11.0  70.0  1.0
3    16.0  8.0  304.0  150.0  3433.0  12.0  70.0  1.0
4    17.0  8.0  302.0  140.0  3449.0  10.5  70.0  1.0
..    ...  ...    ...    ...     ...   ...   ...  ...
401  27.0  4.0  140.0   86.0  2790.0  15.6  82.0  1.0
402  44.0  4.0   97.0   52.0  2130.0  24.6  82.0  2.0
403  32.0  4.0  135.0   84.0  2295.0  11.6  82.0  1.0
404  28.0  4.0  120.0   79.0  2625.0  18.6  82.0  1.0
405  31.0  4.0  119.0   82.0  2720.0  19.4  82.0  1.0

[406 rows x 8 columns]


In [23]:
# Parallel-coordinates plot over every feature in `labels`, with each line
# coloured by the `wei` (vehicle weight) column.
fig = px.parallel_coordinates(
    data_frame=df,
    dimensions=labels,
    color="wei",
    color_continuous_scale=px.colors.diverging.Tealrose,
    title="The 4th Data Visualization Assignment",
)
fig.show()

In [24]:
# Same kind of plot built with graph_objects: one axis per feature, lines
# coloured by the `hor` (horsepower) column on the sequential Oranges scale.
fig = go.Figure(
    data=go.Parcoords(
        line={
            "color": df["hor"],
            "colorscale": px.colors.sequential.Oranges,
        },
        # `name` rather than `col` so the module-level `col` is not shadowed.
        dimensions=[{"label": name, "values": df[name]} for name in labels],
    )
)

fig.update_layout(title="The 4th Data Visualization Assignment")

fig.show()

In [25]:
# One axis per feature, lines coloured by the `ori` (origin) column using a
# three-stop green / blue / red colour scale.
fig = go.Figure(
    data=go.Parcoords(
        line={
            "color": df["ori"],
            "colorscale": [
                [0, 'green'],
                [0.5, 'rgb(0, 0, 255)'],
                [1.0, 'red'],
            ],
        },
        # `name` rather than `col` so the module-level `col` is not shadowed.
        dimensions=[{"label": name, "values": df[name]} for name in labels],
    )
)

fig.update_layout(title="The 4th Data Visualization Assignment")

fig.show()

根据题目中的索引，`ori`项所对应的产地分别为:  
    1. American,  
    2. European,  
    3. Japanese;  

## 数据处理部分

### 维度重排

- 【定义】局外数据点：在两个相邻维度之间，和周围的数据点孤立，有一定距离的点，称为局外数据点。

  因为这种局外数据点不遵循数据集的整体模式。有时我们能通过这种局外数据点得到额外的信息。但这里我们认为局外数据点混淆了平行坐标的表示，我们要排列维度使得局外数据点的影响最小。

### 数据点着色

### 聚类方法

### 抽样

### 平行集合（Parallel Sets）

In [26]:
# Hand-tuned parallel-coordinates plot: every axis gets an explicit range and
# a readable label, and lines are coloured by horsepower.
fig = go.Figure(
    data=go.Parcoords(
        line=dict(
            color=df['hor'],
            # Two-stop scale: the low end of the colour domain is green, the
            # high end is red.
            colorscale=[
                [0, 'rgb(0, 255, 0)'],
                [1.0, 'rgb(225, 0, 0)'],
            ],
            showscale=True,
            # cmin is the lower and cmax the upper bound of the colour
            # domain; they match the range of the Horsepower axis below.
            cmin=10,
            cmax=250,
        ),
        dimensions=[
            dict(
                range=[0, 8],
                # Initial selection band around 4 on the cylinders axis.
                constraintrange=[3.5, 4.5],
                label='Cylinders',
                values=df['cys'],
            ),
            dict(
                range=[0, 50],
                label="Miles/gallon",
                values=df['mpg'],
            ),
            dict(
                range=[50, 500],
                label='Engine Displacement',
                values=df['dis'],
            ),
            dict(
                range=[10, 250],
                visible=True,
                label='Horsepower',
                values=df['hor'],
            ),
            dict(
                range=[1500, 5200],
                label='Vehicle Weight (lbs.)',
                values=df['wei'],
            ),
            dict(
                range=[5, 25],
                label='Time to Accelerate(s)',
                values=df['acc'],
            ),
            dict(
                range=[70, 82],
                visible=True,
                label='Model Year',
                values=df['myr'],
            ),
            dict(
                # Show the origin names instead of the numeric codes 1-3.
                tickvals=[1, 2, 3],
                ticktext=['American', 'European', 'Japanese'],
                label='Origin of Car',
                values=df['ori'],
            ),
        ],
    )
)
fig.show()
# Also save a standalone interactive copy of the figure.
fig.write_html("result/test_advanced.html")