In [None]:
from clickhouse_driver import Client
import pandas as pd

# Open a connection to the ClickHouse server on localhost, database `metro`.
client = Client(database='metro', host='localhost')

# Fetch every row of the `station` table.
query = "SELECT * FROM station"
data = client.execute(query)

# Labels are applied by position, so this list has to follow the column order
# that `SELECT *` returns.
# NOTE(review): the table's actual column order is not visible here - confirm.
station_columns = [
    'C/A', 'Unit', 'SCP', 'Station', 'Date', 'Time',
    'Description', 'Entries', 'Exits', 'datetime', 'turnstile',
]
df = pd.DataFrame(data=data, columns=station_columns)

# Preview the first rows as the cell output.
df.head()

In [None]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd



# Data preparation for the dashboard.
# Build one timestamp per reading from the separate date and time columns. Both
# parts are cast to text so the concatenation also works when `Time` does not
# hold plain strings.
df['datetime'] = pd.to_datetime(df['Date'].astype(str) + ' ' + df['Time'].astype(str))

# The counters are differenced per turnstile, so consecutive rows must be in
# time order. The diff is taken over a chronologically sorted view; assigning
# the result back aligns it on the index labels, so the row order of `df`
# itself is left untouched (this relies on `df` having a unique index).
chronological = df.sort_values('datetime', kind='stable')
readings_by_turnstile = chronological.groupby('turnstile')

# The first reading of each turnstile has no predecessor (NaN -> 0). Negative
# differences are clipped to 0 so they cannot turn `traffic` negative; the
# clustering cell further down treats such rows as invalid as well.
# NOTE(review): assumes Entries/Exits are cumulative counters - confirm.
df['entries_diff'] = readings_by_turnstile['Entries'].diff().fillna(0).clip(lower=0)
df['exits_diff'] = readings_by_turnstile['Exits'].diff().fillna(0).clip(lower=0)

# Total movement through the turnstile during the interval.
df['traffic'] = df['entries_diff'] + df['exits_diff']

# Create the Dash application.
app = dash.Dash(__name__)

# Distinct station names; they feed both the dropdown options and its
# initial selection (the first name in the array).
station_names = df['Station'].unique()
station_options = [{'label': name, 'value': name} for name in station_names]

# The date picker initially spans the full extent of the data.
first_day = df['datetime'].min().date()
last_day = df['datetime'].max().date()

# One graph component per figure returned by the callback.
graph_ids = (
    'traffic-graph',
    'capacity-graph',
    'top-stations-graph',
    'average-passengers-graph',
)

# Page layout: title, station selector, date range selector, four graphs.
app.layout = html.Div(children=[
    html.H1("Анализ данных станции метро"),
    html.Label("Выберите станцию:"),
    dcc.Dropdown(
        id='station-dropdown',
        options=station_options,
        value=station_names[0],
    ),
    html.Label("Выберите временной промежуток:"),
    dcc.DatePickerRange(
        id='date-picker',
        start_date=first_day,
        end_date=last_day,
    ),
    *(dcc.Graph(id=graph_id) for graph_id in graph_ids),
])

@app.callback(
    [Output('traffic-graph', 'figure'),
     Output('capacity-graph', 'figure'),
     Output('top-stations-graph', 'figure'),
     Output('average-passengers-graph', 'figure')],
    [Input('station-dropdown', 'value'),
     Input('date-picker', 'start_date'),
     Input('date-picker', 'end_date')]
)
def update_graphs(selected_station, start_date, end_date):
    """Rebuild the four dashboard figures for the chosen station and period.

    Args:
        selected_station: Current value of the station dropdown.
        start_date: Start of the selected range, or None when the picker is
            cleared (treated as "no lower bound").
        end_date: End of the selected range, or None when the picker is
            cleared (treated as "no upper bound"). The whole end day is
            included.

    Returns:
        Tuple of figures in the order of the declared outputs: station load,
        capacity, top stations, average passengers.
    """
    # Build the row filter step by step so a missing bound is simply skipped
    # instead of being compared against None.
    in_scope = df['Station'] == selected_station
    if start_date is not None:
        in_scope &= df['datetime'] >= pd.to_datetime(start_date)
    if end_date is not None:
        # A bare date parses to midnight; comparing with `<=` would drop every
        # reading taken later on the end day. Use the start of the following
        # day as an exclusive bound instead.
        end_exclusive = pd.to_datetime(end_date).normalize() + pd.Timedelta(days=1)
        in_scope &= df['datetime'] < end_exclusive

    # Sort by time because px.line joins points in row order. sort_values
    # returns a new frame, so later changes never touch the global `df`.
    filtered_df = df[in_scope].sort_values('datetime')

    # 1. Station load: traffic relative to the largest single traffic value in
    # the whole data set. Guard the division for an all-zero or empty data set.
    max_traffic = df['traffic'].max()
    if pd.notna(max_traffic) and max_traffic > 0:
        load_percent = filtered_df['traffic'] / max_traffic * 100
    else:
        load_percent = 0.0
    filtered_df = filtered_df.assign(load_percent=load_percent)
    load_fig = px.line(filtered_df, x='datetime', y='load_percent', title='Загруженность станции (%)')

    # 2. Capacity: raw traffic over time for the selection.
    capacity_fig = px.line(filtered_df, x='datetime', y='traffic', title='Пропускная способность станции')

    # 3. Top stations: ten largest traffic totals over the whole data set
    # (independent of the current selection).
    top_stations = df.groupby('Station')['traffic'].sum().nlargest(10).reset_index()
    top_stations_fig = px.bar(top_stations, x='Station', y='traffic', title='Топ загруженных станций')

    # 4. Average passengers: mean traffic of the selection, shown in the title.
    avg_passengers = filtered_df['traffic'].mean()
    avg_passengers_fig = px.line(filtered_df, x='datetime', y='traffic', title=f'Среднее количество пассажиров: {avg_passengers:.2f}')

    return load_fig, capacity_fig, top_stations_fig, avg_passengers_fig

# Start the Dash development server when this code runs as the main module
# (a notebook kernel also executes cells with __name__ == '__main__').
if __name__ == '__main__':
    # debug=True would also enable the Flask auto-reloader, which restarts the
    # process by re-executing the script and therefore does not work from a
    # notebook kernel. It is switched off; the other debug tools stay on.
    app.run_server(debug=True, use_reloader=False)

In [None]:
from clickhouse_driver import Client
import pandas as pd

# Read the `station` table again so this part of the notebook starts from a
# newly built frame carrying only the eleven labelled source columns.
client = Client(database='metro', host='localhost')
query = "SELECT * FROM station"
data = client.execute(query)
df = pd.DataFrame(
    data,
    columns=[
        'C/A', 'Unit', 'SCP', 'Station', 'Date', 'Time', 'Description',
        'Entries', 'Exits', 'datetime', 'turnstile',
    ],
)

# Parse the `datetime` column into pandas timestamps.
df['datetime'] = pd.to_datetime(df['datetime'])

# Calendar features derived from the timestamp.
df['hour'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek

# Entries/exits per interval: difference between consecutive readings of the
# same turnstile. The rows must be in time order for that, so the diff is taken
# over a chronologically sorted view; assigning the result back aligns it on
# the index labels and leaves the row order of `df` as it was (this relies on
# `df` having a unique index). The first reading of a turnstile gets 0.
chronological = df.sort_values('datetime', kind='stable')
readings_by_turnstile = chronological.groupby('turnstile')
df['entry_diff'] = readings_by_turnstile['Entries'].diff().fillna(0)
df['exit_diff'] = readings_by_turnstile['Exits'].diff().fillna(0)

# Keep only rows where both differences are non-negative. The explicit copy
# makes the result an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning.
df = df[(df['entry_diff'] >= 0) & (df['exit_diff'] >= 0)].copy()

# Preview the prepared data as the cell output.
df.head()

In [None]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns

def _score_clustering(points, labels):
    """Return (silhouette, Davies-Bouldin) for a labelling, ignoring noise.

    Points labelled -1 (the label DBSCAN gives to noise) are excluded so that
    noise is not scored as if it were a cluster. Both metrics are undefined
    unless 2 <= number of clusters < number of points; in that case the
    worst-case sentinels (-inf, +inf) are returned instead of letting
    scikit-learn raise ValueError. -inf (rather than NaN) keeps later `>`
    comparisons between silhouette scores meaningful.

    Args:
        points: 2-D array of scaled feature rows.
        labels: 1-D array of cluster labels, one per row of `points`.

    Returns:
        Tuple (silhouette score, Davies-Bouldin score).
    """
    clustered = labels != -1
    n_clusters = len(set(labels[clustered]))
    n_points = int(clustered.sum())
    if not 2 <= n_clusters < n_points:
        return float('-inf'), float('inf')
    if not clustered.all():
        points, labels = points[clustered], labels[clustered]
    return silhouette_score(points, labels), davies_bouldin_score(points, labels)


# Clustering inputs, standardised so every feature has a comparable scale.
features = df[['hour', 'day_of_week', 'entry_diff', 'exit_diff']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# NOTE(review): agglomerative clustering and the silhouette score work on
# pairwise comparisons, so this cell may be very slow or memory-hungry on a
# large table - consider subsampling if that turns out to be the case.

# K-Means
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(scaled_features)

# DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=10)
dbscan_labels = dbscan.fit_predict(scaled_features)

# Hierarchical (agglomerative) clustering
agg_clustering = AgglomerativeClustering(n_clusters=3)
agg_labels = agg_clustering.fit_predict(scaled_features)

# Attach the labels. `assign` builds a new frame instead of writing into `df`
# in place, which avoids SettingWithCopyWarning when `df` is a filtered
# selection of another frame.
df = df.assign(
    kmeans_cluster=kmeans_labels,
    dbscan_cluster=dbscan_labels,
    agg_cluster=agg_labels,
)

# Clustering quality (silhouette: higher is better; Davies-Bouldin: lower is
# better).
kmeans_silhouette, kmeans_db = _score_clustering(scaled_features, kmeans_labels)
dbscan_silhouette, dbscan_db = _score_clustering(scaled_features, dbscan_labels)
agg_silhouette, agg_db = _score_clustering(scaled_features, agg_labels)

# Report the results.
print(f"K-Means Silhouette Score: {kmeans_silhouette}, Davies-Bouldin Score: {kmeans_db}")
print(f"DBSCAN Silhouette Score: {dbscan_silhouette}, Davies-Bouldin Score: {dbscan_db}")
print(f"Agglomerative Clustering Silhouette Score: {agg_silhouette}, Davies-Bouldin Score: {agg_db}")


In [None]:
# Clustering inputs: hour, weekday and the per-interval entry/exit differences.
feature_columns = ['hour', 'day_of_week', 'entry_diff', 'exit_diff']
features = df[feature_columns]

# Fit a StandardScaler on the selected columns and transform them in one step.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [None]:
# K-Means with three clusters and a fixed random seed. `fit` returns the
# estimator itself, and the label of every row is read from `labels_`.
kmeans = KMeans(random_state=42, n_clusters=3).fit(scaled_features)
kmeans_labels = kmeans.labels_

# Show the first rows of the frame.
print(df.head())

In [None]:
# DBSCAN with eps=0.5 and min_samples=10 on the scaled features; the label of
# every row is read from the fitted estimator.
dbscan = DBSCAN(min_samples=10, eps=0.5).fit(scaled_features)
dbscan_labels = dbscan.labels_

In [None]:
# Hierarchical (agglomerative) clustering into three clusters; the label of
# every row is read from the fitted estimator.
agg_clustering = AgglomerativeClustering(n_clusters=3).fit(scaled_features)
agg_labels = agg_clustering.labels_

In [None]:
# Attach the K-Means labels as a column. `assign` builds a new frame instead
# of writing into `df` in place, so no SettingWithCopyWarning is raised when
# `df` is itself a filtered selection of another frame.
df = df.assign(kmeans_cluster=kmeans_labels)

# DBSCAN and agglomerative labels are left disabled in this cell:
# df = df.assign(dbscan_cluster=dbscan_labels)
# df = df.assign(agg_cluster=agg_labels)

# Preview the frame with the new column as the cell output.
df.head()

In [None]:
# Score the K-Means labelling with the silhouette and Davies-Bouldin metrics,
# then report both.
kmeans_silhouette = silhouette_score(X=scaled_features, labels=kmeans_labels)
kmeans_db = davies_bouldin_score(X=scaled_features, labels=kmeans_labels)

print(f"K-Means Silhouette Score: {kmeans_silhouette}, Davies-Bouldin Score: {kmeans_db}")

# The same evaluation for DBSCAN and agglomerative clustering is left disabled
# in this cell:
# dbscan_silhouette = silhouette_score(scaled_features, dbscan_labels)
# dbscan_db = davies_bouldin_score(scaled_features, dbscan_labels)
# print(f"DBSCAN Silhouette Score: {dbscan_silhouette}, Davies-Bouldin Score: {dbscan_db}")
#
# agg_silhouette = silhouette_score(scaled_features, agg_labels)
# agg_db = davies_bouldin_score(scaled_features, agg_labels)
# print(f"Agglomerative Clustering Silhouette Score: {agg_silhouette}, Davies-Bouldin Score: {agg_db}")

In [None]:
# Scatter plot of hour against the per-interval entry difference, coloured by
# K-Means cluster.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x='hour',
    y='entry_diff',
    hue='kmeans_cluster',
    palette='viridis',
    ax=ax,
)
ax.set_title('K-Means Clustering')
plt.show()

# Disabled: the same plot for the DBSCAN labels.
# plt.figure(figsize=(12, 6))
# sns.scatterplot(data=df, x='hour', y='entry_diff', hue='dbscan_cluster', palette='viridis')
# plt.title('DBSCAN Clustering')
# plt.show()
#
# Disabled: the same plot for the agglomerative labels.
# plt.figure(figsize=(12, 6))
# sns.scatterplot(data=df, x='hour', y='entry_diff', hue='agg_cluster', palette='viridis')
# plt.title('Agglomerative Clustering')
# plt.show()

In [None]:
# Per-cluster mean of every clustering feature, one table per algorithm.
# The tables are computed from the DataFrame `df`, which carries the cluster
# label columns; `data` is the raw result of `client.execute` and has no
# `groupby` method.
profile_columns = ['hour', 'day_of_week', 'entry_diff', 'exit_diff']

# K-Means cluster profile
kmeans_analysis = df.groupby('kmeans_cluster')[profile_columns].mean()
print(kmeans_analysis)

# DBSCAN cluster profile
dbscan_analysis = df.groupby('dbscan_cluster')[profile_columns].mean()
print(dbscan_analysis)

# Agglomerative clustering profile
agg_analysis = df.groupby('agg_cluster')[profile_columns].mean()
print(agg_analysis)

In [None]:
# Pick the algorithm by silhouette score. K-Means wins only when it is strictly
# above both other scores; otherwise DBSCAN wins when it is strictly above the
# agglomerative score; in every remaining case (ties included) the
# agglomerative result is chosen.
if kmeans_silhouette > max(dbscan_silhouette, agg_silhouette):
    best_algorithm = 'K-Means'
elif dbscan_silhouette > agg_silhouette:
    best_algorithm = 'DBSCAN'
else:
    best_algorithm = 'Agglomerative Clustering'

print(f"The best clustering algorithm is: {best_algorithm}")