<a href="https://colab.research.google.com/github/AbinDavis101/Beijing-Weather-App/blob/main/st20316388_CMP7005_PRAC1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install streamlit



In [2]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.graph_objects as go
import numpy as np

st.set_page_config(
    page_title="Beijing Air Quality Analysis",
    page_icon="🌬️",
    layout="wide"
)
st.title("Beijing Air Quality Analysis")
st.markdown("""
This application analyzes air quality data from various monitoring stations in Beijing, China.
The dataset covers the period from March 1st, 2013 to February 28th, 2017 and includes both
air pollutants and meteorological conditions.
""")
# navigation sidebar
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", ["Data Overview", "Exploratory Data Analysis", "Model Evaluation"])


df = pd.read_csv('combined_df.csv')


def preprocess_data(df):

    processed_df = df.copy()



    numerical_cols = processed_df.select_dtypes(include=['float64', 'int64']).columns
    for col in numerical_cols:
        processed_df[col] = processed_df[col].fillna(processed_df[col].median())


    categorical_cols = processed_df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        processed_df[col] = processed_df[col].fillna(processed_df[col].mode()[0])


    processed_df['datetime'] = pd.to_datetime(processed_df[['year', 'month', 'day', 'hour']])


    processed_df['season'] = processed_df['month'].apply(
        lambda x: 'Winter' if x in [12, 1, 2] else
                  'Spring' if x in [3, 4, 5] else
                  'Summer' if x in [6, 7, 8] else 'Autumn'
    )


    processed_df['wd'] = processed_df['wd'].astype('category')


    processed_df = processed_df.drop_duplicates()

    return processed_df

# Function for building a model (kept for model evaluation only)
def build_model(X_train, X_test, y_train, y_test):
          # Scale features
          scaler = StandardScaler()
          X_train_scaled = scaler.fit_transform(X_train)
          X_test_scaled = scaler.transform(X_test)

         # Define models
          models = {
              'Linear Regression': LinearRegression(),
              'Random Forest': RandomForestRegressor(random_state=42)
          }

          results = {}
          for name, model in models.items():
              # Train model
              model.fit(X_train_scaled, y_train)

             # Make predictions
              y_pred = model.predict(X_test_scaled)

              # Evaluate
              mse = mean_squared_error(y_test, y_pred)
              rmse = np.sqrt(mse)
              r2 = r2_score(y_test, y_pred)

              results[name] = {
                  'model': model,
                  'mse': mse,
                  'rmse': rmse,
                  'r2': r2,
                  'predictions': y_pred
              }


          return results

if df is not None:

    processed_df = preprocess_data(df)
    # DATA OVERVIEW PAGE
    if page == "Data Overview":
        st.header("Data Overview")

        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Dataset Information")
            st.write(f"Number of records: {processed_df.shape[0]}")
            st.write(f"Number of features: {processed_df.shape[1]}")
            st.write(f"Time period: {processed_df['datetime'].min()} to {processed_df['datetime'].max()}")
            st.write(f"Number of stations: {processed_df['station'].nunique()}")
            st.write("Stations:", ", ".join(processed_df['station'].unique()))

        with col2:
            st.subheader("Data Types")
            st.write(processed_df.dtypes)

        st.subheader("Sample Data")
        st.dataframe(processed_df.head())

        st.subheader("Missing Values")
        missing_data = pd.DataFrame({
            'Missing Values': processed_df.isnull().sum(),
            'Percentage': (processed_df.isnull().sum() / len(processed_df)) * 100
        })
        st.dataframe(missing_data)

        # Visualize missing values
        st.subheader("Missing Values Visualization")
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.heatmap(processed_df.isnull(), yticklabels=False, cbar=False, cmap='viridis', ax=ax)
        st.pyplot(fig)

 # EXPLORATORY DATA ANALYSIS PAGE
    elif page == "Exploratory Data Analysis":
        st.header("Exploratory Data Analysis")


        st.sidebar.subheader("EDA Options")

        eda_choice = st.sidebar.selectbox(
            "Select Analysis Type",
            ["Univariate Analysis", "Bivariate Analysis", "Multivariate Analysis", "Time Series Analysis"]
        )

        if eda_choice == "Univariate Analysis":
            st.subheader("Univariate Analysis")

            variable = st.selectbox(
                "Select Variable",
                ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM']
            )

            col1, col2 = st.columns(2)

            with col1:
                st.subheader(f"Distribution of {variable}")
                fig = px.histogram(processed_df, x=variable, nbins=50, marginal="box")
                st.plotly_chart(fig)

            with col2:
                st.subheader(f"Statistics for {variable}")
                stats = processed_df[variable].describe()
                st.dataframe(stats)

                st.subheader(f"Box Plot for {variable}")
                fig = px.box(processed_df, y=variable)
                st.plotly_chart(fig)

        elif eda_choice == "Bivariate Analysis":
            st.subheader("Bivariate Analysis")

            x_var = st.selectbox(
                "Select X Variable",
                ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM'],
                index=0
            )

            y_var = st.selectbox(
                "Select Y Variable",
                ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM'],
                index=1
            )

            st.subheader(f"Scatter Plot: {x_var} vs {y_var}")
            fig = px.scatter(processed_df, x=x_var, y=y_var, color='station', opacity=0.6)
            st.plotly_chart(fig)

            st.subheader(f"Correlation between {x_var} and {y_var}")
            corr = processed_df[[x_var, y_var]].corr().iloc[0, 1]
            st.write(f"Correlation coefficient: {corr:.4f}")

            if st.checkbox("Show correlation by station"):
                for station in processed_df['station'].unique():
                    station_df = processed_df[processed_df['station'] == station]
                    corr = station_df[[x_var, y_var]].corr().iloc[0, 1]
                    st.write(f"{station}: {corr:.4f}")

        elif eda_choice == "Multivariate Analysis":
            st.subheader("Multivariate Analysis")

            st.subheader("Correlation Matrix")
            corr_vars = st.multiselect(
                "Select Variables for Correlation Matrix",
                ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM'],
                default=['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3']
            )

            if corr_vars:
                corr_matrix = processed_df[corr_vars].corr()
                fig = px.imshow(corr_matrix, text_auto=True, color_continuous_scale='RdBu_r')
                st.plotly_chart(fig)

            st.subheader("Pollutant Analysis by Station")
            selected_pollutant = st.selectbox(
                "Select Pollutant",
                ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3']
            )

            fig = px.box(processed_df, x='station', y=selected_pollutant, color='station')
            st.plotly_chart(fig)

            st.subheader("Seasonal Analysis")
            selected_season_var = st.selectbox(
                "Select Variable for Seasonal Analysis",
                ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3']
            )

            fig = px.box(processed_df, x='season', y=selected_season_var, color='season')
            st.plotly_chart(fig)

        elif eda_choice == "Time Series Analysis":
            st.subheader("Time Series Analysis")

            # Aggregate by day for better visualization
            daily_df = processed_df.groupby([processed_df['datetime'].dt.date, 'station']).agg({
                'PM2.5': 'mean',
                'PM10': 'mean',
                'SO2': 'mean',
                'NO2': 'mean',
                'CO': 'mean',
                'O3': 'mean',
                'TEMP': 'mean'
            }).reset_index()


            daily_df['datetime'] = pd.to_datetime(daily_df['datetime'])

            selected_stations = st.multiselect(
                "Select Stations for Time Series",
                processed_df['station'].unique(),
                default=[processed_df['station'].unique()[0]]
            )

            selected_var = st.selectbox(
                "Select Variable for Time Series",
                ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEMP']
            )

            # Filter data
            filtered_df = daily_df[daily_df['station'].isin(selected_stations)]

            # Create time series plot
            fig = px.line(filtered_df, x='datetime', y=selected_var, color='station',
                          title=f'{selected_var} Over Time')
            st.plotly_chart(fig)

            # Moving average
            if st.checkbox("Show moving average"):
                window_size = st.slider("Select Window Size (Days)", 7, 90, 30)

                for station in selected_stations:
                    station_data = filtered_df[filtered_df['station'] == station].copy()
                    station_data[f'{selected_var}_MA'] = station_data[selected_var].rolling(window=window_size).mean()

                    fig = px.line(station_data, x='datetime', y=[selected_var, f'{selected_var}_MA'],
                                  title=f'{selected_var} with {window_size}-Day Moving Average for {station}')
                    st.plotly_chart(fig)

  # MODEL EVALUATION PAGE (renamed "Modeling and Prediction")
    elif page == "Model Evaluation":

        st.header("Model Evaluation")


        st.sidebar.subheader("Model Options")

        target_var = st.sidebar.selectbox(
            "Select Target Variable to Evaluate",
            ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3'],
            index=0
        )

        selected_station = st.sidebar.selectbox(
            "Select Station for Modeling",
            processed_df['station'].unique()
        )


        station_df = processed_df[processed_df['station'] == selected_station].copy()


        feature_options = ['month', 'day', 'hour', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM',
                          'SO2', 'NO2', 'CO', 'O3', 'PM2.5', 'PM10']

        feature_options.remove(target_var)

        selected_features = st.sidebar.multiselect(
            "Select Features for Model",
            feature_options,
            default=['month', 'hour', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM']
        )

        if st.sidebar.button("Evaluate Models"):
            if len(selected_features) > 0:
                st.info("Evaluating models... This may take a moment.")

                # Prepare data
                X = station_df[selected_features]
                y = station_df[target_var]

                # Split data
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)







                # Build and evaluate models

                model_results = build_model(X_train, X_test, y_train, y_test)

                # Display results
                st.subheader("Model Performance")

                results_df = pd.DataFrame({
                    'Model': list(model_results.keys()),
                    'MSE': [res['mse'] for res in model_results.values()],
                    'RMSE': [res['rmse'] for res in model_results.values()],
                    'R²': [res['r2'] for res in model_results.values()]
                })

                st.dataframe(results_df)

                # Visualize predictions vs actual for the best model
                best_model_name = results_df.iloc[results_df['R²'].argmax()]['Model']
                best_model_results = model_results[best_model_name]

                st.subheader(f"Predictions vs Actual ({best_model_name})")

                fig = px.scatter(
                    x=y_test, y=best_model_results['predictions'],
                    labels={'x': 'Actual', 'y': 'Predicted'},
                    title=f"{best_model_name}: Actual vs Predicted {target_var}"
                )

                # Add perfect prediction line
                fig.add_trace(go.Scatter(
                    x=[y_test.min(), y_test.max()],
                    y=[y_test.min(), y_test.max()],
                    mode='lines',
                    name='Perfect Prediction',
                    line=dict(color='red', dash='dash')
                ))

                st.plotly_chart(fig)

                # Feature importance for Random Forest
                if 'Random Forest' in model_results:
                    rf_model = model_results['Random Forest']['model']
                    feature_importance = pd.DataFrame({
                        'Feature': selected_features,
                        'Importance': rf_model.feature_importances_
                    }).sort_values(by='Importance', ascending=False)

                    st.subheader("Feature Importance (Random Forest)")
                    fig = px.bar(feature_importance, x='Feature', y='Importance')
                    st.plotly_chart(fig)

Overwriting app.py


In [3]:
!wget -q -O - ipv4.icanhazip.com

34.59.187.219


In [None]:
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.59.187.219:8501[0m
[0m
[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0Kyour url is: https://public-mails-feel.loca.lt
2025-05-11 00:35:15.803 Serialization of dataframe to Arrow table was unsuccessful. Applying automatic fixes for column types to make the dataframe Arrow-compatible.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/streamlit/dataframe_util.py", line 822, in convert_pandas_df_to_arrow_bytes
    table = pa.Table.from_pandas(df)
            ^^^^^^^^^^^^^^^^^^^^^^^^
  File "pyarrow/table.pxi", line 4751, in pyarrow.lib.Table.from_pandas
  File "/usr/local/lib/python3.11/dist-packages/pyarrow/pandas_