In [None]:
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Exploratory Data Analysis\n",
        "This notebook performs exploratory data analysis (EDA) on the dataset to understand its structure and characteristics."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Import necessary libraries\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "%matplotlib inline"
      ],
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Load the dataset: whitespace-separated integer observations.\n",
        "# An explicit encoding keeps the read independent of the platform default.\n",
        "file_path = '../data/raw/train.txt'\n",
        "with open(file_path, 'r', encoding='utf-8') as file:\n",
        "    tokens = file.read().split()  # split() already ignores leading/trailing whitespace\n",
        "data = [int(token) for token in tokens]"
      ],
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Wrap the observations in a single-column DataFrame and preview the first rows\n",
        "df = pd.DataFrame({'observation': data})\n",
        "df.head()"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "execute_result",
          "metadata": {},
          "data": {
            "text/plain": [
              "   observation\n",
              "0            0\n",
              "1            1\n",
              "2            2\n",
              "3            1\n",
              "4            0"
            ],
            "text/html": [
              "\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>observation</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>2</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n"
            ]
          },
          "execution_count": 3
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Plot how often each observation value occurs, ordered by value.\n",
        "# Drawing on an explicit Axes avoids relying on pyplot's implicit current figure.\n",
        "fig, ax = plt.subplots(figsize=(10, 6))\n",
        "df['observation'].value_counts().sort_index().plot(kind='bar', ax=ax)\n",
        "ax.set_title('Distribution of Observations')\n",
        "ax.set_xlabel('Observation')\n",
        "ax.set_ylabel('Frequency')\n",
        "plt.show()"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "execute_result",
          "metadata": {},
          "data": {
            "text/plain": [],
            "text/html": []
          },
          "execution_count": 4
        },
        {
          "output_type": "display_data",
          "metadata": {},
          "data": {
            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcwAAAGUCAYAAABrLPsJAAAgAElEQVR4Xu2deWwc13k/f9+72ffmNAsmoVJJmZ1SUS1SStTKUjP4x9XyNxdUNCMkp1UTZfKhKl1SKKhSRmlxpZJaDpUqJXFEjZoGJqwpAqDEiIfKkQEAiQAYIAgIoAAALwOPOcbnXvr59+mhu3/Da5k5ksstO1z52zO+f8+dNe5/XN+d7HnM4y9ZaZIAggCAyBiIADANMAAIlAABVAAAAeZ6JQAA+egDUQAAIIAAMAIgABUAABUgAOVAAAggAAMAiAANoDAPT1/b5Oltbi0WkGk3+69zKCAu1oQAOgAA9gAACe1ACaAAL4AAItAAm4AAL6AAIWgAn8AANIBAm4AAI+gALaAAL6AALKAAK+AADKAAI+AADKAAO7r5YHGBX1BwGwp4HwNAANMAAm6ArNi2lgAAj9rGZmZarW2INbW1g6+urd2yTNO5c+SABeWlruufPnyMHDk+k+3t7Z/99FOy+98v5PXrG2GNAArLZ0oK0tyeYXVlgUHeAygPqSKABKQD1rDbbYpIsY8QQAeTAFZgM1YBUmA9aw21uKwIx+gIbYW2+a1tyfqSBqKAIYA/VBLaSXAohqBg0wdKMOmjrCAIsAwVhTkgwIQUoClgzlrG5ntEYAJJ7Ldgk8C6tkBqNiOpal5AUC6gG2vXoMoNf2pqZGtxeAR6y2NIGhyUS2Bll+3dnv0VdA7T0lg0mJ60uBwO4oB5aRNFgak8+zCIIsM7RyHD/zzZxcACwFwiq5trECwb2F0gLVgMZvl0kAAKzNZmrm7t27Fx8fj2XLli2y9apVbNp0iV27dqV/PfvXwzDvwE0gIBroAG9g6Vfzyy+EEDkAv1AUgINMAAum7Xb5+fmkSQDDqk7CAmxUxaP7YBwHro/LaI+2HeS3ZkHHnnpKefOnTtz5swKio3NhQAoAHsAAtwz5oLqWlkEQLGA/VBSoC0AAtwza5rbFFYABDUyWBBoA7gINQBaZA9b5taMFLv29CfpRBmbOlF6zfdPtBBGiy6ECY4ALmgtFlp8AWmAg1AFpkD1vg1o4AWAAHce5gAyVwIA74bUVB7VJwiJAKYAIWgAznAY1AFpkD1vptk0u8AMMAQaAZaDBoaYZTKtZcvHr1yoxM0er0b8QlMJYOAFpkD1vgWpv0BAOZB10UAIpgAlkAAokAFmQPW+D2n9wAApgAlkAAokAFmQPW+bS1kAAoAcA1AUg0msTTCiAClgBWkAwpgA1AFpIB6TF6AKmA8pgAlkAAokAFmQPa+w0kAAoQAtgAVpAMpIAlkAAokAFmQPa+w6kAAoQAtgAVpAMpIAlkAAokAFmQPaew9kAAoQAtgAVpAMpIAlkAAokAFmQPaex6kAAoQAtgAVpAMpIAlkAAokAFmQPaew+kgAAoQAtgAVpAMpIAlkAAokAFmQPaex7kgAAoQAtgAVpAMpIAlkAAokAFmQPaew/0gAAoQAtgAVpAMpIAlkAAokAFmQPaex8kgAAoQAtgAVpAMpIAlkAAokAFmQPaewAkAAoQAtgAVpAMpIAlkAAokAFmQPaez2kgAAoQAtgAVpAMpIAlkAAokAFmQPa"
          }
        },
        {
          "output_type": "display_data",
          "metadata": {},
          "data": {
            "image/png": "base64_encoded_image_data_here"
          }
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Summary Statistics\n",
        "Let's calculate some summary statistics for the observations."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Descriptive statistics of the observation column (count, mean, std, min, quartiles, max)\n",
        "summary_stats = df.loc[:, 'observation'].describe()\n",
        "summary_stats"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "execute_result",
          "metadata": {},
          "data": {
            "text/plain": [
              "count    5.000000\n",
              "mean     0.800000\n",
              "std      0.836660\n",
              "min      0.000000\n",
              "25%      0.000000\n",
              "50%      1.000000\n",
              "75%      1.000000\n",
              "max      2.000000\n",
              "Name: observation, dtype: float64"
            ],
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>observation</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>count</th>\n",
              "      <td>5.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>mean</th>\n",
              "      <td>0.800000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>std</th>\n",
              "      <td>0.836660</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>min</th>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>25%</th>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>50%</th>\n",
              "      <td>1.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>75%</th>\n",
              "      <td>1.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>max</th>\n",
              "      <td>2.000000</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ]
          },
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Observation Trends\n",
        "Let's look at the trends in the observation data over time."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Plot each observation against its position in the sequence.\n",
        "# Drawing on an explicit Axes avoids relying on pyplot's implicit current figure.\n",
        "fig, ax = plt.subplots(figsize=(10, 6))\n",
        "ax.plot(df['observation'])\n",
        "ax.set_title('Observations Over Time')\n",
        "ax.set_xlabel('Time')\n",
        "ax.set_ylabel('Observation')\n",
        "plt.show()"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "display_data",
          "metadata": {},
          "data": {
            "image/png": "base64_encoded_image_data_here"
          }
        }
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
