# Project 1

<u>PREPARATION<u>

In [1]:
import pandas as pd
import plotly as px
import numpy as np
import csv

In [2]:
import plotly.io as pio

pio.renderers.default = "vscode+jupyterlab+notebook_connected"

Data used from https://www.kaggle.com/datasets/sudalairajkumar/daily-temperature-of-major-cities

A dataset from University of Dayton on daily temperature level of major cities.
In this project, data on Singapore will be used.

a. Read in the data

In [3]:
df = pd.read_csv("city_temperature_SG.csv")
df.head()

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,Asia,Singapore,,Singapore,1,1,1995,81.1
1,Asia,Singapore,,Singapore,1,2,1995,81.7
2,Asia,Singapore,,Singapore,1,3,1995,82.4
3,Asia,Singapore,,Singapore,1,4,1995,82.6
4,Asia,Singapore,,Singapore,1,5,1995,82.4


b.Compute:
    The mean
    The median
    The mode

In [4]:
mean_temp = df['AvgTemperature'].mean()

median_temp = df['AvgTemperature'].median()

mode_temp = df['AvgTemperature'].mode()
mode_values = ', '.join([f"{value:.1f}" for value in mode_temp]) # there's more than 1 mode
 
print(f"Mean: {mean_temp:.1f}")
print(f"Median: {median_temp:.1f}")
print(f"Mode: {mode_values}")

Mean: 81.7
Median: 82.3
Mode: 81.7, 83.5


Using only the Python standard library

Read in data, keep the header name the same as the original column ('AvgTemperature')

In [5]:
temperatures = []

with open("city_temperature_SG.csv", 'r') as file:
    header = file.readline().strip().split(',')
    temperature_index = header.index('AvgTemperature')  
    for line in file:
        row = line.strip().split(',')
        temperatures.append(float(row[temperature_index]))
temperatures

[81.1,
 81.7,
 82.4,
 82.6,
 82.4,
 80.0,
 77.7,
 79.4,
 81.5,
 80.2,
 78.6,
 81.4,
 81.7,
 80.7,
 81.4,
 81.2,
 82.2,
 80.6,
 74.1,
 79.0,
 79.6,
 79.5,
 77.7,
 80.8,
 79.6,
 80.8,
 80.6,
 80.9,
 80.5,
 81.5,
 79.5,
 80.8,
 79.6,
 77.2,
 75.5,
 75.6,
 75.8,
 79.0,
 80.2,
 80.5,
 80.5,
 81.8,
 79.9,
 76.9,
 81.5,
 81.8,
 82.4,
 80.1,
 81.8,
 81.1,
 77.9,
 78.8,
 80.6,
 81.4,
 81.3,
 76.2,
 80.6,
 81.2,
 81.4,
 79.9,
 77.4,
 80.8,
 81.6,
 81.7,
 81.6,
 81.8,
 81.5,
 81.8,
 82.1,
 82.8,
 80.2,
 82.2,
 82.3,
 83.2,
 83.5,
 83.1,
 81.7,
 82.3,
 83.6,
 82.8,
 81.0,
 82.0,
 84.5,
 81.4,
 82.1,
 80.0,
 83.6,
 80.3,
 80.4,
 81.4,
 81.7,
 84.0,
 84.7,
 81.3,
 83.3,
 82.1,
 81.6,
 83.2,
 84.3,
 82.1,
 81.5,
 81.5,
 84.2,
 82.7,
 83.2,
 83.5,
 81.5,
 84.1,
 82.0,
 82.0,
 79.2,
 83.6,
 84.4,
 82.3,
 81.7,
 80.3,
 82.4,
 79.4,
 83.3,
 82.1,
 80.5,
 82.3,
 80.7,
 81.1,
 81.9,
 83.8,
 82.9,
 82.5,
 82.3,
 81.3,
 81.7,
 85.6,
 85.2,
 85.5,
 85.9,
 84.3,
 86.1,
 84.0,
 84.1,
 81.7,
 82.7,
 85.0,
 83.2,

Find Mean:
    Divide sum of temperatures by count

In [6]:
mean_temp = sum(temperatures) / len(temperatures)
mean_temp

81.65440319447441

Find Median:
    Sort all temps
    Find median temp depend on if the count is odd or even number

In [7]:
sorted_temps = sorted(temperatures)
n = len(sorted_temps)
if n % 2 == 0:
    median_temp = (sorted_temps[n // 2 - 1] + sorted_temps[n // 2]) / 2
else:
    median_temp = sorted_temps[n // 2]
median_temp

82.3

Find Mode:
    Find unique temp
    then, count unique temp(s) with most occurences. 

In [8]:
frequency = {}
for temp in temperatures:
    if temp in frequency:
        frequency[temp] += 1
    else:
        frequency[temp] = 1

max_count = 0
modes = []
for temp, count in frequency.items():
    if count > max_count:
        max_count = count
        modes = [temp]  
    elif count == max_count:
        modes.append(temp)  
modes

[81.7, 83.5]

<u>VISUALIZATION<u>

Create a new annual average temperatures dataframe

In [9]:
average_df = df.groupby('Year', as_index=False)['AvgTemperature'].mean()
average_df['AvgTemperature'] = average_df['AvgTemperature'].round(2)
average_df

Unnamed: 0,Year,AvgTemperature
0,1995,81.76
1,1996,81.64
2,1997,82.96
3,1998,81.12
4,1999,81.11
5,2000,81.57
6,2001,81.71
7,2002,80.62
8,2003,81.58
9,2004,82.24


Turn the data in to list for visualisation input

In [10]:
temp_list = {
    'Year': average_df['Year'].tolist(),
    'AvgTemperature': average_df['AvgTemperature'].tolist()
}

temp_list

{'Year': [1995,
  1996,
  1997,
  1998,
  1999,
  2000,
  2001,
  2002,
  2003,
  2004,
  2005,
  2006,
  2007,
  2008,
  2009,
  2010,
  2011,
  2012,
  2013,
  2014,
  2015,
  2016,
  2017,
  2018,
  2019,
  2020],
 'AvgTemperature': [81.76,
  81.64,
  82.96,
  81.12,
  81.11,
  81.57,
  81.71,
  80.62,
  81.58,
  82.24,
  82.51,
  82.04,
  81.16,
  81.07,
  81.88,
  82.68,
  81.75,
  81.71,
  81.92,
  81.86,
  81.98,
  81.3,
  81.99,
  80.35,
  80.32,
  83.06]}

Crate a sparkline function:
    Because the temp hover around >80 degree (F; ~26 C), the sparkline is used to represent any 10 bps above 80 degree. For example, 1997 and 2020 are the hottest years on average.

In [11]:
def sparkline(temp_list):
    years_list = temp_list['Year']
    value_list = temp_list['AvgTemperature']

    sparkline = {}
    for i in range(len(years_list)):
        year = years_list[i]
        temp = value_list[i]
        
        if temp > 80:
            above_80 = (temp - 80) * 100  # if use for other dataset will need adjustment
            num_asterisks = int(above_80 / 10)
            sparkline[year] = '*' * num_asterisks
        else:
            sparkline[year] = ''
    
    return sparkline

chart = sparkline(temp_list)

for year, line in chart.items():
    print(f"{year}: {line}")


1995: *****************
1996: ****************
1997: *****************************
1998: ***********
1999: ***********
2000: ***************
2001: *****************
2002: ******
2003: ***************
2004: **********************
2005: *************************
2006: ********************
2007: ***********
2008: **********
2009: ******************
2010: **************************
2011: *****************
2012: *****************
2013: *******************
2014: ******************
2015: *******************
2016: ************
2017: *******************
2018: ***
2019: ***
2020: ******************************
