### Import neccessary libraries here

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

# Let's load our csv in...
### Because it is > 100 MB, we have to store it on our local machine instead of github

In [4]:
ct = pd.read_csv("C:/Users/conor/Desktop/city_temperature/city_temperature.csv")
ct.head(5)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,Africa,Algeria,,Algiers,1,1,1995,64.2
1,Africa,Algeria,,Algiers,1,2,1995,49.4
2,Africa,Algeria,,Algiers,1,3,1995,48.8
3,Africa,Algeria,,Algiers,1,4,1995,46.4
4,Africa,Algeria,,Algiers,1,5,1995,47.9


In [25]:
ct.tail(5)

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
2906322,North America,US,Additional Territories,San Juan Puerto Rico,7,27,2013,82.4
2906323,North America,US,Additional Territories,San Juan Puerto Rico,7,28,2013,81.6
2906324,North America,US,Additional Territories,San Juan Puerto Rico,7,29,2013,84.2
2906325,North America,US,Additional Territories,San Juan Puerto Rico,7,30,2013,83.8
2906326,North America,US,Additional Territories,San Juan Puerto Rico,7,31,2013,83.6


### First, lets see how many rows and columns we have.

In [7]:
#.shape is an attribute of DataFrames that returns (#rows, #columns)
ct.shape

(2906327, 8)

### Next, lets check for any null values in the rows.


In [5]:
#check if any cell in the column is a null value
ct.isna().any()

Region            False
Country           False
State              True
City              False
Month             False
Day               False
Year              False
AvgTemperature    False
dtype: bool

### It looks like we have NaN values in the 'State' column. We known how many rows exist in the data set so let's find out how many rows in 'State' are NaN

In [6]:
#sum the null values in each column
ct.isna().sum()

Region                  0
Country                 0
State             1450990
City                    0
Month                   0
Day                     0
Year                    0
AvgTemperature          0
dtype: int64

### Let's look at those rows where the State column is not null

In [11]:
#subset our ct datset by where the state column is not null
ct[ct['State'].notna()]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
1450990,North America,US,Alabama,Birmingham,1,1,1995,50.7
1450991,North America,US,Alabama,Birmingham,1,2,1995,37.2
1450992,North America,US,Alabama,Birmingham,1,3,1995,33.2
1450993,North America,US,Alabama,Birmingham,1,4,1995,33.3
1450994,North America,US,Alabama,Birmingham,1,5,1995,26.4
...,...,...,...,...,...,...,...,...
2906322,North America,US,Additional Territories,San Juan Puerto Rico,7,27,2013,82.4
2906323,North America,US,Additional Territories,San Juan Puerto Rico,7,28,2013,81.6
2906324,North America,US,Additional Territories,San Juan Puerto Rico,7,29,2013,84.2
2906325,North America,US,Additional Territories,San Juan Puerto Rico,7,30,2013,83.8


### Hmm it looks like both the head and tail of our subset are both North America... lets see if there are any areas where the region isn't North America and the State is not null

In [13]:
#subset for two conditions now (we need to enclose each condition in parentheses in order to do this)
ct[(ct['Region'] != 'North America') & (ct['State'].notna())]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature


### Next, let's make sure out Months, Days, and Years are all valid

In [15]:
#subset the dataset by rows where the month is between 1 and 12, then lets just return shape to make sure the # of rows is
#same as our original shape
ct[ct['Month'].isin(range(1,13))].shape

(2906327, 8)

In [20]:
#Let's do the same for days, making sure they are between 1 and 31
ct[ct['Day'].isin(range(1,32))].shape

(2906319, 8)

### Hmm... there are 8 rows where the day is not between 1 and 31. Let's take a closer look at those

In [22]:
#by placing a tilde (~) before a condition, it reverses the condition and only selects where the condition is false 
ct[~ct['Day'].isin(range(1,32))]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
82774,Africa,Guinea,,Conakry,3,0,2008,-99.0
85697,Africa,Guinea,,Conakry,3,0,2016,-99.0
92041,Africa,Guinea-Bissau,,Bissau,3,0,2008,-99.0
146077,Africa,Malawi,,Lilongwe,3,0,2012,-99.0
177862,Africa,Nigeria,,Lagos,3,0,2008,-99.0
241159,Africa,Uganda,,Kampala,3,0,2012,-99.0
1209901,North America,Mexico,,Guadalajara,3,0,2012,-99.0
1333910,South/Central America & Carribean,Cuba,,Havana,3,0,2008,-99.0


### Interesting. These days are equal to 0 and the Average Temperature is -99.0! We will revisit this after finishing our date time conditions.

In [26]:
#lets sort our dataframe by hear, not specifying tail or head so that we get a snapshot of the earliest and latest years
ct.sort_values('Year')

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
743771,Europe,Germany,,Hamburg,12,12,200,-99.0
845936,Europe,Norway,,Oslo,12,28,200,-99.0
845935,Europe,Norway,,Oslo,12,27,200,-99.0
845934,Europe,Norway,,Oslo,12,26,200,-99.0
845933,Europe,Norway,,Oslo,12,25,200,-99.0
...,...,...,...,...,...,...,...,...
2593753,North America,US,South Dakota,Sioux Falls,4,3,2020,23.5
2593752,North America,US,South Dakota,Sioux Falls,4,2,2020,44.3
2593751,North America,US,South Dakota,Sioux Falls,4,1,2020,52.4
794360,Europe,Ireland,,Dublin,1,10,2020,41.3


In [29]:
#We saw that the earliest year was 200... and mysteriously the average temperature for those was also -99.0!
ct[ct['Year'].isin(range(1995,2021))].shape

(2905887, 8)

In [30]:
#So there are lots of values missing now, lets take a look at those
ct[~ct['Year'].isin(range(1995,2021))]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
58178,Africa,Ethiopia,,Addis Ababa,12,3,201,-99.0
58179,Africa,Ethiopia,,Addis Ababa,12,4,201,-99.0
58180,Africa,Ethiopia,,Addis Ababa,12,5,201,-99.0
58181,Africa,Ethiopia,,Addis Ababa,12,6,201,-99.0
58182,Africa,Ethiopia,,Addis Ababa,12,7,201,-99.0
...,...,...,...,...,...,...,...,...
1212427,North America,Mexico,,Guadalajara,12,27,201,-99.0
1212428,North America,Mexico,,Guadalajara,12,28,201,-99.0
1212429,North America,Mexico,,Guadalajara,12,29,201,-99.0
1212430,North America,Mexico,,Guadalajara,12,30,201,-99.0


In [39]:
#Again, lots of -99.0 average temperatures, with lots in places that are notoriously hot. Lets see if there are any places
#where the year is out of range and the temperature isn't -99.0.
ct[(~ct['Year'].isin(range(1995,2021))) & (ct['AvgTemperature'] != -99.0)]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature


### So there are 440 rows where the year isn't in our desired range, and surpisingly the average temperature for all of those values is -99.0. Let's apply the same logic we did to our date time columns to our AvgTemperature columns to make sure those values are all logical.

In [42]:
ct.sort_values('AvgTemperature')

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
1377340,South/Central America & Carribean,Guyana,,Georgetown,8,21,2005,-99.0
88779,Africa,Guinea-Bissau,,Bissau,3,27,1999,-99.0
88778,Africa,Guinea-Bissau,,Bissau,3,26,1999,-99.0
88777,Africa,Guinea-Bissau,,Bissau,3,25,1999,-99.0
88776,Africa,Guinea-Bissau,,Bissau,3,24,1999,-99.0
...,...,...,...,...,...,...,...,...
1034963,Middle East,Kuwait,,Kuwait,8,2,2012,109.9
1036756,Middle East,Kuwait,,Kuwait,6,29,2017,109.9
1032390,Middle East,Kuwait,,Kuwait,7,17,2005,109.9
1036042,Middle East,Kuwait,,Kuwait,7,17,2015,109.9


### 110.0 Fahrenheit is a believable temperature but -99.0 Fahrenheit in August in Guyana? I'm not sure thats accurate. Let's now grab unique temperature values

In [49]:
#We create a set of values using .unique() and then sort it. The default sorting is ascending
sorted(ct['AvgTemperature'].unique())

[-99.0,
 -50.0,
 -49.1,
 -47.7,
 -46.9,
 -46.2,
 -45.9,
 -45.8,
 -45.6,
 -44.7,
 -44.3,
 -44.1,
 -43.8,
 -43.7,
 -43.5,
 -43.4,
 -43.3,
 -43.0,
 -42.8,
 -42.5,
 -42.4,
 -42.2,
 -42.1,
 -42.0,
 -41.8,
 -41.7,
 -41.6,
 -41.4,
 -41.2,
 -41.1,
 -40.9,
 -40.8,
 -40.7,
 -40.2,
 -40.0,
 -39.9,
 -39.8,
 -39.6,
 -39.5,
 -39.4,
 -39.3,
 -39.2,
 -39.1,
 -39.0,
 -38.9,
 -38.7,
 -38.5,
 -38.4,
 -38.3,
 -38.1,
 -38.0,
 -37.8,
 -37.6,
 -37.4,
 -37.2,
 -37.1,
 -37.0,
 -36.9,
 -36.7,
 -36.5,
 -36.4,
 -36.3,
 -36.2,
 -36.1,
 -36.0,
 -35.9,
 -35.7,
 -35.5,
 -35.4,
 -35.3,
 -35.2,
 -35.1,
 -35.0,
 -34.8,
 -34.7,
 -34.6,
 -34.4,
 -34.3,
 -34.2,
 -34.1,
 -34.0,
 -33.9,
 -33.8,
 -33.7,
 -33.4,
 -33.3,
 -33.2,
 -33.1,
 -33.0,
 -32.9,
 -32.8,
 -32.7,
 -32.6,
 -32.5,
 -32.4,
 -32.3,
 -32.2,
 -32.1,
 -32.0,
 -31.9,
 -31.8,
 -31.7,
 -31.6,
 -31.5,
 -31.4,
 -31.3,
 -31.2,
 -31.1,
 -31.0,
 -30.9,
 -30.8,
 -30.7,
 -30.6,
 -30.5,
 -30.3,
 -30.2,
 -30.1,
 -30.0,
 -29.9,
 -29.8,
 -29.7,
 -29.6,
 -29.5,
 -29.4,
 -29.3,


### Theres a 49.0 degree gap between the lowest number (-99.0) and the second lowest (-50.0). Let's check -50.0 to make sure it seems like a legitimate reading.

In [48]:
#Use .loc (label based lookup) to see the row where AvgTemperature was -50.0
ct.loc[ct['AvgTemperature'] == -50.0]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
1499140,North America,US,Alaska,Fairbanks,12,31,1999,-50.0


### Our true lowest record temperature was on December 31st in Fairbanks, Alaska. Sounds good to me!

### So we know there are some days in which day = 0, year = 200 or 201, and AvgTemp = -99.0. Let's remove those from out dataset.

In [51]:
ct_fltrd = ct[(ct['Day'].isin(range(1,32))) & (ct['Year'].isin(range(1995,2021))) & (ct['AvgTemperature'] != -99.0)]
ct_fltrd

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,Africa,Algeria,,Algiers,1,1,1995,64.2
1,Africa,Algeria,,Algiers,1,2,1995,49.4
2,Africa,Algeria,,Algiers,1,3,1995,48.8
3,Africa,Algeria,,Algiers,1,4,1995,46.4
4,Africa,Algeria,,Algiers,1,5,1995,47.9
...,...,...,...,...,...,...,...,...
2906322,North America,US,Additional Territories,San Juan Puerto Rico,7,27,2013,82.4
2906323,North America,US,Additional Territories,San Juan Puerto Rico,7,28,2013,81.6
2906324,North America,US,Additional Territories,San Juan Puerto Rico,7,29,2013,84.2
2906325,North America,US,Additional Territories,San Juan Puerto Rico,7,30,2013,83.8
