### Import necessary libraries here

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

# Let's load our csv in...
### Because it is > 100 MB, we have to store it on our local machine instead of GitHub, so you'll have to comment out my directory and put in your own

In [2]:
#Location of the local copy of the dataset - point this at wherever you saved the CSV
csv_path = "C:/Users/conor/Desktop/city_temperature/city_temperature.csv"
ct = pd.read_csv(csv_path)
#Preview the first five rows
ct.head(n=5)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,Africa,Algeria,,Algiers,1,1,1995,64.2
1,Africa,Algeria,,Algiers,1,2,1995,49.4
2,Africa,Algeria,,Algiers,1,3,1995,48.8
3,Africa,Algeria,,Algiers,1,4,1995,46.4
4,Africa,Algeria,,Algiers,1,5,1995,47.9


In [3]:
#Preview the last five rows
ct.iloc[-5:]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
2906322,North America,US,Additional Territories,San Juan Puerto Rico,7,27,2013,82.4
2906323,North America,US,Additional Territories,San Juan Puerto Rico,7,28,2013,81.6
2906324,North America,US,Additional Territories,San Juan Puerto Rico,7,29,2013,84.2
2906325,North America,US,Additional Territories,San Juan Puerto Rico,7,30,2013,83.8
2906326,North America,US,Additional Territories,San Juan Puerto Rico,7,31,2013,83.6


### First, let's see how many rows and columns we have.

In [4]:
#A DataFrame's .shape attribute is a (row count, column count) tuple
n_rows, n_cols = ct.shape
(n_rows, n_cols)

(2906327, 8)

### Next, lets check for any null values in the rows.


In [5]:
#For every column, report whether at least one of its cells is null
null_flags = ct.isnull()
null_flags.any(axis=0)

Region            False
Country           False
State              True
City              False
Month             False
Day               False
Year              False
AvgTemperature    False
dtype: bool

### It looks like we have NaN values in the 'State' column. We know how many rows exist in the data set, so let's find out how many rows in 'State' are NaN

In [6]:
#Count the null cells column by column
null_flags = ct.isna()
null_flags.sum(axis=0)

Region                  0
Country                 0
State             1450990
City                    0
Month                   0
Day                     0
Year                    0
AvgTemperature          0
dtype: int64

### Let's look at those rows where the State column is not null

In [7]:
#Keep only the rows of ct whose State cell is populated
has_state = ct['State'].notna()
ct.loc[has_state]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
1450990,North America,US,Alabama,Birmingham,1,1,1995,50.7
1450991,North America,US,Alabama,Birmingham,1,2,1995,37.2
1450992,North America,US,Alabama,Birmingham,1,3,1995,33.2
1450993,North America,US,Alabama,Birmingham,1,4,1995,33.3
1450994,North America,US,Alabama,Birmingham,1,5,1995,26.4
...,...,...,...,...,...,...,...,...
2906322,North America,US,Additional Territories,San Juan Puerto Rico,7,27,2013,82.4
2906323,North America,US,Additional Territories,San Juan Puerto Rico,7,28,2013,81.6
2906324,North America,US,Additional Territories,San Juan Puerto Rico,7,29,2013,84.2
2906325,North America,US,Additional Territories,San Juan Puerto Rico,7,30,2013,83.8


### Hmm, it looks like the head and tail of our subset are both North America... let's see if there are any rows where the Region isn't North America and the State is not null

In [8]:
#Combine two boolean masks with & : region is not North America AND the state is populated
outside_north_america = ct['Region'].ne('North America')
has_state = ct['State'].notna()
ct.loc[outside_north_america & has_state]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature


### Next, let's make sure our Months, Days, and Years are all valid

In [9]:
#Keep the rows whose month lies in 1..12 and report the shape; the row count should equal
#the row count of the unfiltered frame
month_ok = ct['Month'].isin(range(1, 13))
ct.loc[month_ok].shape

(2906327, 8)

In [10]:
#Same check for the day column: valid days lie in 1..31
day_ok = ct['Day'].isin(range(1, 32))
ct.loc[day_ok].shape

(2906319, 8)

### Hmm... there are 8 rows where the day is not between 1 and 31. Let's take a closer look at those

In [11]:
#A tilde (~) in front of a boolean mask inverts it, so this selects the rows where the day is NOT in 1..31
day_ok = ct['Day'].isin(range(1, 32))
ct.loc[~day_ok]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
82774,Africa,Guinea,,Conakry,3,0,2008,-99.0
85697,Africa,Guinea,,Conakry,3,0,2016,-99.0
92041,Africa,Guinea-Bissau,,Bissau,3,0,2008,-99.0
146077,Africa,Malawi,,Lilongwe,3,0,2012,-99.0
177862,Africa,Nigeria,,Lagos,3,0,2008,-99.0
241159,Africa,Uganda,,Kampala,3,0,2012,-99.0
1209901,North America,Mexico,,Guadalajara,3,0,2012,-99.0
1333910,South/Central America & Carribean,Cuba,,Havana,3,0,2008,-99.0


### Interesting. These days are equal to 0 and the Average Temperature is -99.0! We will revisit this after finishing our date time conditions.

In [12]:
#Sort the whole frame by year; with no head/tail applied, the truncated display shows both
#the earliest and the latest years at a glance
ct.sort_values(by='Year')

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
743771,Europe,Germany,,Hamburg,12,12,200,-99.0
845936,Europe,Norway,,Oslo,12,28,200,-99.0
845935,Europe,Norway,,Oslo,12,27,200,-99.0
845934,Europe,Norway,,Oslo,12,26,200,-99.0
845933,Europe,Norway,,Oslo,12,25,200,-99.0
...,...,...,...,...,...,...,...,...
2593753,North America,US,South Dakota,Sioux Falls,4,3,2020,23.5
2593752,North America,US,South Dakota,Sioux Falls,4,2,2020,44.3
2593751,North America,US,South Dakota,Sioux Falls,4,1,2020,52.4
794360,Europe,Ireland,,Dublin,1,10,2020,41.3


In [13]:
#The earliest year above was 200 (and its average temperature was -99.0 as well), so count
#the rows whose year falls inside the expected 1995..2020 window
year_ok = ct['Year'].isin(range(1995, 2021))
ct.loc[year_ok].shape

(2905887, 8)

In [14]:
#Some rows fall outside the window - display them
year_ok = ct['Year'].isin(range(1995, 2021))
ct.loc[~year_ok]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
58178,Africa,Ethiopia,,Addis Ababa,12,3,201,-99.0
58179,Africa,Ethiopia,,Addis Ababa,12,4,201,-99.0
58180,Africa,Ethiopia,,Addis Ababa,12,5,201,-99.0
58181,Africa,Ethiopia,,Addis Ababa,12,6,201,-99.0
58182,Africa,Ethiopia,,Addis Ababa,12,7,201,-99.0
...,...,...,...,...,...,...,...,...
1212427,North America,Mexico,,Guadalajara,12,27,201,-99.0
1212428,North America,Mexico,,Guadalajara,12,28,201,-99.0
1212429,North America,Mexico,,Guadalajara,12,29,201,-99.0
1212430,North America,Mexico,,Guadalajara,12,30,201,-99.0


In [15]:
#Many -99.0 readings again, several of them in notoriously hot places. Look for rows where the
#year is out of range while the temperature is something other than -99.0
year_bad = ~ct['Year'].isin(range(1995, 2021))
temp_recorded = ct['AvgTemperature'].ne(-99.0)
ct.loc[year_bad & temp_recorded]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature


### So there are 440 rows where the year isn't in our desired range, and surprisingly the average temperature for all of those rows is -99.0. Let's apply the same logic we used on our date columns to our AvgTemperature column to make sure those values are all logical.

In [16]:
#Order the rows from coldest to hottest so both temperature extremes are visible
ct.sort_values(by='AvgTemperature')

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
1377340,South/Central America & Carribean,Guyana,,Georgetown,8,21,2005,-99.0
88779,Africa,Guinea-Bissau,,Bissau,3,27,1999,-99.0
88778,Africa,Guinea-Bissau,,Bissau,3,26,1999,-99.0
88777,Africa,Guinea-Bissau,,Bissau,3,25,1999,-99.0
88776,Africa,Guinea-Bissau,,Bissau,3,24,1999,-99.0
...,...,...,...,...,...,...,...,...
1034963,Middle East,Kuwait,,Kuwait,8,2,2012,109.9
1036756,Middle East,Kuwait,,Kuwait,6,29,2017,109.9
1032390,Middle East,Kuwait,,Kuwait,7,17,2005,109.9
1036042,Middle East,Kuwait,,Kuwait,7,17,2015,109.9


### 110.0 Fahrenheit is a believable temperature, but -99.0 Fahrenheit in August in Guyana? I'm not sure that's accurate. Let's now grab the unique temperature values

In [17]:
#Collect the distinct temperature values, sort them ascending (the default), and show the five lowest
distinct_temps = list(ct['AvgTemperature'].unique())
distinct_temps.sort()
distinct_temps[:5]

[-99.0, -50.0, -49.1, -47.7, -46.9]

### There's a 49.0 degree gap between the lowest number (-99.0) and the second lowest (-50.0). Let's check -50.0 to make sure it seems like a legitimate reading.

In [18]:
#Select, via a boolean mask passed to .loc, the row(s) whose AvgTemperature is exactly -50.0
is_minus_fifty = ct['AvgTemperature'].eq(-50.0)
ct.loc[is_minus_fifty]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
1499140,North America,US,Alaska,Fairbanks,12,31,1999,-50.0


### Our true lowest record temperature was on December 31st in Fairbanks, Alaska. Sounds good to me!

### So we know there are some rows in which day = 0, year = 200 or 201, or AvgTemp = -99.0. Let's remove those from our dataset.

In [19]:
#A row survives only if its day, its year and its temperature reading are all valid
day_ok = ct['Day'].isin(range(1, 32))
year_ok = ct['Year'].isin(range(1995, 2021))
temp_ok = ct['AvgTemperature'].ne(-99.0)
ct_fltrd = ct[day_ok & year_ok & temp_ok]
ct_fltrd

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,Africa,Algeria,,Algiers,1,1,1995,64.2
1,Africa,Algeria,,Algiers,1,2,1995,49.4
2,Africa,Algeria,,Algiers,1,3,1995,48.8
3,Africa,Algeria,,Algiers,1,4,1995,46.4
4,Africa,Algeria,,Algiers,1,5,1995,47.9
...,...,...,...,...,...,...,...,...
2906322,North America,US,Additional Territories,San Juan Puerto Rico,7,27,2013,82.4
2906323,North America,US,Additional Territories,San Juan Puerto Rico,7,28,2013,81.6
2906324,North America,US,Additional Territories,San Juan Puerto Rico,7,29,2013,84.2
2906325,North America,US,Additional Territories,San Juan Puerto Rico,7,30,2013,83.8


### Let's check all of the scenarios in which our data is out of range or values are incorrect, to make sure we didn't lose more or fewer rows than we expected to

In [20]:
#Build each validity mask once and reuse it below; every .isin() call scans the whole frame,
#so recomputing the same mask for every case is wasted work
day_ok = ct['Day'].isin(range(1,32))
year_ok = ct['Year'].isin(range(1995,2021))
temp_ok = ct['AvgTemperature'] != -99.0

#The seven cases below are mutually exclusive and together cover every way a row can be invalid,
#so their row counts must add up to the number of rows removed by the filter
#Day wrong, Year and Temp in range
print(ct[~day_ok & year_ok & temp_ok].shape)
#Year wrong, Day and Temp in range
print(ct[day_ok & ~year_ok & temp_ok].shape)
#AvgTemp wrong, Day and year in range
print(ct[day_ok & year_ok & ~temp_ok].shape)
#Day and AvgTemp wrong, Year in range
print(ct[~day_ok & year_ok & ~temp_ok].shape)
#Day and Year wrong, Temp in range
print(ct[~day_ok & ~year_ok & temp_ok].shape)
#Year and Avg Temp wrong, Day in range
print(ct[day_ok & ~year_ok & ~temp_ok].shape)
#Day, Year and AvgTemp all wrong - the one remaining combination, needed for the check to be exhaustive
print(ct[~day_ok & ~year_ok & ~temp_ok].shape)

(0, 8)
(0, 8)
(79224, 8)
(8, 8)
(0, 8)
(440, 8)


### That totals to 79,224 rows where only our AvgTemp is invalid (-99.0), 8 rows where our day = 0 and our AvgTemp = -99.0, and 440 rows where the year = 200 or 201 and the AvgTemp = -99.0

In [21]:
#Add up the row counts of the non-empty invalid cases reported above
invalid_case_counts = (79224, 8, 440)
print(sum(invalid_case_counts))

79672


In [22]:
#Number of rows dropped by the filter: original row count minus filtered row count
len(ct) - len(ct_fltrd)

79672

### Perfect. Our filtered data is the exact size that we expected it to be.

### Before we go on, let's take a look at individual incorrect data cells. Are these cells fixable, or is filtering them out the right move?

In [23]:
#Inspect one bad reading together with its neighbouring rows; the starting position was read off
#the first row of the AvgTemperature-sorted table above
first_pos, end_pos = 1377339, 1377347
ct.iloc[first_pos:end_pos]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
1377339,South/Central America & Carribean,Guyana,,Georgetown,8,20,2005,87.3
1377340,South/Central America & Carribean,Guyana,,Georgetown,8,21,2005,-99.0
1377341,South/Central America & Carribean,Guyana,,Georgetown,8,22,2005,-99.0
1377342,South/Central America & Carribean,Guyana,,Georgetown,8,23,2005,86.2
1377343,South/Central America & Carribean,Guyana,,Georgetown,8,24,2005,82.5
1377344,South/Central America & Carribean,Guyana,,Georgetown,8,25,2005,-99.0
1377345,South/Central America & Carribean,Guyana,,Georgetown,8,26,2005,86.3
1377346,South/Central America & Carribean,Guyana,,Georgetown,8,27,2005,86.3


### Interesting. So we have a variety of issues here. We can't simply take the average of the values above and below a -99.0 AvgTemperature reading, because missing readings can be consecutive. Let's apply these thoughts to the day = 0 cells and the year = 200 or 201 cells.

In [24]:
#Pick one location from the day = 0 list and view a few days before the day = 0 row plus the month after it
first_pos, end_pos = 82770, 82807
ct.iloc[first_pos:end_pos]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
82770,Africa,Guinea,,Conakry,2,25,2008,-99.0
82771,Africa,Guinea,,Conakry,2,26,2008,-99.0
82772,Africa,Guinea,,Conakry,2,27,2008,-99.0
82773,Africa,Guinea,,Conakry,2,28,2008,-99.0
82774,Africa,Guinea,,Conakry,3,0,2008,-99.0
82775,Africa,Guinea,,Conakry,3,1,2008,-99.0
82776,Africa,Guinea,,Conakry,3,2,2008,-99.0
82777,Africa,Guinea,,Conakry,3,3,2008,-99.0
82778,Africa,Guinea,,Conakry,3,4,2008,-99.0
82779,Africa,Guinea,,Conakry,3,5,2008,-99.0


### You'll notice from this list, and the list of day = 0's, all day = 0 cells happened on month = 3 and in a leap year... Let's investigate each day = 0 and check to see if the day before was February 28th or February 29th like it should be

In [25]:
#Show the day = 0 rows once more so they are close at hand for the next step
day_ok = ct['Day'].isin(range(1, 32))
ct.loc[~day_ok]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
82774,Africa,Guinea,,Conakry,3,0,2008,-99.0
85697,Africa,Guinea,,Conakry,3,0,2016,-99.0
92041,Africa,Guinea-Bissau,,Bissau,3,0,2008,-99.0
146077,Africa,Malawi,,Lilongwe,3,0,2012,-99.0
177862,Africa,Nigeria,,Lagos,3,0,2008,-99.0
241159,Africa,Uganda,,Kampala,3,0,2012,-99.0
1209901,North America,Mexico,,Guadalajara,3,0,2012,-99.0
1333910,South/Central America & Carribean,Cuba,,Havana,3,0,2008,-99.0


In [26]:
#Show every day = 0 row together with the row directly before it.
#The row positions are derived from the data rather than copied by hand from the table above, so this
#cell stays correct if the CSV is refreshed or reordered
day0_pos = np.flatnonzero((~ct['Day'].isin(range(1,32))).to_numpy())
#A day = 0 row at position 0 has no predecessor; skip it, because position -1 would silently wrap
#around to the last row of the frame
prev_pos = day0_pos[day0_pos > 0] - 1
#np.union1d merges both position arrays into one sorted, de-duplicated array, so each day = 0 row
#ends up right after the row that precedes it
ct.iloc[np.union1d(prev_pos, day0_pos),:]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
82773,Africa,Guinea,,Conakry,2,28,2008,-99.0
82774,Africa,Guinea,,Conakry,3,0,2008,-99.0
85696,Africa,Guinea,,Conakry,2,28,2016,-99.0
85697,Africa,Guinea,,Conakry,3,0,2016,-99.0
92040,Africa,Guinea-Bissau,,Bissau,2,28,2008,79.8
92041,Africa,Guinea-Bissau,,Bissau,3,0,2008,-99.0
146076,Africa,Malawi,,Lilongwe,2,28,2012,71.4
146077,Africa,Malawi,,Lilongwe,3,0,2012,-99.0
177861,Africa,Nigeria,,Lagos,2,28,2008,-99.0
177862,Africa,Nigeria,,Lagos,3,0,2008,-99.0


### Instead of just filtering these day = 0's out, we can change them to Month = 2 and Day = 29... but let's just make sure there are other February 29th's listed for other cities on leap years

In [27]:
#Select the rows dated February 29th
is_february = ct['Month'].eq(2)
is_29th = ct['Day'].eq(29)
ct.loc[is_february & is_29th]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
424,Africa,Algeria,,Algiers,2,29,1996,51.7
1885,Africa,Algeria,,Algiers,2,29,2000,54.7
3346,Africa,Algeria,,Algiers,2,29,2004,48.6
4807,Africa,Algeria,,Algiers,2,29,2008,57.2
6268,Africa,Algeria,,Algiers,2,29,2012,49.2
...,...,...,...,...,...,...,...,...
2899964,North America,US,Additional Territories,San Juan Puerto Rico,2,29,1996,77.4
2901425,North America,US,Additional Territories,San Juan Puerto Rico,2,29,2000,74.5
2902886,North America,US,Additional Territories,San Juan Puerto Rico,2,29,2004,76.0
2904347,North America,US,Additional Territories,San Juan Puerto Rico,2,29,2008,77.4


### We've figured out how we can fix our day = 0 problem so let's continue on to our year = 200 | 201 issue

In [28]:
#Bring back the table of rows whose year is outside 1995..2020
year_ok = ct['Year'].isin(range(1995, 2021))
ct.loc[~year_ok]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
58178,Africa,Ethiopia,,Addis Ababa,12,3,201,-99.0
58179,Africa,Ethiopia,,Addis Ababa,12,4,201,-99.0
58180,Africa,Ethiopia,,Addis Ababa,12,5,201,-99.0
58181,Africa,Ethiopia,,Addis Ababa,12,6,201,-99.0
58182,Africa,Ethiopia,,Addis Ababa,12,7,201,-99.0
...,...,...,...,...,...,...,...,...
1212427,North America,Mexico,,Guadalajara,12,27,201,-99.0
1212428,North America,Mexico,,Guadalajara,12,28,201,-99.0
1212429,North America,Mexico,,Guadalajara,12,29,201,-99.0
1212430,North America,Mexico,,Guadalajara,12,30,201,-99.0


### As we can see, the indices here are consecutive, so our issue is different from our day = 0 problem. Let's take a closer look at one of these issues

In [29]:
#Slice a window of the frame in which rows with a wrong year sit next to rows with a correct year
first_pos, end_pos = 58177, 58208
ct.iloc[first_pos:end_pos]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
58177,Africa,Ethiopia,,Addis Ababa,12,31,2014,-99.0
58178,Africa,Ethiopia,,Addis Ababa,12,3,201,-99.0
58179,Africa,Ethiopia,,Addis Ababa,12,4,201,-99.0
58180,Africa,Ethiopia,,Addis Ababa,12,5,201,-99.0
58181,Africa,Ethiopia,,Addis Ababa,12,6,201,-99.0
58182,Africa,Ethiopia,,Addis Ababa,12,7,201,-99.0
58183,Africa,Ethiopia,,Addis Ababa,12,8,201,-99.0
58184,Africa,Ethiopia,,Addis Ababa,12,9,201,-99.0
58185,Africa,Ethiopia,,Addis Ababa,12,10,201,-99.0
58186,Africa,Ethiopia,,Addis Ababa,12,11,201,-99.0


### So here it looks like, leading into a new year, either we lost the final digit of our Year column or the previous few entries were duplicated. Let's investigate another instance of an incorrect year to see if there are any similarities, particularly whether or not the changing of the year has anything to do with it.

In [30]:
#Window around the first Guadalajara row that has a truncated year
first_pos, end_pos = 1212400, 1212433
ct.iloc[first_pos:end_pos]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
1212400,North America,Mexico,,Guadalajara,12,30,2018,60.2
1212401,North America,Mexico,,Guadalajara,12,30,2018,-99.0
1212402,North America,Mexico,,Guadalajara,12,31,2018,-99.0
1212403,North America,Mexico,,Guadalajara,12,3,201,-99.0
1212404,North America,Mexico,,Guadalajara,12,4,201,-99.0
1212405,North America,Mexico,,Guadalajara,12,5,201,-99.0
1212406,North America,Mexico,,Guadalajara,12,6,201,-99.0
1212407,North America,Mexico,,Guadalajara,12,7,201,-99.0
1212408,North America,Mexico,,Guadalajara,12,8,201,-99.0
1212409,North America,Mexico,,Guadalajara,12,9,201,-99.0


### Hey, look at that. This instance also occurs in December and runs from the 3rd to the 31st, and if we look at the top of the table, these are duplicated rows that have dropped the last digit from the year.

### Speaking of which, maybe we should investigate places where the day is the same for two consecutive entries, or the month stays the same for, well, over a month straight. That would help us fix this December-twice problem and see if that problem exists elsewhere. We are quickly discovering there are many ways to skin a cat. For now, let's continue to look at the month-long instances of the incorrect year, this time grabbing all instances.

In [33]:
#pandas truncates long frames when displaying them; to see every row, temporarily raise the
#display.max_rows option inside a with block (the setting reverts when the block ends)
year_bad = ~ct['Year'].isin(range(1995, 2021))
with pd.option_context("display.max_rows", 500):
    display(ct.loc[year_bad])

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
58178,Africa,Ethiopia,,Addis Ababa,12,3,201,-99.0
58179,Africa,Ethiopia,,Addis Ababa,12,4,201,-99.0
58180,Africa,Ethiopia,,Addis Ababa,12,5,201,-99.0
58181,Africa,Ethiopia,,Addis Ababa,12,6,201,-99.0
58182,Africa,Ethiopia,,Addis Ababa,12,7,201,-99.0
58183,Africa,Ethiopia,,Addis Ababa,12,8,201,-99.0
58184,Africa,Ethiopia,,Addis Ababa,12,9,201,-99.0
58185,Africa,Ethiopia,,Addis Ababa,12,10,201,-99.0
58186,Africa,Ethiopia,,Addis Ababa,12,11,201,-99.0
58187,Africa,Ethiopia,,Addis Ababa,12,12,201,-99.0


In [34]:
#Rather than scrolling through the full table and judging by eye, filter directly for the rows
#whose year is out of range AND whose month is something other than December
year_bad = ~ct['Year'].isin(range(1995, 2021))
not_december = ct['Month'].ne(12)
ct.loc[year_bad & not_december]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
744886,Europe,Germany,,Hamburg,11,2,201,-99.0
744887,Europe,Germany,,Hamburg,11,3,201,-99.0
744888,Europe,Germany,,Hamburg,11,4,201,-99.0
744889,Europe,Germany,,Hamburg,11,5,201,-99.0
744890,Europe,Germany,,Hamburg,11,6,201,-99.0
744891,Europe,Germany,,Hamburg,11,7,201,-99.0
744892,Europe,Germany,,Hamburg,11,8,201,-99.0
744893,Europe,Germany,,Hamburg,11,9,201,-99.0
744894,Europe,Germany,,Hamburg,11,10,201,-99.0
744895,Europe,Germany,,Hamburg,11,11,201,-99.0


In [42]:
#Inspect the rows surrounding the start of the Hamburg block with truncated years
first_pos, end_pos = 744880, 744940
ct.iloc[first_pos:end_pos]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
744880,Europe,Germany,,Hamburg,11,25,2011,44.3
744881,Europe,Germany,,Hamburg,11,26,2011,43.7
744882,Europe,Germany,,Hamburg,11,27,2011,-99.0
744883,Europe,Germany,,Hamburg,11,28,2011,42.1
744884,Europe,Germany,,Hamburg,11,29,2011,40.0
744885,Europe,Germany,,Hamburg,11,29,2011,-99.0
744886,Europe,Germany,,Hamburg,11,2,201,-99.0
744887,Europe,Germany,,Hamburg,11,3,201,-99.0
744888,Europe,Germany,,Hamburg,11,4,201,-99.0
744889,Europe,Germany,,Hamburg,11,5,201,-99.0


In [44]:
#This stretch has wrong years and temperatures across consecutive months; check whether the real
#December shows up after the "fake" December
first_pos, end_pos = 744940, 744980
ct.iloc[first_pos:end_pos]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
744940,Europe,Germany,,Hamburg,12,26,201,-99.0
744941,Europe,Germany,,Hamburg,12,27,201,-99.0
744942,Europe,Germany,,Hamburg,12,28,201,-99.0
744943,Europe,Germany,,Hamburg,12,29,201,-99.0
744944,Europe,Germany,,Hamburg,12,30,201,-99.0
744945,Europe,Germany,,Hamburg,12,31,201,-99.0
744946,Europe,Germany,,Hamburg,1,1,2011,-99.0
744947,Europe,Germany,,Hamburg,1,2,2011,-99.0
744948,Europe,Germany,,Hamburg,1,3,2011,28.2
744949,Europe,Germany,,Hamburg,1,4,2011,33.2


### Okay, so this instance has one full month missing (December) and one full month repeated (November). If we were to simply take the previous temperature value where the value was not -99.0, we would end up with a temperature reading on December 31st that was equal to the reading all the way back on November 29th, which wouldn't be very accurate. I think a good measure would be to take the average of the reading on November 29th and the next accurate reading, January 3rd, and use it for all of the dates in between.