Capstone Two Data Wrangling

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import datetime

In [2]:
# Read the raw city-level temperature CSV from the current working directory.
csv_path = 'GlobalLandTemperaturesByCity.csv'
df = pd.read_csv(csv_path)

In [3]:
# Print the column names, dtypes and memory footprint of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   dt                             object 
 1   AverageTemperature             float64
 2   AverageTemperatureUncertainty  float64
 3   City                           object 
 4   Country                        object 
 5   Latitude                       object 
 6   Longitude                      object 
dtypes: float64(2), object(5)
memory usage: 459.2+ MB


In [4]:
# (row count, column count) of the loaded frame.
df.shape

(8599212, 7)

In [5]:
# Preview the first five rows of the raw data.
df.head(5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [6]:
# Parse the raw 'dt' strings into pandas datetimes.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
df['dt'] = df['dt'].pipe(pd.to_datetime, errors='coerce')

In [7]:
# Re-format each parsed date as a Month-Day-Year string.
# NOTE(review): strftime returns text, so the column is no longer datetime64 after
# this cell and will sort lexically rather than chronologically - confirm intended.
MONTH_DAY_YEAR = '%m-%d-%Y'
df['dt'] = df['dt'].dt.strftime(MONTH_DAY_YEAR)

In [8]:
# Rename the 'dt' column to 'Date' (rebinding df rather than mutating it in place).
df = df.rename(columns={'dt': 'Date'})

In [9]:
# Verify the rename took effect by previewing the first five rows.
df.head(5)

Unnamed: 0,Date,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,11-01-1743,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,12-01-1743,,,Århus,Denmark,57.05N,10.33E
2,01-01-1744,,,Århus,Denmark,57.05N,10.33E
3,02-01-1744,,,Århus,Denmark,57.05N,10.33E
4,03-01-1744,,,Århus,Denmark,57.05N,10.33E


In [10]:
# Check the data type of the Date column.
# It is reported as object because the strftime step above stored the dates as strings.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   Date                           object 
 1   AverageTemperature             float64
 2   AverageTemperatureUncertainty  float64
 3   City                           object 
 4   Country                        object 
 5   Latitude                       object 
 6   Longitude                      object 
dtypes: float64(2), object(5)
memory usage: 459.2+ MB


In [11]:
# Number of distinct values in every column (missing values are not counted).
df.nunique(dropna=True)

Date                               3239
AverageTemperature               111994
AverageTemperatureUncertainty     10902
City                               3448
Country                             159
Latitude                             73
Longitude                          1227
dtype: int64

In [12]:
# Count and percentage of missing values per column, most-affected columns first.
is_missing = df.isnull()  # build the null mask once and reuse it for both columns
missing_data = pd.DataFrame({
    'count': is_missing.sum(),
    '%': 100 * is_missing.mean(),
})
missing_data.sort_values(by='count', ascending=False)

Unnamed: 0,count,%
AverageTemperature,364130,4.234458
AverageTemperatureUncertainty,364130,4.234458
Date,0,0.0
City,0,0.0
Country,0,0.0
Latitude,0,0.0
Longitude,0,0.0


In [20]:
# Boolean mask marking every row whose AverageTemperature is missing.
null_values = df['AverageTemperature'].isna()

# Show only those rows (pandas truncates the display of the large result).
df.loc[null_values]

Unnamed: 0,Date,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
1,12-01-1743,,,Århus,Denmark,57.05N,10.33E
2,01-01-1744,,,Århus,Denmark,57.05N,10.33E
3,02-01-1744,,,Århus,Denmark,57.05N,10.33E
4,03-01-1744,,,Århus,Denmark,57.05N,10.33E
9,08-01-1744,,,Århus,Denmark,57.05N,10.33E
...,...,...,...,...,...,...,...
8596076,06-01-1752,,,Zwolle,Netherlands,52.24N,5.26E
8596077,07-01-1752,,,Zwolle,Netherlands,52.24N,5.26E
8596078,08-01-1752,,,Zwolle,Netherlands,52.24N,5.26E
8596079,09-01-1752,,,Zwolle,Netherlands,52.24N,5.26E


In [13]:
# Summary statistics for the numeric columns, transposed to one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AverageTemperature,8235082.0,16.727433,10.353442,-42.704,10.299,18.831,25.21,39.651
AverageTemperatureUncertainty,8235082.0,1.028575,1.129733,0.034,0.337,0.591,1.349,15.396


In [14]:
# Coldest countries by mean recorded temperature.
# numeric_only=True restricts the mean to the numeric columns; without it,
# pandas >= 2.0 raises a TypeError because the frame also holds string columns
# (Date, City, Latitude, Longitude).
country_average = df.groupby(['Country']).mean(numeric_only=True)
country_average.sort_values(by='AverageTemperature', ascending=True).head()

Unnamed: 0_level_0,AverageTemperature,AverageTemperatureUncertainty
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Mongolia,-3.365485,1.252115
Iceland,1.500089,1.62937
Russia,3.347268,1.365189
Norway,3.612553,1.741349
Finland,3.711645,1.603867


In [15]:
# Hottest countries: the same per-country means, ordered from warmest to coldest.
hottest_first = country_average.sort_values('AverageTemperature', ascending=False)
hottest_first.head()

Unnamed: 0_level_0,AverageTemperature,AverageTemperatureUncertainty
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Djibouti,29.15279,0.923249
Niger,28.145552,0.799865
Sudan,28.072831,0.828664
Burkina Faso,27.815295,0.763835
Mali,27.590491,0.80386
