# Proyecto Clima En Ciudades Mayores

In [1]:
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lat_lon_parser import parse

# Base directory that the CSV paths in the following cells are joined against
# (relative to the notebook's working directory).
basedir = './GlobalTemperatures'

## Carga de Datos

In [2]:
# Build the path to the raw per-city temperature CSV inside basedir.
clima = os.path.join(basedir, 'GlobalLandTemperaturesbyMajorCity.csv')

# dfClima keeps the file exactly as read; df is an independent copy that the
# following cells transform.
dfClima = pd.read_csv(clima)
df = dfClima.copy()
df

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W
3,1849-04-01,26.140,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W
4,1849-05-01,25.427,1.200,Abidjan,Côte D'Ivoire,5.63N,3.23W
...,...,...,...,...,...,...,...
239172,2013-05-01,18.979,0.807,Xian,China,34.56N,108.97E
239173,2013-06-01,23.522,0.647,Xian,China,34.56N,108.97E
239174,2013-07-01,25.251,1.042,Xian,China,34.56N,108.97E
239175,2013-08-01,24.528,0.840,Xian,China,34.56N,108.97E


Nos damos cuenta de que nuestra variable `dt` es de tipo `object` en vez de fecha, por lo que la convertiremos a `datetime` para poder calcular la temporada del año.

In [3]:
# Inspect the column dtypes of the raw frame; 'dt' is reported as object
# rather than as a datetime type.
dfClima.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
City                              object
Country                           object
Latitude                          object
Longitude                         object
dtype: object

In [4]:
# Parse the raw 'dt' strings into datetime64 values. Reading from dfClima (not df)
# keeps this cell idempotent: re-running it always starts from the raw column.
df['dt'] = pd.to_datetime(dfClima['dt'])

# Show the frame explicitly, then leave dtypes as the cell's last expression.
# (Bundling both as `df.dtypes, display(df)` builds a tuple whose second item is
# display()'s return value, None, which then shows up in the cell output.)
display(df)
df.dtypes

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W
3,1849-04-01,26.140,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W
4,1849-05-01,25.427,1.200,Abidjan,Côte D'Ivoire,5.63N,3.23W
...,...,...,...,...,...,...,...
239172,2013-05-01,18.979,0.807,Xian,China,34.56N,108.97E
239173,2013-06-01,23.522,0.647,Xian,China,34.56N,108.97E
239174,2013-07-01,25.251,1.042,Xian,China,34.56N,108.97E
239175,2013-08-01,24.528,0.840,Xian,China,34.56N,108.97E


(dt                               datetime64[ns]
 AverageTemperature                      float64
 AverageTemperatureUncertainty           float64
 City                                     object
 Country                                  object
 Latitude                                 object
 Longitude                                object
 dtype: object,
 None)

Una vez que la columna `dt` es de tipo fecha, extraemos el mes para luego reemplazarlo por la temporada.

In [5]:
# Store the calendar month (1-12) of each record in 'Season' for now; the month
# numbers are swapped for season names further down.
df['Season'] = df['dt'].dt.month
df

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,Season
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W,1
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W,2
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W,3
3,1849-04-01,26.140,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W,4
4,1849-05-01,25.427,1.200,Abidjan,Côte D'Ivoire,5.63N,3.23W,5
...,...,...,...,...,...,...,...,...
239172,2013-05-01,18.979,0.807,Xian,China,34.56N,108.97E,5
239173,2013-06-01,23.522,0.647,Xian,China,34.56N,108.97E,6
239174,2013-07-01,25.251,1.042,Xian,China,34.56N,108.97E,7
239175,2013-08-01,24.528,0.840,Xian,China,34.56N,108.97E,8


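Usamos un diccionario para reemplazar los números de mes por el nombre de la temporada. Además, eliminamos las filas con valores faltantes y convertimos la latitud y la longitud a grados decimales.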

In [6]:
# Season labels for months 1-3, 4-6, 7-9 and 10-12 respectively.
# NOTE(review): this is a plain calendar-quarter split; it does not account for the
# hemisphere of each city, and December is labelled 'Otoño' — confirm this is intended.
nombres_temporada = ('Invierno', 'Primavera', 'Verano', 'Otoño')

# Take the months from 'dt' rather than from the current 'Season' column, so the
# cell can be re-run after 'Season' already holds season names.
meses = df['dt'].dt.month
valores_unicos = meses.unique().tolist()
encoded_values = {mes: nombres_temporada[(mes - 1) // 3] for mes in valores_unicos}


def _a_grados_decimales(valor):
    """Convert a coordinate string with parse(); return non-string values unchanged.

    Values that are not strings (for example, coordinates already converted by an
    earlier run of this cell) are passed through as they are.
    """
    return parse(valor) if isinstance(valor, str) else valor


# Build the cleaned frame in one chain and rebind df only once every step succeeded.
# Assigning the mapped column back (instead of `df['Season'].replace(..., inplace=True)`)
# avoids mutating a temporary Series, which pandas' copy-on-write mode does not
# propagate to the parent frame.
df = (
    df.assign(Season=meses.map(encoded_values))
    .dropna()  # drop rows with any missing value before parsing coordinates
    .assign(
        Latitude=lambda d: d['Latitude'].apply(_a_grados_decimales),
        Longitude=lambda d: d['Longitude'].apply(_a_grados_decimales),
    )
)
df

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,Season
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63,-3.23,Invierno
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63,-3.23,Invierno
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63,-3.23,Invierno
3,1849-04-01,26.140,1.387,Abidjan,Côte D'Ivoire,5.63,-3.23,Primavera
4,1849-05-01,25.427,1.200,Abidjan,Côte D'Ivoire,5.63,-3.23,Primavera
...,...,...,...,...,...,...,...,...
239171,2013-04-01,12.563,1.823,Xian,China,34.56,108.97,Primavera
239172,2013-05-01,18.979,0.807,Xian,China,34.56,108.97,Primavera
239173,2013-06-01,23.522,0.647,Xian,China,34.56,108.97,Primavera
239174,2013-07-01,25.251,1.042,Xian,China,34.56,108.97,Verano


Ya teniendo este DataFrame limpio, lo guardamos como CSV para usarlo posteriormente.

In [7]:
# Persist the cleaned frame for the rest of the notebook. The file is written only
# when it is missing, so an existing export is never overwritten and the notebook
# still runs from a fresh kernel on a machine that does not have it yet.
corrected_path = os.path.join(basedir, 'GLTbyMC_corrected.csv')
if not os.path.exists(corrected_path):
    df.to_csv(corrected_path, index=False)

## Comprensión de negocio y datos

In [8]:
# Reload the cleaned dataset from its CSV export inside basedir.
# NOTE(review): the CSV round-trip brings 'dt' back as object (see the data.info()
# output); consider pd.read_csv(filepath, parse_dates=['dt']) if later cells need
# datetimes — confirm nothing downstream relies on the string form first.
filename = 'GLTbyMC_corrected.csv'
filepath  = os.path.join(basedir, filename)
data = pd.read_csv(filepath)
data

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,Season
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63,-3.23,Invierno
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63,-3.23,Invierno
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63,-3.23,Invierno
3,1849-04-01,26.140,1.387,Abidjan,Côte D'Ivoire,5.63,-3.23,Primavera
4,1849-05-01,25.427,1.200,Abidjan,Côte D'Ivoire,5.63,-3.23,Primavera
...,...,...,...,...,...,...,...,...
228170,2013-04-01,12.563,1.823,Xian,China,34.56,108.97,Primavera
228171,2013-05-01,18.979,0.807,Xian,China,34.56,108.97,Primavera
228172,2013-06-01,23.522,0.647,Xian,China,34.56,108.97,Primavera
228173,2013-07-01,25.251,1.042,Xian,China,34.56,108.97,Verano


In [9]:
# Summarise column dtypes, non-null counts and memory usage of the reloaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228175 entries, 0 to 228174
Data columns (total 8 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   dt                             228175 non-null  object 
 1   AverageTemperature             228175 non-null  float64
 2   AverageTemperatureUncertainty  228175 non-null  float64
 3   City                           228175 non-null  object 
 4   Country                        228175 non-null  object 
 5   Latitude                       228175 non-null  float64
 6   Longitude                      228175 non-null  float64
 7   Season                         228175 non-null  object 
dtypes: float64(4), object(4)
memory usage: 13.9+ MB
