# Importing Modules 

In [1]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col , udf
import pyspark.sql.functions as f

from pyspark.sql.functions import year, month, dayofmonth
import datetime

# Creating SparkSession

In [2]:
# Build (or reuse, if one already exists) a Spark session that runs on the
# local master under the application name 'Udacity'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Udacity')
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook displays the SparkSession created above.
spark

# Loading and Inspecting Dataframe

In [4]:
# Load the raw city-level temperature CSV, taking column names from the first
# row. No schema is supplied or inferred, so every column is read as a string.
df = spark.read.csv(
    path='data/GlobalLandTemperaturesByCity.csv',
    header=True,
)

In [5]:
# Print the column names and types of the freshly loaded frame.
df.printSchema()

root
 |-- dt: string (nullable = true)
 |-- AverageTemperature: string (nullable = true)
 |-- AverageTemperatureUncertainty: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)



In [6]:
# Preview the data; show() prints only the first rows (20 by default).
df.show()

+----------+-------------------+-----------------------------+-----+-------+--------+---------+
|        dt| AverageTemperature|AverageTemperatureUncertainty| City|Country|Latitude|Longitude|
+----------+-------------------+-----------------------------+-----+-------+--------+---------+
|1743-11-01|              6.068|           1.7369999999999999|Århus|Denmark|  57.05N|   10.33E|
|1743-12-01|               null|                         null|Århus|Denmark|  57.05N|   10.33E|
|1744-01-01|               null|                         null|Århus|Denmark|  57.05N|   10.33E|
|1744-02-01|               null|                         null|Århus|Denmark|  57.05N|   10.33E|
|1744-03-01|               null|                         null|Århus|Denmark|  57.05N|   10.33E|
|1744-04-01| 5.7879999999999985|           3.6239999999999997|Århus|Denmark|  57.05N|   10.33E|
|1744-05-01|             10.644|           1.2830000000000001|Århus|Denmark|  57.05N|   10.33E|
|1744-06-01| 14.050999999999998|        

## Cleaning Weather Dataset

+ ~~Filter on latest Year and Month~~ 
+ ~~Dropping Nulls in Temperature~~ 
+ ~~Checking for Duplicates~~
+ ~~Group By Country~~
+ ~~Average per Country~~ 

In [7]:
# Derive calendar Month and Year columns from the 'dt' date string.
# The functions are called through the `f` alias rather than the bare
# `month`/`year` names, so this cell still works if it is re-run after the
# name `year` has been rebound elsewhere in the notebook (e.g. as a loop
# variable).
df = (
    df
    .withColumn('Month', f.month('dt'))
    .withColumn('Year', f.year('dt'))
)

In [8]:
# Row count per year, most recent year first, to see how far the data reaches.
year_counts = df.groupBy('Year').count()
year_counts.orderBy(col('Year').desc()).show()

+----+-----+
|Year|count|
+----+-----+
|2013|31590|
|2012|42120|
|2011|42120|
|2010|42120|
|2009|42120|
|2008|42120|
|2007|42120|
|2006|42120|
|2005|42120|
|2004|42120|
|2003|42120|
|2002|42120|
|2001|42120|
|2000|42120|
|1999|42120|
|1998|42120|
|1997|42120|
|1996|42120|
|1995|42120|
|1994|42120|
+----+-----+
only showing top 20 rows



In [9]:
# Most recent year present in the data.
# A single max() aggregation returns one row, instead of grouping, counting,
# sorting and collecting every year to the driver just to read the first one.
latest_year = df.agg(f.max('Year')).first()[0]

In [10]:
# The ten most recent years, counting back from (and including) latest_year.
# A comprehension with its own variable name is used on purpose: a plain
# `for year in ...` loop would rebind the module-level name `year`, which is
# the function imported from pyspark.sql.functions.
list_of_years = [latest_year - offset for offset in range(10)]

In [11]:
# Keep only the rows whose Year is one of the selected recent years.
is_recent_year = col('Year').isin(list_of_years)
df_latest = df.where(is_recent_year)

In [12]:
# Print the schema again to confirm the derived Month and Year columns exist.
df_latest.printSchema()

root
 |-- dt: string (nullable = true)
 |-- AverageTemperature: string (nullable = true)
 |-- AverageTemperatureUncertainty: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Year: integer (nullable = true)



In [22]:
# Restrict the recent-years data to rows whose Country is "United States".
is_united_states = col('Country') == "United States"
df_clean = df_latest.where(is_united_states)

In [23]:
# Drop rows without a temperature reading, then remove repeated copies of the
# same observation. 'dt' is part of the de-duplication key: de-duplicating on
# the city/location columns alone would keep a single arbitrary row per city
# and leave nothing meaningful to average afterwards.
df_clean = (
    df_clean
    .filter(col('AverageTemperature').isNotNull())
    .drop_duplicates(subset=['dt', 'City', 'Country', 'Latitude', 'Longitude'])
)

In [24]:
# Group by City and average the temperature and its uncertainty per city.
# The result columns keep Spark's default names, e.g. 'avg(AverageTemperature)'.
city_readings = df_clean.select(
    'City',
    'AverageTemperature',
    'AverageTemperatureUncertainty',
)
df_grp = city_readings.groupBy('City').agg(
    f.avg('AverageTemperature'),
    f.avg('AverageTemperatureUncertainty'),
)

In [26]:
 # Write the per-city averages locally as CSV with a header row, collapsed to a
 # single partition first.
 # NOTE(review): mode='append' adds to whatever is already under
 # 'processed/weather.csv', so re-running this cell adds more output each
 # time — confirm that is intended.
 single_partition = df_grp.repartition(1)
 single_partition.write.csv(path='processed/weather.csv', mode='append', header=True)