# Climate Change and CO2 Analysis
This notebook analyzes the relationship between CO2 levels and temperature changes across different geographical regions.

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from subprocess import check_output

## Load Datasets

In [None]:
# Read the CO2 measurements first, then the temperature tables at each
# geographic granularity, in the same order as before.
CO2_df = pd.read_csv('./data/archive.csv')
(
    temp_by_city,
    temp_by_country,
    temp_by_major_city,
    temp_by_state,
    global_temp,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/ClimateChange/GlobalLandTemperaturesByCity.csv',
        './data/ClimateChange/TemperaturesByCountry.csv',
        './data/ClimateChange/TemperaturesByMajorCity.csv',
        './data/ClimateChange/TemperaturesByState.csv',
        './data/ClimateChange/GlobalTemperatures.csv',
    )
)

## Explore CO2 Data

In [None]:
# Preview the first five rows of the CO2 table.
CO2_df.head(n=5)

## Explore Temperature Data

In [None]:
# Preview the first five rows of the per-city temperature table.
temp_by_city.head(n=5)

In [None]:
# Preview the first five rows of the per-country temperature table.
temp_by_country.head(n=5)

In [None]:
# Preview the first five rows of the major-city temperature table.
temp_by_major_city.head(n=5)

In [None]:
# Preview the first five rows of the per-state temperature table.
temp_by_state.head(n=5)

In [None]:
# Preview the first five rows of the global temperature table.
global_temp.head(n=5)

## Data Cleaning - Remove NaN Values

In [None]:
# Replace each table with a copy that has every row containing at least one
# missing value removed.
(
    CO2_df,
    temp_by_city,
    temp_by_country,
    temp_by_major_city,
    temp_by_state,
    global_temp,
) = (
    frame.dropna()
    for frame in (
        CO2_df,
        temp_by_city,
        temp_by_country,
        temp_by_major_city,
        temp_by_state,
        global_temp,
    )
)

## Helper Functions for Date Processing

In [None]:
def to_year(date):
    """Return the year of a ``-``-separated date string as an int.

    Args:
        date: Date string whose first ``-``-separated field is the year,
            e.g. ``"1850-01-01"``.

    Returns:
        int: The leading field converted with ``int``.

    Raises:
        ValueError: If the leading field is not an integer literal.
    """
    # The original wrapped this in a one-element loop; a direct return is
    # equivalent and easier to read.
    year_field, _, _ = date.partition('-')
    return int(year_field)

## Analyze United States Temperature Data

In [None]:
# Derive a numeric `year` column from the `dt` date strings.
temp_by_state['year'] = temp_by_state['dt'].apply(to_year)

# Keep only the rows recorded for the United States.
dfs = temp_by_state[temp_by_state['Country'] == 'United States']

# Unique years, in order of first appearance.
years = dfs['year'].unique()

# Build one row per year: the first record of each year (in order of
# appearance) with AverageTemperature replaced by that year's mean.
# The previous per-year loop relied on DataFrame.append (removed in
# pandas 2.0), assigned into a `.head(1)` slice (SettingWithCopyWarning) and
# rescanned the frame once per year; a single groupby mapped onto an explicit
# copy produces the same rows.
yearly_mean = dfs.groupby('year')['AverageTemperature'].mean()
dfa = dfs.drop_duplicates(subset='year', keep='first').copy()
dfa['AverageTemperature'] = dfa['year'].map(yearly_mean)

## Visualize Temperature Distributions

In [None]:
# Yearly averages of 9 degrees or more (the bound is inclusive), coloured by
# temperature.
df_nine = dfa.loc[dfa['AverageTemperature'].ge(9)]
df_nine.plot.scatter(
    x='year',
    y='AverageTemperature',
    c='AverageTemperature',
    cmap='coolwarm',
)

In [None]:
# Yearly averages of 9 degrees or less (the bound is inclusive, so a value of
# exactly 9 also appears in the ">= 9" plot), coloured by temperature.
df_nine = dfa.loc[dfa['AverageTemperature'].le(9)]
df_nine.plot.scatter(
    x='year',
    y='AverageTemperature',
    c='AverageTemperature',
    cmap='coolwarm',
)

## Process CO2 Data by Year

In [None]:
# Alias kept from the original notebook; it names the same object as CO2_df.
new_co2_df = CO2_df

# Collecting all of the unique years
years = CO2_df['Year'].unique()

# Build one row per year: the first record of each year (in order of
# appearance) with 'Carbon Dioxide (ppm)' replaced by that year's mean.
# The previous per-year loop relied on DataFrame.append (removed in
# pandas 2.0) and assigned into a `.head(1)` slice; a single groupby mapped
# onto an explicit copy produces the same rows.
yearly_co2 = CO2_df.groupby('Year')['Carbon Dioxide (ppm)'].mean()
dfc = CO2_df.drop_duplicates(subset='Year', keep='first').copy()
dfc['Carbon Dioxide (ppm)'] = dfc['Year'].map(yearly_co2)

# Changing the Year column to year (lowercase). `index=str` is kept from the
# original: it also converts the row labels to strings.
dfc = dfc.rename(index=str, columns={"Year": "year"})

# Dropping all of the unwanted columns, then any row with a missing value.
dfc = dfc.drop(
    columns=[
        'Seasonally Adjusted CO2 (ppm)',
        'Carbon Dioxide Fit (ppm)',
        'Seasonally Adjusted CO2 Fit (ppm)',
        'Decimal Date',
        'Month',
    ],
).dropna()

## Visualize CO2 Trends

In [None]:
# Regression plot of the yearly CO2 value against the year.
sns.lmplot(
    data=dfc,
    x='year',
    y='Carbon Dioxide (ppm)',
)

## Correlation Analysis

In [None]:
# Correlation heatmap of the yearly CO2 table. Only numeric columns are
# correlated explicitly, matching the other heatmap cells: pandas >= 2.0
# raises on string columns instead of silently skipping them.
sns.heatmap(dfc.select_dtypes(include='number').corr())

In [None]:
# Correlation heatmap of the global temperature table. Select numeric
# columns first: pandas >= 2.0 raises on string columns (such as a date
# column read as text) instead of silently skipping them.
sns.heatmap(global_temp.select_dtypes(include='number').corr())

In [None]:
# Correlation heatmap of the per-state table. Select numeric columns first:
# the frame carries string columns ('dt', 'Country'), and pandas >= 2.0
# raises on them instead of silently skipping them.
sns.heatmap(temp_by_state.select_dtypes(include='number').corr())

In [None]:
# Correlation heatmap of the major-city table. Select numeric columns first:
# pandas >= 2.0 raises on string columns instead of silently skipping them.
sns.heatmap(temp_by_major_city.select_dtypes(include='number').corr())

In [None]:
# Correlation heatmap of the per-country table. Select numeric columns
# first: the frame carries string columns ('dt', 'Country'), and
# pandas >= 2.0 raises on them instead of silently skipping them.
sns.heatmap(temp_by_country.select_dtypes(include='number').corr())

## Merge Temperature and CO2 Data

In [None]:
# Give the CO2 table the same lowercase `year` key that dfa uses. This
# mutates CO2_df in place, and `index=str` also converts its row labels to
# strings.
CO2_df.rename(columns={"Year": "year"}, index=str, inplace=True)

# Join the yearly temperature rows with the CO2 rows on `year`, discard rows
# with missing values, then drop the CO2 columns that are not plotted.
dfsc = (
    pd.merge(dfa, CO2_df, on=['year'])
    .dropna()
    .drop(
        [
            'Seasonally Adjusted CO2 (ppm)',
            'Carbon Dioxide Fit (ppm)',
            'Seasonally Adjusted CO2 Fit (ppm)',
            'Decimal Date',
            'Month',
        ],
        axis=1,
    )
)

In [None]:
# Regression plot of CO2 concentration against the average temperature.
sns.lmplot(
    data=dfsc,
    x='AverageTemperature',
    y='Carbon Dioxide (ppm)',
)

## Interactive Visualizations with Plotly

In [None]:
import plotly.offline as py
import plotly.graph_objs as go
import datetime as dt

In [None]:
# Mean CO2 concentration per year. The column is selected before
# aggregating so only the plotted series is averaged (the original averaged
# every column and then kept one).
grp1 = CO2_df.groupby(["year"])["Carbon Dioxide (ppm)"].mean()
trace1 = go.Bar(x=grp1.index, y=grp1.values)
layout = go.Layout(
    # The bars are one-per-year averages, so the title says "per year"
    # (it previously read "per month").
    title="Average CO<sub>2</sub> Levels in Atmosphere per year",
    yaxis=dict(title="Parts per million (PPM)", range=(300, 420)),
    xaxis=dict(title="Year"))
figure = go.Figure(data=[trace1], layout=layout)
py.iplot(figure, filename="co2-ppm-year")

## Seasonal CO2 Fluctuations

In [None]:
# Mean CO2 concentration per (year, Month). The column is selected before
# aggregating so only the needed series is averaged (the original averaged
# every column and then kept one).
group2 = CO2_df.groupby(["year", "Month"])["Carbon Dioxide (ppm)"].mean()

# One timestamp per group, pinned to the 15th of the month.
x = [dt.datetime(year=year, month=month, day=15) for year, month in group2.index]

# Mean values.
y1 = group2.values

# Rolling window average over 3 consecutive entries.
y2 = group2.rolling(3, min_periods=1).mean().values

# Exponentially weighted moving average with span 3.
y3 = group2.ewm(span=3, min_periods=1).mean().values

In [None]:
# Three views of the monthly CO2 series: the raw points plus two smoothed
# lines.
second_trace = go.Scatter(name="Actual value", mode="markers", x=x, y=y1)
third_trace = go.Scatter(name="Rolling average", line={"color": "red"}, x=x, y=y2)
forth_trace = go.Scatter(name="EWM average", line={"color": "green"}, x=x, y=y3)

# Initial axis ranges shown when the figure opens.
default_period = (dt.datetime(2008, 1, 1), dt.datetime(2017, 12, 1))
default_ppm_range = (380, 410)
layout = go.Layout(
    title="Seasonal fluctations of CO<sub>2</sub> levels in atmosphere",
    xaxis={"title": "Year", "range": default_period},
    yaxis={"title": "Parts per million (PPM)", "range": default_ppm_range},
)

figure = go.Figure(
    layout=layout,
    data=[second_trace, third_trace, forth_trace],
)
py.iplot(figure, filename="co2-ppm-seasonal")

## Machine Learning: CO2 Prediction Model

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [None]:
# Features per observation: year, month, and their squares.
x_val = [(stamp.year, stamp.month, stamp.month ** 2, stamp.year ** 2) for stamp in x]
y_val = list(y1)

# Hold out 40% of the observations for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x_val, y_val, test_size=0.40, random_state=45)
linearModel = linear_model.LinearRegression().fit(x_train, y_train)

# LinearRegression.score returns the coefficient of determination (R^2),
# not a classification accuracy, so label it accordingly.
print("R^2 score: ", linearModel.score(x_test, y_test))

# predicted values
pred_value = linearModel.predict(x_val)

## Future CO2 Predictions

In [None]:
# Timeline to predict over: every month of every year in the range.
predicted_years = range(1950, 2055)
predicted_months = range(1, 13)

# Same feature layout the model was fitted on: year, month, month^2, year^2.
predicted_x = [
    [year, month, month ** 2, year ** 2]
    for year in predicted_years
    for month in predicted_months
]

# Predict values
predicted_y = linearModel.predict(predicted_x)

# Timestamps for the predictions, pinned to the 15th of each month.
x_plot = [dt.datetime(row[0], row[1], 15) for row in predicted_x]
fifth_trace = go.Scatter(
    name="Predicted value",
    line={"color": "red"},
    x=x_plot,
    y=predicted_y,
)

# Initial axis ranges shown when the figure opens.
period_default = (dt.datetime(1956, 1, 1), dt.datetime(2050, 12, 1))
ppm_range_def = (300, 500)
layout = go.Layout(
    title="Predicted Vs. Actual CO<sub>2</sub> Concentration levels",
    xaxis={"title": "Year", "range": period_default},
    yaxis={"title": "Parts per million (PPM)", "range": ppm_range_def},
)
figure = go.Figure(layout=layout, data=[second_trace, fifth_trace])
py.iplot(figure, filename="co2-ppm-prediction")

## Statistical Analysis: Temperature Extremes

In [None]:
# Row of the state table holding the maximum AverageTemperature.
hottest_state_label = temp_by_state['AverageTemperature'].idxmax()
temp_by_state.loc[hottest_state_label]

In [None]:
# Row of the country table holding the maximum AverageTemperature.
# (Original author's note: Kuwait, Western Asia - re-verify against the data.)
hottest_country_label = temp_by_country['AverageTemperature'].idxmax()
temp_by_country.loc[hottest_country_label]

In [None]:
# Row of the state table holding the maximum AverageTemperatureUncertainty.
most_uncertain_label = temp_by_state['AverageTemperatureUncertainty'].idxmax()
temp_by_state.loc[most_uncertain_label]

## Additional Date Processing Functions

In [None]:
def mod_year(date):
    """Return the year of a ``-``-separated date string as an int.

    Args:
        date: Date string whose first ``-``-separated field is the year,
            e.g. ``"1850-01-01"``.

    Returns:
        int: The leading field converted with ``int``.

    Raises:
        ValueError: If the leading field is not an integer literal.
    """
    # The original wrapped this in a one-element loop; a direct return is
    # equivalent and easier to read.
    year_field, _, _ = date.partition('-')
    return int(year_field)

def mod_month(date):
    """Return the month of a ``-``-separated date string as an int.

    Args:
        date: Date string whose second ``-``-separated field is the month,
            e.g. ``"1850-07-01"``.

    Returns:
        int: The second field converted with ``int``.

    Raises:
        IndexError: If the string contains no ``-`` separator.
        ValueError: If the second field is not an integer literal.
    """
    # The original wrapped this in a one-element loop; a direct return is
    # equivalent and easier to read.
    return int(date.split('-')[1])

## Country-Level Analysis

In [None]:
# Split the `dt` date strings into numeric `year` and `month` columns.
temp_by_country['year'] = temp_by_country['dt'].map(mod_year)
temp_by_country['month'] = temp_by_country['dt'].map(mod_month)

# Inner-join the country temperatures with the CO2 table on `year`.
# NOTE(review): the join key is `year` only, so each temperature row is
# paired with every CO2 row of that year - confirm this is intended rather
# than also matching on month.
country_new_temp_data = temp_by_country.merge(CO2_df, on=['year'])

In [None]:
# Mean CO2 concentration per (year, month) of the merged country table.
# The column is selected before aggregating: the merged frame carries string
# columns ('dt', 'Country'), and averaging every column raises on
# pandas >= 2.0.
country_carbon = country_new_temp_data.groupby(["year", "month"])["Carbon Dioxide (ppm)"].mean()

# One timestamp per group, pinned to the 15th of the month.
xx = [dt.datetime(year=year, month=month, day=15) for year, month in country_carbon.index]

# Mean values.
yy1 = country_carbon.values

# Rolling window average over 3 consecutive entries.
yy2 = country_carbon.rolling(3, min_periods=1).mean().values

# Exponentially weighted moving average with span 3.
yy3 = country_carbon.ewm(span=3, min_periods=1).mean().values

In [None]:
# Three views of the country-level CO2 series: the raw points plus two
# smoothed lines.
second_country_trace = go.Scatter(
    name="Actual value", mode="markers", x=xx, y=yy1)
third_country_trace = go.Scatter(
    name="Rolling average", line={"color": "red"}, x=xx, y=yy2)
forth_country_trace = go.Scatter(
    name="EWM average", line={"color": "green"}, x=xx, y=yy3)

## Model Training for Country Data

In [None]:
# Features per observation: year, month, and their squares.
# Built from the country-level series (`xx`, `yy1`); the original reused `x`
# and `y1` from the earlier CO2 cell, which retrained the same model on the
# same data.
x_values = [(stamp.year, stamp.month, stamp.month ** 2, stamp.year ** 2) for stamp in xx]
y_values = list(yy1)

# Hold out 40% of the observations for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x_values, y_values, test_size=0.40, random_state=45)
linearModel = linear_model.LinearRegression().fit(x_train, y_train)

# LinearRegression.score returns the coefficient of determination (R^2),
# not a classification accuracy, so label it accordingly.
print("R^2 score: ", linearModel.score(x_test, y_test))

# predicted values
pred_value = linearModel.predict(x_values)

## Sort Countries by CO2 Levels

In [None]:
def _by_co2(group):
    """Return one country's rows ordered by ascending CO2 value."""
    return group.sort_values(["Carbon Dioxide (ppm)"])


# Sort the rows within each country by CO2, then flatten back to a plain
# 0..n-1 index and preview the result.
df1 = country_new_temp_data.groupby(['Country'])
df2 = df1.apply(_by_co2)
df3 = df2.reset_index(drop=True)
df3.head()

## Final Prediction Visualization

In [None]:
# Same feature layout the model was fitted on (year, month, month^2,
# year^2), for every month of every year in the prediction timeline.
new_predicted_x = [
    [year, month, month ** 2, year ** 2]
    for year in predicted_years
    for month in predicted_months
]

# Predict values
new_predicted_y = linearModel.predict(new_predicted_x)

# Timestamps for the predictions, pinned to the 15th of each month.
new_x_plot = [dt.datetime(row[0], row[1], 15) for row in new_predicted_x]
fifth_new_trace = go.Scatter(
    name="Predicted value",
    line={"color": "red"},
    x=new_x_plot,
    y=new_predicted_y,
)

layout = go.Layout(
    title="Predicted Vs. Actual CO<sub>2</sub> Concentration levels",
    xaxis={"title": "Year", "range": period_default},
    yaxis={"title": "Parts per million (PPM)", "range": ppm_range_def},
)
figure = go.Figure(layout=layout, data=[second_country_trace, fifth_new_trace])
py.iplot(figure, filename="co2-ppm-prediction")