# Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from google.colab import drive
import warnings
warnings.filterwarnings("ignore")

drive.mount('/content/drive')
plt.style.use('ggplot')

# Read CSV

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Project_Kelompok_3/co2_emission.csv')

In [None]:
# Share of agrifood CO2 in 2020: keep only the 2020 rows and total their emissions.
y_2020 = df.loc[df["Year"]==2020]
kt_2020 =y_2020["total_emission"].sum()
# Divide by 1,000,000 to go from kilotonnes to gigatonnes
# (assumes total_emission is expressed in kilotonnes, as the kt_ prefix suggests - TODO confirm).
gt_2020 = round(kt_2020 / 1_000_000,2) # the amount of gigatones of CO2
# NOTE(review): 30 is a hard-coded denominator; presumably the total global CO2
# emissions in gigatonnes - confirm the figure and its source.
perc = gt_2020 / 30
print(f"The amount of CO2 from agrifood in 2020 is {gt_2020} gigatones (gt), that is:",round(perc * 100), "%","of total emissions!")

# EDA

In [None]:
def normalizer(df):
    """Min-max scale every column of ``df`` into the [0, 1] range.

    Each column is shifted by its own minimum and divided by its own range
    (``max - min``), so the smallest value maps to 0 and the largest to 1.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Numeric data to rescale.

    Returns
    -------
    Same type as ``df``
        The rescaled values. A constant column has a zero range and therefore
        yields NaN (0 / 0).
    """
    col_min = df.min()
    col_range = df.max() - col_min
    # Subtract the minimum (not the maximum): subtracting the maximum would
    # place every value in [-1, 0] instead of [0, 1].
    return (df - col_min) / col_range

# Yearly means of temperature, total emission and urban population.
temp_emission = df.groupby("Year").agg({"Average Temperature °C": "mean", "total_emission": "mean", "Urban population": "mean"})
# Rescale the three series with normalizer() so they can share one y-axis.
norm_temp = normalizer(temp_emission)


# One line per aggregated column, indexed by year.
# NOTE(review): the title names only CO2 and temperature, but the urban
# population series is plotted as well.
norm_temp.plot(figsize=(20, 6))
plt.title("CO2 Emission & Temperature")
plt.show()

In [None]:
def plot_co2_trend(nation):
    """Draw the ``total_emission`` time series of one country as a green line.

    Reads the module-level ``df``, keeps the rows whose ``Area`` equals
    ``nation``, indexes them by ``Year`` and shows the figure.
    """
    trend = df.loc[df.Area == nation].set_index("Year")["total_emission"]
    plt.figure(figsize=(12, 6))
    trend.plot(kind="line", color="green")
    plt.title(f"{nation} CO2 trend")
    plt.show()

# Box plot of the temperature column, one box per year, coloured by year
# using the Viridis sequence.
fig = px.box(df, x="Year",
             y="Average Temperature °C",
             color="Year",
             color_discrete_sequence=px.colors.sequential.Viridis,
             title='<b>Average temperature distribution by years')
fig.show()

# Feature Engineering

In [None]:
df["pop_tot"] = df["Total Population - Male"] + df["Total Population - Female"]

continent_mapping = {
    'Africa': ['Algeria', 'Angola', 'Benin', 'Botswana', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cameroon', 'Central African Republic', 'Chad', 'Comoros', 'Congo', 'Côte d\'Ivoire', 'Djibouti', 'Egypt', 'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia', 'Gabon', 'Gambia', 'Ghana', 'Guinea', 'Guinea-Bissau', 'Kenya', 'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mauritius', 'Morocco', 'Mozambique', 'Namibia', 'Niger', 'Nigeria', 'Rwanda', 'São Tomé and Príncipe', 'Senegal', 'Seychelles', 'Sierra Leone', 'Somalia', 'South Africa', 'South Sudan', 'Sudan', 'Tanzania', 'Togo', 'Tunisia', 'Uganda', 'Zambia', 'Zimbabwe'],
    'Asia': ['Afghanistan', 'Armenia', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Bhutan', 'Brunei', 'Cambodia', 'China', 'Cyprus', 'Georgia', 'India', 'Indonesia', 'Iran', 'Iraq', 'Israel', 'Japan', 'Jordan', 'Kazakhstan', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Lebanon', 'Malaysia', 'Maldives', 'Mongolia', 'Myanmar', 'Nepal', 'North Korea', 'Oman', 'Pakistan', 'Palestine', 'Philippines', 'Qatar', 'Russia', 'Saudi Arabia', 'Singapore', 'South Korea', 'Sri Lanka', 'Syria', 'Taiwan', 'Tajikistan', 'Thailand', 'Timor-Leste', 'Turkey', 'Turkmenistan', 'United Arab Emirates', 'Uzbekistan', 'Vietnam', 'Yemen'],
    'Europe': ['Albania', 'Andorra', 'Austria', 'Belarus', 'Belgium', 'Bosnia and Herzegovina', 'Bulgaria', 'Croatia', 'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 'Latvia', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Malta', 'Moldova', 'Monaco', 'Montenegro', 'Netherlands', 'North Macedonia', 'Norway', 'Poland', 'Portugal', 'Romania', 'San Marino', 'Serbia', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Ukraine', 'United Kingdom', 'Vatican City'],
    'North America': ['Antigua and Barbuda', 'Bahamas', 'Barbados', 'Belize', 'Canada', 'Costa Rica', 'Cuba', 'Dominica', 'Dominican Republic', 'El Salvador', 'Grenada', 'Guatemala', 'Haiti', 'Honduras', 'Jamaica', 'Mexico', 'Nicaragua', 'Panama', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Vincent and the Grenadines', 'Trinidad and Tobago', 'United States'],
    'Oceania': ['Australia', 'Fiji', 'Kiribati', 'Marshall Islands', 'Micronesia', 'Nauru', 'New Zealand', 'Palau', 'Papua New Guinea', 'Samoa', 'Solomon Islands', 'Tonga', 'Tuvalu', 'Vanuatu'],
    'South America': ['Argentina', 'Bolivia', 'Brazil', 'Chile', 'Colombia', 'Ecuador', 'Guyana', 'Paraguay', 'Peru', 'Suriname', 'Uruguay', 'Venezuela']
}

# Map a country name to its continent label using continent_mapping.
def assign_continent(country):
    """Return the first continent whose country list contains ``country``.

    Continents are checked in the iteration order of ``continent_mapping``.
    Returns ``None`` when the country appears in none of the lists.
    """
    matches = (
        continent
        for continent, countries in continent_mapping.items()
        if country in countries
    )
    return next(matches, None)

df["continent"] = df["Area"].apply(assign_continent)

In [None]:
# Bubble chart: temperature on x, total emission on y, bubble size taken from
# pop_tot and colour from the continent label assigned above.
px.scatter(df, df["Average Temperature °C"],
           df["total_emission"],
           size= "pop_tot",
           title = "<b>CO2 Emission & Temperature - population",
           template="plotly_dark",
           color ="continent")

In [None]:
# Collapse to one row per year: summed emissions and population, mean temperature.
correlation = df.groupby(["Year"]).agg({"total_emission":"sum", "Average Temperature °C":"mean", "pop_tot":"sum"})
# Pairwise correlation matrix of the three yearly aggregates (displayed as the cell output).
correlation.corr()

In [None]:
# Year-level bubble chart: mean temperature per year, bubble size from the
# summed population and colour from the summed emissions.
# The title opens with the bold tag "<b>", like the other plotly titles in this
# notebook; the previous "<br>" only inserted an empty line above the title.
px.scatter(correlation.reset_index(),
    x = "Year",
    y = "Average Temperature °C",
    size = "pop_tot",
    color = "total_emission",
    title = "<b> Temperature & CO2 Emissions - global relation",
    template="plotly_dark")

In [None]:
!pip install pycountry
import pycountry

In [None]:
def country_emission(df,year, length = 10):
    """Horizontal bar chart of the ``length`` highest-emitting areas in ``year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Year``, ``Area`` and ``total_emission``.
        The frame is only read, never modified.
    year : int
        Value of ``Year`` to plot.
    length : int, default 10
        Number of areas to show.
    """
    top = (
        df.loc[df["Year"] == year]
        # sort_values places NaN last by default, so without this drop the
        # rows with a missing emission would be the ones picked by tail().
        .dropna(subset=["total_emission"])
        .sort_values(by="total_emission", ascending=True)
        .tail(length)
    )
    # plt.get_cmap is the supported accessor; matplotlib.cm.get_cmap
    # (reached as plt.cm.get_cmap) was deprecated in 3.7 and removed in 3.9.
    colors = plt.get_cmap('plasma', len(top))
    plt.figure(figsize=(10, 5))
    plt.barh(top['Area'],
             top['total_emission'],
             color=colors(range(len(top))))
    plt.title(f'CO2 Emission by top {length} country in {year}')
    plt.xlabel('CO2 Emission in kilotones')

    plt.show()

country_emission(df, year=2020)

In [None]:
df["per_capita_emission_kt"] = df["total_emission"] / df["pop_tot"]

def percapita_emission(df,year, length = 30):
    """Horizontal bar chart of the ``length`` highest per-capita emitters in ``year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Year``, ``Area``, ``pop_tot`` and
        ``per_capita_emission_kt``. The frame is only read, never modified.
    year : int
        Value of ``Year`` to plot.
    length : int, default 30
        Number of areas to show.
    """
    # Remove small islands / small countries with this population filter.
    populous = df.loc[(df["Year"] == year) & (df["pop_tot"] > 800000)]
    top = (
        populous
        # sort_values places NaN last by default, so without this drop the
        # rows with a missing ratio would be the ones picked by tail().
        .dropna(subset=["per_capita_emission_kt"])
        .sort_values(by="per_capita_emission_kt", ascending=True)
        .tail(length)
    )
    # plt.get_cmap is the supported accessor; matplotlib.cm.get_cmap
    # (reached as plt.cm.get_cmap) was deprecated in 3.7 and removed in 3.9.
    colors = plt.get_cmap('viridis', len(top))
    plt.figure(figsize=(10, 5))
    plt.barh(top['Area'],
             top['per_capita_emission_kt'],
             color=colors(range(len(top))))
    plt.title(f'CO2 per capita Emission by top {length} country in {year}')
    plt.show()

percapita_emission(df, year=2020, length=10)

In [None]:
def continental_emission(year =2020):
    """Plot total and per-capita CO2 emissions per continent for one year.

    Reads the module-level ``df``. Rows are kept when ``pop_tot`` exceeds
    500000 and ``Year`` equals ``year``; they are then grouped by
    ``continent`` (``groupby`` drops rows whose continent is missing) and
    aggregated as: sum of ``total_emission``, median of the temperature
    column and mean of ``per_capita_emission_kt``.

    The left panel shows total emissions, the right panel the per-capita
    mean; each panel is sorted in descending order of its own bar value and
    carries the median temperature as a green line on a secondary y-axis.

    Parameters
    ----------
    year : int, default 2020
        Value of ``Year`` to plot.
    """
    selected = df.loc[(df.pop_tot > 500000) & (df.Year == year)]
    continent_df = (
        selected.groupby("continent")
        .agg({"total_emission": "sum",
              "Average Temperature °C": "median",
              "per_capita_emission_kt": "mean"})
        .reset_index()
    )

    fig, ax = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: continents ordered by total emission.
    by_total = continent_df.sort_values(by="total_emission", ascending=False)
    ax[0].bar(by_total["continent"], by_total.total_emission, color="blue")
    ax[0].set_title(f"Total CO2 emissions in {year}")

    ax2 = ax[0].twinx()
    ax2.plot(by_total["continent"], by_total["Average Temperature °C"], color='green', marker='o')
    ax2.legend(["increasing avg temperature C°"], loc='upper right')

    # Right panel: continents ordered by per-capita emission.
    by_capita = continent_df.sort_values(by="per_capita_emission_kt", ascending=False)
    ax[1].bar(by_capita["continent"], by_capita["per_capita_emission_kt"])
    ax[1].set_title(f"Total CO2 per capita emissions in {year}")
    ax3 = ax[1].twinx()
    ax3.plot(by_capita["continent"], by_capita["Average Temperature °C"], color='green', marker='o')
    ax3.legend(["increasing avg temperature C°"], loc='upper right')

    # Rotate the continent names through the tick parameters; this leaves the
    # tick labels themselves untouched instead of re-setting them by hand.
    for axis in ax:
        axis.tick_params(axis="x", labelrotation=90)

    plt.tight_layout()
    plt.show()

continental_emission(year =2020)

# Preprocessing

## Missing Value

In [None]:
df

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor

In [None]:
df.isna().sum()

In [None]:
def regressor_imputer(df, feature, max_depth = 6, random_state = 42):
    """Fill the missing values of one column with random-forest predictions.

    The rows where ``feature`` is present are used to train a
    ``RandomForestRegressor`` on all the other columns; the fitted model then
    predicts ``feature`` for the rows where it is missing. Missing values in
    the predictor columns are filled by a default ``SimpleImputer`` that is
    fitted on the training rows only.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified. Every column other than
        ``feature`` is passed to the imputer and the forest.
    feature : str
        Name of the column to impute.
    max_depth : int, default 6
        Maximum depth of the trees in the forest.
    random_state : int or None, default 42
        Seed for the forest so that repeated runs impute the same values.
        Pass ``None`` for an unseeded forest.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the missing entries of ``feature`` are
        replaced by predictions. Returned unchanged (but still a copy) when
        ``feature`` has no missing value.
    """
    df_filled = df.copy()

    # Compute the mask once; it selects the training rows, the rows to
    # predict, and the cells to write back.
    missing_mask = df_filled[feature].isna()
    if not missing_mask.any():
        return df_filled

    predictors = df_filled.drop(columns=[feature])
    X_train = predictors.loc[~missing_mask]
    y_train = df_filled.loc[~missing_mask, feature]
    X_missing = predictors.loc[missing_mask]

    # Fit the imputer on the training rows only, then reuse it for the rows
    # to predict so both matrices are filled with the same statistics.
    imputer = SimpleImputer()
    X_train_imputed = imputer.fit_transform(X_train)
    X_missing_imputed = imputer.transform(X_missing)

    rf = RandomForestRegressor(max_depth=max_depth, random_state=random_state)
    rf.fit(X_train_imputed, y_train)

    df_filled.loc[missing_mask, feature] = rf.predict(X_missing_imputed)

    return df_filled

# Split the column names by dtype: int64/float64 columns are numeric,
# object columns are categorical.
num_features = [col for col in df.columns if df[col].dtypes in ["int64", "float64"]]
cat_features = [col for col in df.columns if df[col].dtypes in ["object"]]

# Build the list of numeric features that contain missing values; these are
# the columns that will be filled by regression.
missing_values = df[num_features].isna().sum()
missing_list=missing_values[missing_values > 0].keys().tolist()

In [None]:
numeric_only = df[num_features ]
def replace_missing(df, missing_list):
    """Impute every column named in ``missing_list``, one after the other.

    Starts from a copy of ``df`` and feeds the result of each
    ``regressor_imputer`` call into the next one, so later columns are
    imputed using the already-filled earlier columns. Progress is shown with
    a tqdm bar.

    Returns the fully processed copy.
    """
    filled = df.copy()
    progress = tqdm(missing_list)
    for column in progress:
        filled = regressor_imputer(filled, column)
    return filled

num_df = replace_missing(numeric_only, missing_list)

In [None]:
statistik = df['Average Temperature °C'].describe()

Q1 = statistik['25%']
median = statistik['50%']
Q3 = statistik['75%']

low_threshold = Q1
high_threshold = Q3

def categorize_value(value):
    """Bin a value into 'Rendah', 'Sedang' or 'Tinggi'.

    Uses the module-level ``low_threshold`` and ``high_threshold``:
    values up to ``low_threshold`` are 'Rendah', values up to
    ``high_threshold`` are 'Sedang', everything else is 'Tinggi'.
    A NaN compares False against both thresholds and therefore also ends up
    as 'Tinggi'.
    """
    if value <= low_threshold:
        return 'Rendah'
    return 'Sedang' if value <= high_threshold else 'Tinggi'

df['Outcome'] = df['Average Temperature °C'].apply(categorize_value)
df

In [None]:
# Label-encode every categorical column into integer codes.
cat_df = df[cat_features].copy()

label_encoder = LabelEncoder()
for column in cat_df.columns:
    # astype(str): assign_continent() returns None for unmapped countries, and
    # LabelEncoder cannot sort a column that mixes None with strings.
    # Plain column assignment (rather than .loc[:, column] = ...) replaces the
    # column, so the result is stored with an integer dtype instead of being
    # written into the existing object-dtype column.
    cat_df[column] = label_encoder.fit_transform(cat_df[column].astype(str))

In [None]:
# Column-wise concatenation: raw Area names, encoded categoricals, imputed
# numeric features and the Outcome label (rows are aligned on the index).
# NOTE(review): cat_df is built from the object-dtype columns, which presumably
# include 'Area'; if so, final_df carries two columns labelled 'Area' (raw and
# encoded) - confirm and consider renaming one of them.
final_df = pd.concat([df['Area'], cat_df, num_df, df['Outcome']], axis =1)

In [None]:
# Write the final frame to a CSV file and trigger a browser download (Colab only).
nama_file = 'data_final.csv'
final_df.to_csv(nama_file, index=False)  # index=False so the DataFrame index is not written to the CSV file
from google.colab import files
files.download(nama_file)

In [None]:
final_df.head()

In [None]:
final_df.isna().sum()

# Evaluasi Model

## Klasifikasi

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Candidate classifiers keyed by display name. Every estimator is created with
# its default hyper-parameters; this cell only instantiates them, nothing is
# fitted here.
models = {
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Neural Network': MLPClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    "XGBoost":XGBClassifier(),
}

In [None]:
label_encoder = LabelEncoder()
final_df['Outcome Encode'] = label_encoder.fit_transform(df['Outcome'])

In [None]:
   X = final_df.drop(columns =["Average Temperature °C","Outcome Encode"])
y = final_df["Outcome Encode"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=56 )

In [None]:
# Tune an XGBoost classifier with an exhaustive grid search (3-fold CV, accuracy).
model = XGBClassifier()

# Hyper-parameters to tune
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 200]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Parameter terbaik: {best_params}")

# GridSearchCV uses refit=True by default, so it has already retrained the
# best parameter combination on the whole of X_train; reuse that estimator
# instead of building and fitting the same model a second time.
best_model = grid_search.best_estimator_

predictions = best_model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print(f"Akurasi model setelah tuning: {accuracy}")

In [None]:
    # Train and score every candidate classifier in `models`.
    # The loop header and the fit() call are required: without them this cell
    # predicts with an estimator that was never fitted and prints an undefined
    # `name`.
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        # labels=np.unique(y_pred): the weighted averages only cover the
        # classes that were actually predicted.
        predicted_labels = np.unique(y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', labels=predicted_labels)
        recall = recall_score(y_test, y_pred, average='weighted', labels=predicted_labels)
        f1 = f1_score(y_test, y_pred, average='weighted', labels=predicted_labels)

        metrics_df = pd.DataFrame({'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
                                   'Score': [accuracy, precision, recall, f1]})

        print(f"{name}:\n {metrics_df}\n")

In [None]:
from sklearn.metrics import confusion_matrix


# One confusion-matrix heatmap per classifier in `models`.
# NOTE(review): this loop only predicts; every estimator in `models` must
# already have been fitted on the training data.

# Take the class names from the encoder that produced the target, so the tick
# labels always follow the encoded order (0, 1, 2, ...) instead of relying on
# a hard-coded list.
class_names = list(label_encoder.classes_)
class_ids = list(range(len(class_names)))

# The font scale is a global seaborn setting; set it once, not per figure.
sns.set(font_scale=1.2)

for name, model in models.items():
    y_pred = model.predict(X_test)
    # labels=class_ids keeps one row/column per class even when a class is
    # absent from both y_test and y_pred, so the matrix always matches the
    # tick labels.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
                xticklabels=class_names,
                yticklabels=class_names)
    plt.xlabel('Prediksi')
    plt.ylabel('Aktual')
    plt.title('Confusion Matrix ' + name)
    plt.show()


In [None]:
df.shape

## Feature Importance

In [None]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split

X = final_df.drop(columns=['continent', 'Area','Year','Average Temperature °C', 'Fires in humid tropical forests', 'Fertilizers Manufacturing','Manure left on Pasture', 'Manure applied to Soils', 'Total Population - Male', 'Total Population - Female','Rural population','Urban population','per_capita_emission_kt','Outcome','Outcome Encode'])  # Features: every column except the ones listed here (identifiers, the temperature the target was binned from, the target columns and a hand-picked set of other features)
y = final_df['Outcome Encode']  # Target column

# Split the dataset into training and test data
# (this overwrites the X/y split variables created for the classification section)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialise the XGBoost model
model = xgb.XGBClassifier()

# Train the model on the training data
model.fit(X_train, y_train)

# Compute the feature importance, most important feature first
feature_importance = pd.DataFrame({'Feature': X_train.columns, 'Importance': model.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Show the feature importance
print(feature_importance)


In [None]:
# Keep the five highest-ranked rows of the importance table.
top_5_features = feature_importance.head(5)

# Horizontal bars, drawn through the explicit Axes interface.
importance_fig, importance_ax = plt.subplots(figsize=(8, 6))
importance_ax.barh(top_5_features['Feature'], top_5_features['Importance'], color='skyblue')
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Top 5 Feature Importance')
# Flip the y-axis so the most important feature sits at the top.
importance_ax.invert_yaxis()
plt.show()
