In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Dataset: read the CSV once into `data`; the later cells all work from this frame
DATASET_CSV = "/home/benyamain/Desktop/lulc-air-quality/dataset/california-eleven-month-mol-m2.csv"
data = pd.read_csv(DATASET_CSV)

In [None]:
# Just to understand the dist. more: cut the NO2 column into two quantile bins
# and print how many rows fall into each one (bins kept in their natural order).
no2_values = data['NO2_column_number_density']
bins = pd.qcut(no2_values, q=2)
print(bins.value_counts(sort=False))

# Hist. of the same column with two bins, drawn on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.hist(no2_values, bins=2)

ax.set_xlabel('NO2_column_number_density')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of NO2_column_number_density')

# Save before show so the written file contains the rendered figure
fig.savefig('/home/benyamain/Desktop/lulc-air-quality/figures/discretization-equal-frequency.png')
plt.show()


In [None]:
# Discretize the NO2 column using the equal-frequency (quantile) method:
# pd.qcut with q=2 cuts at the median, so each label gets roughly half of the rows.
# The big caveat is that the density levels are low and high RESPECTIVE to the dist. of the dataset
# The column is overwritten in place, so guard on its dtype: re-running this cell
# must not try to bin the string labels that a previous run already assigned.
if not isinstance(data['NO2_column_number_density'].dtype, pd.CategoricalDtype):
    data['NO2_column_number_density'] = pd.qcut(
        x=data['NO2_column_number_density'],
        q=2,
        labels=["Low NO2 Levels", "High NO2 Levels"],
    )
print(data['NO2_column_number_density'].head(100))

In [None]:
import seaborn as sns

# Draw a reproducible sample of up to 50 rows. The seed matches the random_state=1
# used by the train/test splits in this notebook; min() keeps the cell from raising
# when the dataset holds fewer than 50 rows.
sample = data.sample(n=min(50, len(data)), random_state=1)

# Use the NO2 column as the row index so it labels the heatmap rows instead of
# being drawn as a column; plain assignment instead of inplace mutation.
sample = sample.set_index('NO2_column_number_density')

# Create a heatmap
plt.figure(figsize=(20, 20))
sns.heatmap(sample, annot=True, cmap='coolwarm', linewidths=.5)
plt.title('Relationship between LULC and NO2 Levels')
# Save before show so the written file contains the rendered figure
plt.savefig('/home/benyamain/Desktop/lulc-air-quality/figures/heatmap.png')
plt.show()


In [None]:
# Input attributes: every column of `data` except the NO2 target
target_name = 'NO2_column_number_density'
feature_names = [column for column in data.columns if column != target_name]
X = data[feature_names]
print("Input Attributes: ", X)
# Training labels: the NO2 target column on its own
y = data[target_name]
print("Training Labels: ", y)

In [None]:
# Imports
from sklearnex import patch_sklearn, config_context
patch_sklearn()

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split data into train and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Based on the dist. of the histogram
# NOTE(review): these weights are defined but never passed to the classifier
# (class_weight=None below), so they currently have no effect on training.
class_weights_custom = {"Low NO2 Levels": 0.20, "High NO2 Levels": 0.75}

# DTC
with config_context(target_offload="gpu:0"):
    dtc = DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features=None,
        class_weight=None,
        random_state=1,  # fixed seed (same as the split) so repeated runs build the same tree
    )

    # Train the model
    dtc.fit(X_train, y_train)

    # Predictions on the test set
    y_pred = dtc.predict(X_test)

    # 10-fold cross-validated accuracy, computed on the training portion only
    cross_val_scores = cross_val_score(dtc, X_train, y_train, cv=10, scoring='accuracy')

    # Eval on the held-out test set
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

In [None]:
# Hold-out accuracy followed by the per-class report
print("Accuracy: ", accuracy)
print(report)

# Cross validation: per-fold scores, then their mean, one per line
cv_summary = (
    f'CV Scores: {cross_val_scores}',
    f'Mean CV Accuracy: {cross_val_scores.mean()}',
)
print(*cv_summary, sep='\n')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Understanding the relationship between air quality attr. related to the LULC class labels:
# one scatter panel per LULC column, each plotted against the NO2 column.
lulc_columns = [
    'bare',
    'built',
    'crops',
    'flooded_vegetation',
    'grass',
    'shrub_and_scrub',
    'snow_and_ice',
    'trees',
    'water',
]
sns.pairplot(data, x_vars=lulc_columns, y_vars='NO2_column_number_density', kind='scatter')
# Save before show so the written file contains the rendered figure
plt.savefig('/home/benyamain/Desktop/lulc-air-quality/figures/scatterplot-individual.png')
plt.show()

In [None]:
# Rebuild the model inputs: all columns other than the NO2 target become features
no2_column = 'NO2_column_number_density'
X = data.drop(columns=[no2_column])
print("Input Attributes: ", X)
# The NO2 target column is the label vector
y = data[no2_column]
print("Training Labels: ", y)

In [None]:
# Imports
from sklearnex import patch_sklearn, config_context
patch_sklearn()

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

# Split data into train and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Based on the dist. of the histogram
# NOTE(review): these weights are defined but never passed to the classifier
# (class_weight=None below), so they currently have no effect on training.
class_weights_custom = {"Low NO2 Levels": 0.20, "High NO2 Levels": 0.75}

# Random Forest Classifier
with config_context(target_offload="gpu:0"):
    rf_clf = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features='sqrt',
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        bootstrap=True,
        oob_score=False,
        n_jobs=None,
        # Fixed seed (same as the split): bootstrap sampling and feature selection
        # are random, so without it the scores and importances change on every run.
        random_state=1,
        verbose=0,
        warm_start=False,
        class_weight=None,
        ccp_alpha=0.0,
        max_samples=None,
    )

    # Train the model
    rf_clf.fit(X_train, y_train)

    # Make predictions for class labels
    y_pred = rf_clf.predict(X_test)

    # 10-fold cross-validated accuracy, computed on the training portion only
    cross_val_scores = cross_val_score(rf_clf, X_train, y_train, cv=10, scoring='accuracy')

    # Eval on the held-out test set
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

In [None]:
# Hold-out accuracy followed by the per-class report
print("Accuracy: ", accuracy)
print(report)

# Cross validation: per-fold scores first, then their mean
for summary_line in (
    f'CV Scores: {cross_val_scores}',
    f'Mean CV Accuracy: {cross_val_scores.mean()}',
):
    print(summary_line)

In [None]:
# Determining what attr. hold % of class labels:
# pair each input column with the fitted forest's importance value, largest first.
importance_values = rf_clf.feature_importances_
feature_importance = pd.Series(data=importance_values, index=X.columns)
feature_importance = feature_importance.sort_values(ascending=False)
print(feature_importance)

In [None]:
from sklearnex import patch_sklearn, config_context
patch_sklearn()

from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed as the earlier model cells
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

with config_context(target_offload="gpu:0"):
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    # fit() takes both splits and hands back two objects; only the first is printed
    lazy_results = clf.fit(X_train, X_test, y_train, y_test)
    models, predictions = lazy_results

    print(models)