<a href="https://colab.research.google.com/github/AnhTCQ2785/Project/blob/main/Tr%C6%B0%C6%A1ng_Cao_Qu%E1%BB%91c_Anh_SE196237.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

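# 📊 Rain Prediction with KNN and PCA
This notebook trains a k-nearest-neighbours classifier on daily weather data to predict rainy days (precipitation ≥ 8) and compares its accuracy and run time with and without a from-scratch PCA dimensionality reduction.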

In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import timeit

In [None]:
class PCA:
    """Principal component analysis with automatic component selection.

    Features are standardized (zero mean, unit variance) before the
    covariance matrix is eigen-decomposed. The smallest number of leading
    components whose cumulative explained-variance ratio reaches
    ``variance_threshold`` is kept.

    Parameters
    ----------
    variance_threshold : float, default 0.85
        Fraction of the total variance the retained components must explain.
        If no prefix of the components reaches it, every component is kept.
    """

    def __init__(self,variance_threshold = 0.85):
        self.variance_threshold = variance_threshold
        self.n_components = None # number of dimensions kept after projection
        self.components = None # matrix whose columns are the retained eigenvectors
        self.mean = None # per-feature mean of the fitted X
        self.mean_after_std = None # per-feature mean of X after standardization
        self.std = None # per-feature std of the fitted X

    def _standardize(self, X):
        """Scale X with the fitted mean/std; zero-variance features map to 0."""
        # A constant feature has std == 0. Dividing by it would produce NaN/inf
        # and poison the covariance matrix, so such features are divided by 1.
        safe_std = np.where(self.std == 0, 1.0, self.std)
        return (X - self.mean) / safe_std

    def fit_transform(self, X):
        """Fit the projection on X and return X projected onto it.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_components)
        """
        X = np.asarray(X, dtype=float)

        # calculate mean and std of X
        self.mean = np.mean(X, axis = 0)
        self.std = np.std(X, axis = 0)

        # standardization
        X_standardized = self._standardize(X)

        # mean centering
        self.mean_after_std = np.mean(X_standardized, axis = 0)
        X_centered = X_standardized - self.mean_after_std

        # covariance; np.cov expects one variable per row, hence the transpose.
        # atleast_2d keeps the single-feature case (0-d result) usable by eigh.
        cov = np.atleast_2d(np.cov(X_centered.T))

        # eigenvectors, eigenvalues
        eigenvalues, eigenvectors = np.linalg.eigh(cov)

        # sort eigenpairs by decreasing eigenvalue
        idxs = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[idxs]
        eigenvectors = eigenvectors[:, idxs]

        # choose the smallest n_components that reaches the threshold
        total_variance = np.sum(eigenvalues)
        if total_variance > 0:
            cumulative_variance = np.cumsum(eigenvalues) / total_variance
            reached = cumulative_variance >= self.variance_threshold
        else:
            # every feature is constant: no variance ratio can be computed
            reached = np.zeros(eigenvalues.shape, dtype=bool)

        if reached.any():
            n_components = np.argmax(reached) + 1
        else:
            # argmax of an all-False mask is 0, which would silently keep a
            # single component; keep every component instead.
            n_components = np.intp(eigenvalues.size)

        # save n_components to print out
        self.n_components = n_components

        # select the top eigenvectors
        self.components = eigenvectors[:, :n_components]

        # project data
        return np.dot(X_centered, self.components)

    def transform(self, X):
        """Project X using the statistics and components learned by fit_transform.

        Raises
        ------
        RuntimeError
            If called before ``fit_transform``.
        """
        if self.components is None:
            raise RuntimeError("PCA.transform() called before fit_transform()")

        X = np.asarray(X, dtype=float)

        # standardization followed by the same centering used while fitting
        X_centered = self._standardize(X) - self.mean_after_std

        # transform data
        return np.dot(X_centered, self.components)

In [None]:
# Read the daily observations, derive the binary target and discard the
# exported index column, all in one pass.
# NOTE(review): if the CSV already has a 'rain' column it is overwritten by the
# binary label below — confirm that is intended.
daily_data = (
    pd.read_csv("daily_data.csv")
    .assign(rain=lambda frame: (frame['precipitation'] >= 8).astype(int))
    .drop(columns='Unnamed: 0')
)

# Label: 1 when precipitation reaches the threshold of 8, otherwise 0.
target = daily_data['rain']

# Predictors: every column except the timestamp, the raw precipitation the
# label is derived from, and the label itself.
features = daily_data.drop(columns=['date', 'precipitation', 'rain'])

In [None]:
daily_data

Unnamed: 0,date,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,rain,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_direction_10m,wind_gusts_10m,soil_temperature_0_to_7cm,soil_moisture_0_to_7cm
0,1940-10-19 00:00:00+00:00,25.526249,87.434162,23.153333,30.924877,8.1,1,1011.537500,1010.843495,61.854167,47.833333,24.708333,24.916667,3.775271,0.459955,2.901538,168.768348,19.080000,27.290833,0.477125
1,1940-10-20 00:00:00+00:00,25.692916,86.876410,23.176250,30.899445,3.9,0,1012.087500,1011.393532,47.849999,36.041667,15.375000,20.625000,3.970240,0.488919,4.186341,130.523708,24.480000,27.276250,0.470917
2,1940-10-21 00:00:00+00:00,25.605416,88.377252,23.415833,30.729465,5.0,0,1012.970833,1012.276051,54.204167,41.375000,14.250000,34.708333,3.589016,0.423146,5.188336,145.083613,28.440000,27.053333,0.468125
3,1940-10-22 00:00:00+00:00,25.795000,89.033899,23.749166,31.316226,2.0,0,1013.229167,1012.534701,58.087500,40.333333,13.583333,47.833333,3.926549,0.405469,4.745358,128.848791,20.880001,27.328333,0.460125
4,1940-10-23 00:00:00+00:00,26.232500,86.144482,23.524166,31.652917,0.8,0,1012.520833,1011.827843,35.625000,29.041667,14.500000,2.625000,3.984233,0.538280,4.356565,186.352631,16.560000,27.536666,0.448208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30677,2024-10-15 00:00:00+00:00,26.656834,88.091011,24.444334,32.424688,21.0,1,1009.979167,1009.288900,78.766667,57.458333,18.583333,97.000000,3.356322,0.447807,5.519350,126.297725,33.480000,29.404750,0.503875
30678,2024-10-16 00:00:00+00:00,26.423500,86.725035,23.936000,31.689494,6.4,0,1010.412500,1009.721373,87.337500,58.916667,28.875000,97.625000,3.096710,0.491579,6.148713,164.187947,23.400000,29.002667,0.496333
30679,2024-10-17 00:00:00+00:00,26.354750,88.439595,24.158917,32.104105,19.5,1,1010.304167,1009.612950,75.833334,41.041667,39.666667,67.125000,3.845436,0.444458,5.029527,184.941674,37.440000,29.142250,0.494250
30680,2024-10-18 00:00:00+00:00,26.008917,89.619815,24.113083,31.284578,2.8,0,1010.295833,1009.603894,66.329167,31.416667,38.041667,77.958333,2.584295,0.367095,6.385557,171.301387,23.759998,28.219334,0.492167


In [None]:
features

Unnamed: 0,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_direction_10m,wind_gusts_10m,soil_temperature_0_to_7cm,soil_moisture_0_to_7cm
0,25.526249,87.434162,23.153333,30.924877,1011.537500,1010.843495,61.854167,47.833333,24.708333,24.916667,3.775271,0.459955,2.901538,168.768348,19.080000,27.290833,0.477125
1,25.692916,86.876410,23.176250,30.899445,1012.087500,1011.393532,47.849999,36.041667,15.375000,20.625000,3.970240,0.488919,4.186341,130.523708,24.480000,27.276250,0.470917
2,25.605416,88.377252,23.415833,30.729465,1012.970833,1012.276051,54.204167,41.375000,14.250000,34.708333,3.589016,0.423146,5.188336,145.083613,28.440000,27.053333,0.468125
3,25.795000,89.033899,23.749166,31.316226,1013.229167,1012.534701,58.087500,40.333333,13.583333,47.833333,3.926549,0.405469,4.745358,128.848791,20.880001,27.328333,0.460125
4,26.232500,86.144482,23.524166,31.652917,1012.520833,1011.827843,35.625000,29.041667,14.500000,2.625000,3.984233,0.538280,4.356565,186.352631,16.560000,27.536666,0.448208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30677,26.656834,88.091011,24.444334,32.424688,1009.979167,1009.288900,78.766667,57.458333,18.583333,97.000000,3.356322,0.447807,5.519350,126.297725,33.480000,29.404750,0.503875
30678,26.423500,86.725035,23.936000,31.689494,1010.412500,1009.721373,87.337500,58.916667,28.875000,97.625000,3.096710,0.491579,6.148713,164.187947,23.400000,29.002667,0.496333
30679,26.354750,88.439595,24.158917,32.104105,1010.304167,1009.612950,75.833334,41.041667,39.666667,67.125000,3.845436,0.444458,5.029527,184.941674,37.440000,29.142250,0.494250
30680,26.008917,89.619815,24.113083,31.284578,1010.295833,1009.603894,66.329167,31.416667,38.041667,77.958333,2.584295,0.367095,6.385557,171.301387,23.759998,28.219334,0.492167


In [None]:
target

0        1
1        0
2        0
3        0
4        0
        ..
30677    1
30678    0
30679    1
30680    0
30681    1
Name: rain, Length: 30682, dtype: int64

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

# Without PCA #

In [None]:
X_train

Unnamed: 0,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_direction_10m,wind_gusts_10m,soil_temperature_0_to_7cm,soil_moisture_0_to_7cm
6307,27.201084,65.753181,19.819834,30.303596,1012.983333,1012.292224,33.262501,6.708333,39.583333,11.583333,4.091790,1.344789,6.218333,103.841486,25.560000,30.936500,0.137542
3652,25.867750,85.928737,23.165668,31.082421,1007.779167,1007.088588,62.258334,28.833333,15.000000,92.958333,3.858950,0.524270,3.782754,194.196958,21.240000,28.003167,0.468542
18118,27.099001,84.482256,24.194834,31.856111,1008.312500,1007.624387,64.458334,22.208333,28.500000,97.458333,2.965409,0.578095,10.142417,220.460882,39.960000,28.653167,0.456292
6340,26.711501,70.694484,20.053167,29.753593,1010.666667,1009.975985,8.800000,3.250000,9.791667,0.000000,5.187990,1.220139,8.810456,137.075064,29.160000,32.146917,0.112125
14699,25.003167,69.051239,18.359417,27.371144,1014.300000,1013.602917,37.112500,16.083333,15.916667,43.625000,3.962118,1.100943,5.991786,256.670703,28.080000,29.232334,0.118625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,28.238083,82.358165,24.790167,33.758173,1008.950000,1008.264016,37.325001,4.000000,7.166667,98.083333,4.090953,0.735004,8.345026,240.214697,30.239998,29.463083,0.454708
5390,27.219834,84.685142,24.274001,32.526880,1008.458333,1007.770386,53.800001,20.458333,9.750000,98.458333,4.176296,0.607966,7.677083,213.114719,27.000000,29.061500,0.456792
860,25.767916,57.399983,15.832500,27.296628,1010.491667,1009.798880,21.875001,0.083333,15.833333,41.000000,5.269562,1.589580,6.588245,170.094337,27.000000,29.053333,0.219417
15795,25.849000,69.077868,19.163584,28.497403,1009.000000,1008.308494,43.125000,10.958333,10.750000,89.375000,4.109115,1.175182,6.859528,163.329532,25.560000,30.488584,0.116417


In [None]:
X_test

Unnamed: 0,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_direction_10m,wind_gusts_10m,soil_temperature_0_to_7cm,soil_moisture_0_to_7cm
22325,25.453167,82.794096,22.096917,29.847464,1010.637500,1009.943983,67.129167,23.625000,40.250000,88.833333,2.017790,0.618722,4.013220,178.300170,14.759999,27.399001,0.168750
25941,26.765667,83.747526,23.534417,32.214810,1009.304167,1008.614610,18.050000,8.250000,8.333333,18.750000,4.347826,0.659152,4.555262,169.269148,34.920000,29.011500,0.459167
9132,27.069834,80.072674,23.099001,32.022924,1009.916667,1009.227360,49.225000,16.666667,13.625000,86.833333,4.155281,0.785456,5.612451,100.165696,35.640000,28.438584,0.441583
10285,26.667751,62.529646,18.424001,29.044950,1013.691667,1012.998790,9.512500,2.541667,4.208333,15.666667,4.809146,1.440176,7.139265,97.220722,22.319998,31.188584,0.117542
1090,25.311666,92.916524,24.036666,30.784675,1010.425000,1009.731370,77.812501,53.333333,16.958333,98.041667,2.723598,0.248580,4.681659,230.902901,31.319998,26.705416,0.471333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29578,25.890167,87.410423,23.611000,29.431091,1005.375000,1004.686162,82.575001,29.708333,58.333333,99.083333,2.891644,0.432650,16.092053,252.742225,43.920000,27.029750,0.506667
18379,27.599001,73.279100,21.630251,31.620779,1010.133333,1009.445040,14.762500,4.833333,6.041667,22.625000,5.024046,1.167917,7.656514,143.413898,28.080000,33.317750,0.111167
1535,24.213750,89.266117,22.234583,28.508472,1009.537500,1008.841842,80.341667,30.833333,58.916667,74.458333,2.364968,0.351626,5.291214,208.446769,16.919998,25.424166,0.461375
2878,24.786666,92.593030,23.465833,29.338297,1009.708333,1009.013947,75.445835,20.000000,62.041667,89.875000,2.267832,0.247140,8.408511,225.989602,38.880000,25.878333,0.489625


In [None]:
def run_knn():
    """Fit a 5-nearest-neighbour classifier on the raw features and print test metrics.

    Reads X_train, y_train, X_test and y_test from the notebook namespace and
    prints the accuracy followed by the classification report.
    """
    classifier = KNeighborsClassifier(n_neighbors=5)
    y_pred = classifier.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    # Evaluate the model
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:\n", report)


# Time one end-to-end run (fitting, prediction and metric printing included).
execution_time = timeit.timeit(run_knn, number=1)
print("Time taken to run the code:", f'{execution_time:.3f}', "seconds")

Accuracy: 81.87%
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.89      0.88      6864
           1       0.65      0.62      0.64      2341

    accuracy                           0.82      9205
   macro avg       0.76      0.75      0.76      9205
weighted avg       0.82      0.82      0.82      9205

Time taken to run the code: 0.379 seconds


# With PCA #

In [None]:
# Learn the projection on the training split only, then apply that same
# projection to the test split.
pca = PCA()
X_train_pca = pca.fit_transform(X_train.to_numpy())
X_test_pca = pca.transform(X_test.to_numpy())

In [None]:
pca.n_components

np.int64(5)

In [None]:
X_train_pca

array([[-3.42450384, -1.96174015,  0.09667758,  1.53721328, -0.80012562],
       [ 2.27791266, -0.30463497,  1.3816926 , -0.15393244,  1.01853097],
       [ 2.34361366,  1.20472644, -1.08796857,  0.15563015, -0.28183017],
       ...,
       [-3.80138863, -2.45123669, -1.6888784 , -0.359465  ,  1.80166826],
       [-1.5926433 , -1.20107317, -0.90323814,  0.49840597,  1.72569505],
       [ 3.90469577,  0.3223505 , -2.56261416,  0.21497921, -0.3109797 ]])

In [None]:
X_test_pca

array([[ 1.57358969, -2.65165298,  0.9247604 ,  1.72300889,  0.71286676],
       [-0.32838164,  0.42407804,  1.07091019, -1.88824542, -0.93322524],
       [ 0.21898331,  0.19957565,  0.56286952,  0.12994132, -1.04122399],
       ...,
       [ 3.72941857, -2.9764135 ,  0.10349452,  1.16406156,  0.33362006],
       [ 3.98273625, -1.45103652, -1.75271006,  0.66197076, -0.87092013],
       [ 1.29522602, -2.59416641,  2.24263077,  0.1871296 , -0.72883351]])

In [None]:
def run_knn():
    """Fit a 5-nearest-neighbour classifier on the PCA-projected features and print test metrics.

    Reads X_train_pca, y_train, X_test_pca and y_test from the notebook
    namespace and prints the accuracy followed by the classification report.
    """
    classifier = KNeighborsClassifier(n_neighbors=5)
    y_pred = classifier.fit(X_train_pca, y_train).predict(X_test_pca)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    # Evaluate the model
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:\n", report)


# Time one end-to-end run (fitting, prediction and metric printing included).
execution_time = timeit.timeit(run_knn, number=1)
print("Time taken to run the code:", f'{execution_time:.3f}', "seconds")

Accuracy: 82.55%
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.89      0.88      6864
           1       0.67      0.62      0.64      2341

    accuracy                           0.83      9205
   macro avg       0.77      0.76      0.76      9205
weighted avg       0.82      0.83      0.82      9205

Time taken to run the code: 0.336 seconds


MODEL
