In [93]:
import pandas as pd
import numpy as np
# Visualization
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go

# Feature engineering
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


# Models (liner models)
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier


# Models (tree-based models)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# training
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split


# Testing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


In [94]:
data = pd.read_csv('../Data/raw/seattle-weather.csv')
data

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain
...,...,...,...,...,...,...
1456,2015-12-27,8.6,4.4,1.7,2.9,rain
1457,2015-12-28,1.5,5.0,1.7,1.3,rain
1458,2015-12-29,0.0,7.2,0.6,2.6,fog
1459,2015-12-30,0.0,5.6,-1.0,3.4,sun


In [95]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


In [96]:
data.describe()

Unnamed: 0,precipitation,temp_max,temp_min,wind
count,1461.0,1461.0,1461.0,1461.0
mean,3.029432,16.439083,8.234771,3.241136
std,6.680194,7.349758,5.023004,1.437825
min,0.0,-1.6,-7.1,0.4
25%,0.0,10.6,4.4,2.2
50%,0.0,15.6,8.3,3.0
75%,2.8,22.2,12.2,4.0
max,55.9,35.6,18.3,9.5


In [97]:
data.isna().sum()

date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64

In [98]:
graph = px.line(data, x='date', y='precipitation', title='Seattle Weather Precipitation Over Time')
graph.update_layout(
    xaxis_title='Date',
    yaxis_title='Precipitation (inches)',
    title_x=0.5
)

In [99]:
for col in data.columns:
    if data[col].dtype != 'category' and data[col].dtype != 'object':
        graph = px.histogram(data, x=col, title=f'Distribution of {col}')
        graph.update_layout(
            xaxis_title=col,
            yaxis_title='Count',
            title_x=0.5
        )
        pio.show(graph)

In [100]:
graph = px.histogram(data, x='weather', title='Weather Condition Count')
graph.update_layout(
    xaxis_title='Weather Condition',
    yaxis_title='Count',
    title_x=0.5
)
pio.show(graph)

In [101]:
for col in data.columns:
    if data[col].dtype != 'object':
        graph = px.scatter(data, x=col, y='weather', title=f'{col} Over Time')
        graph.update_layout(
            xaxis_title=col,
            yaxis_title='Weather Condition',
            title_x=0.5
        )
        pio.show(graph)

In [102]:
x  = data.drop(columns=['date', 'weather'])
corr_matrix = x.corr()

fig = px.imshow(
    corr_matrix,
    text_auto=True,           
    color_continuous_scale="RdBu",  
    zmin=-1, zmax=1,           
    title="Correlation Heatmap"
)

fig.show()

In [103]:
# box plot to all the data 
for col in x.columns:
    graph = px.box(data, y=col, title=f'Box Plot of {col}')
    graph.update_layout(
        yaxis_title=col,
        title_x=0.5
    )
    pio.show(graph)

In [104]:
data.drop('date',axis=1, inplace=True)
data

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather
0,0.0,12.8,5.0,4.7,drizzle
1,10.9,10.6,2.8,4.5,rain
2,0.8,11.7,7.2,2.3,rain
3,20.3,12.2,5.6,4.7,rain
4,1.3,8.9,2.8,6.1,rain
...,...,...,...,...,...
1456,8.6,4.4,1.7,2.9,rain
1457,1.5,5.0,1.7,1.3,rain
1458,0.0,7.2,0.6,2.6,fog
1459,0.0,5.6,-1.0,3.4,sun


In [105]:
data.duplicated().sum()

np.int64(8)

In [106]:
data.drop_duplicates(inplace=True)
data

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather
0,0.0,12.8,5.0,4.7,drizzle
1,10.9,10.6,2.8,4.5,rain
2,0.8,11.7,7.2,2.3,rain
3,20.3,12.2,5.6,4.7,rain
4,1.3,8.9,2.8,6.1,rain
...,...,...,...,...,...
1456,8.6,4.4,1.7,2.9,rain
1457,1.5,5.0,1.7,1.3,rain
1458,0.0,7.2,0.6,2.6,fog
1459,0.0,5.6,-1.0,3.4,sun


In [107]:
def get_outliers(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[col] < lower_bound) | (df[col] > upper_bound)]

In [108]:
for col in data.columns:
    if data[col].dtype != 'object':
        col_outliers = get_outliers(data, col)
        print(f'Outliers in {col}:', col_outliers.shape[0])


Outliers in precipitation: 199
Outliers in temp_max: 0
Outliers in temp_min: 0
Outliers in wind: 27


In [109]:
# drop outliers of wind 
data = data[~data['wind'].isin(get_outliers(data, 'wind')['wind'])]
data 


Unnamed: 0,precipitation,temp_max,temp_min,wind,weather
0,0.0,12.8,5.0,4.7,drizzle
1,10.9,10.6,2.8,4.5,rain
2,0.8,11.7,7.2,2.3,rain
3,20.3,12.2,5.6,4.7,rain
4,1.3,8.9,2.8,6.1,rain
...,...,...,...,...,...
1456,8.6,4.4,1.7,2.9,rain
1457,1.5,5.0,1.7,1.3,rain
1458,0.0,7.2,0.6,2.6,fog
1459,0.0,5.6,-1.0,3.4,sun


In [110]:
def change_outliers(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])
    return df

In [111]:
data  = change_outliers(data, 'precipitation')
data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,precipitation,temp_max,temp_min,wind,weather
0,0.00,12.8,5.0,4.7,drizzle
1,6.25,10.6,2.8,4.5,rain
2,0.80,11.7,7.2,2.3,rain
3,6.25,12.2,5.6,4.7,rain
4,1.30,8.9,2.8,6.1,rain
...,...,...,...,...,...
1456,6.25,4.4,1.7,2.9,rain
1457,1.50,5.0,1.7,1.3,rain
1458,0.00,7.2,0.6,2.6,fog
1459,0.00,5.6,-1.0,3.4,sun


In [112]:
data['temp_avg'] = (data['temp_max'] + data['temp_min']) / 2
data['temp_diff'] = data['temp_max'] - data['temp_min']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [113]:
data['precipitation'] = np.log1p(data['precipitation'])
data['wind'] = np.sqrt(data['wind'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [114]:
# visualization after transformation
for col in data.columns:
    if data[col].dtype != 'object':
        graph = px.histogram(data, x=col, title=f'Distribution of {col} After Transformation')
        graph.update_layout(
            xaxis_title=col,
            yaxis_title='Count',
            title_x=0.5
        )
        pio.show(graph)

In [115]:
data

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather,temp_avg,temp_diff
0,0.000000,12.8,5.0,2.167948,drizzle,8.90,7.8
1,1.981001,10.6,2.8,2.121320,rain,6.70,7.8
2,0.587787,11.7,7.2,1.516575,rain,9.45,4.5
3,1.981001,12.2,5.6,2.167948,rain,8.90,6.6
4,0.832909,8.9,2.8,2.469818,rain,5.85,6.1
...,...,...,...,...,...,...,...
1456,1.981001,4.4,1.7,1.702939,rain,3.05,2.7
1457,0.916291,5.0,1.7,1.140175,rain,3.35,3.3
1458,0.000000,7.2,0.6,1.612452,fog,3.90,6.6
1459,0.000000,5.6,-1.0,1.843909,sun,2.30,6.6


In [None]:
# use standard scaler
scaler = StandardScaler()

In [117]:

columns = data.columns.tolist()
columns.remove('weather')  
data[columns] = scaler.fit_transform(data[columns])
data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,precipitation,temp_max,temp_min,wind,weather,temp_avg,temp_diff
0,-0.730530,-0.505661,-0.649504,1.160381,drizzle,-0.581805,-0.117816
1,1.777913,-0.804619,-1.086360,1.034422,rain,-0.947930,-0.117816
2,0.013755,-0.655140,-0.212649,-0.599208,rain,-0.490274,-0.981399
3,1.777913,-0.587195,-0.530362,1.160381,rain,-0.581805,-0.431846
4,0.324141,-1.035632,-1.086360,1.975837,rain,-1.089387,-0.562692
...,...,...,...,...,...,...,...
1456,1.777913,-1.647136,-1.304787,-0.095774,rain,-1.555363,-1.452444
1457,0.429723,-1.565602,-1.304787,-1.615996,rain,-1.505437,-1.295429
1458,-0.730530,-1.266645,-1.523215,-0.340212,fog,-1.413906,-0.431846
1459,-0.730530,-1.484068,-1.840928,0.285036,sun,-1.680178,-0.431846


#### we cant remove outliers in this dataset as it is a weather dataset and outliers are expected due to natural variations in weather conditions.


In [118]:
# saving data
data.to_csv('../Data/processed/seattle-weather-processed.csv', index=False)

In [119]:
x = data.drop(columns=['weather'])
y = data['weather']

In [120]:
smote_enn = SMOTEENN(
    sampling_strategy="auto",  # can also set ratio like 0.5 for 50% balance
    random_state=42,
    n_jobs=-1
)
x, y = smote_enn.fit_resample(x, y)

In [121]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [122]:
# Label encoding for the weather column
df = x 
df['weather'] = y
le = LabelEncoder()
df['weather'] = le.fit_transform(df['weather'])
df 

Unnamed: 0,precipitation,temp_max,temp_min,wind,temp_avg,temp_diff,weather
0,-0.73053,-0.505661,-0.649504,1.160381,-0.581805,-0.117816,0
1,-0.73053,-1.334590,-2.079213,-1.499728,-1.688499,0.170045,0
2,-0.73053,-1.266645,-1.523215,-1.071767,-1.413906,-0.431846,0
3,-0.73053,-0.505661,-0.431077,0.905632,-0.490274,-0.405677,0
4,-0.73053,-0.206703,-0.867932,0.706693,-0.490274,0.745766,0
...,...,...,...,...,...,...,...
2151,-0.73053,0.853238,0.343349,0.500144,0.666346,1.190642,4
2152,-0.73053,0.173789,0.462491,-0.340212,0.300222,-0.274831,4
2153,-0.73053,-0.967687,-1.840928,0.905632,-1.363980,0.562582,4
2154,-0.73053,-0.967687,-1.960070,-0.017132,-1.413906,0.719597,4


In [123]:
x_enc = df.drop(columns=['weather'])
y_enc = df['weather']

In [124]:
x_label_train, x_label_test, y_label_train, y_label_test = train_test_split(x_enc, y_enc, test_size=0.2, random_state=42)

In [125]:
# logestic 
lg = LogisticRegression(max_iter=1000, random_state=42)
lg.fit(x_label_train, y_label_train)

In [126]:
lg_score = lg.score(x_label_test, y_label_test)
lg_score

0.6898148148148148

In [127]:
predict_lg = lg.predict(x_label_test)

In [128]:
# classification report
lg_report = classification_report(y_label_test, predict_lg, target_names=le.classes_)
print(lg_report)

              precision    recall  f1-score   support

     drizzle       0.38      0.37      0.38        89
         fog       0.35      0.38      0.36        80
        rain       0.96      0.94      0.95        87
        snow       0.97      0.98      0.97       135
         sun       0.54      0.51      0.53        41

    accuracy                           0.69       432
   macro avg       0.64      0.64      0.64       432
weighted avg       0.69      0.69      0.69       432



In [129]:
conf_matrix = confusion_matrix(y_label_test, predict_lg)

conf_matrix_fig = px.imshow(conf_matrix, 
                            labels=dict(x="Predicted", y="True", color="Count"),
                            x=le.classes_, 
                            y=le.classes_,
                            color_continuous_scale='Blues',
                            title='Confusion Matrix')
conf_matrix_fig.update_layout(
    xaxis_title='Predicted Weather Condition',
    yaxis_title='True Weather Condition')

pio.show(conf_matrix_fig)

In [130]:
# svm 
svm = SVC(kernel='linear', random_state=42)
svm.fit(x_label_train, y_label_train)

In [131]:
svm_score = svm.score(x_label_test, y_label_test)
svm_score

0.7037037037037037

In [132]:
predict_svm = svm.predict(x_label_test)

In [133]:
# classification report
svm_report = classification_report(y_label_test, predict_svm, target_names=le.classes_)
print(svm_report)

              precision    recall  f1-score   support

     drizzle       0.43      0.42      0.42        89
         fog       0.40      0.44      0.42        80
        rain       0.97      0.95      0.96        87
        snow       0.97      0.98      0.97       135
         sun       0.49      0.41      0.45        41

    accuracy                           0.70       432
   macro avg       0.65      0.64      0.64       432
weighted avg       0.71      0.70      0.70       432



In [134]:
# confusion matrix
conf_matrix = confusion_matrix(y_label_test, predict_svm)
conf_matrix_fig = px.imshow(conf_matrix, 
                            labels=dict(x="Predicted", y="True", color="Count"),
                            x=le.classes_, 
                            y=le.classes_,
                            color_continuous_scale='Blues',
                            title='Confusion Matrix')
conf_matrix_fig.update_layout(
    xaxis_title='Predicted Weather Condition',
    yaxis_title='True Weather Condition')

pio.show(conf_matrix_fig)

In [135]:
# sgd classifier
sgd = SGDClassifier(random_state=42)
sgd.fit(x_label_train, y_label_train)

In [136]:
sgd_score = sgd.score(x_label_test, y_label_test)
sgd_score

0.7013888888888888

In [137]:
predict_sgd = sgd.predict(x_label_test)

In [138]:
# classification report
sgd_report = classification_report(y_label_test, predict_sgd, target_names=le.classes_)
print(sgd_report)

              precision    recall  f1-score   support

     drizzle       0.47      0.43      0.45        89
         fog       0.44      0.36      0.40        80
        rain       0.95      0.87      0.91        87
        snow       0.89      0.99      0.94       135
         sun       0.47      0.66      0.55        41

    accuracy                           0.70       432
   macro avg       0.65      0.66      0.65       432
weighted avg       0.69      0.70      0.69       432



In [139]:
# confusion matrix
conf_matrix = confusion_matrix(y_label_test, predict_sgd)
conf_matrix_fig = px.imshow(conf_matrix, 
                            labels=dict(x="Predicted", y="True", color="Count"),
                            x=le.classes_, 
                            y=le.classes_,
                            color_continuous_scale='Blues',
                            title='Confusion Matrix')
conf_matrix_fig.update_layout(
    xaxis_title='Predicted Weather Condition',
    yaxis_title='True Weather Condition')
pio.show(conf_matrix_fig)

In [140]:
# GaussianNB
gnb = GaussianNB()
gnb.fit(x_label_train, y_label_train)

In [141]:
gnb_score = gnb.score(x_label_test, y_label_test)
gnb_score

0.7129629629629629

In [142]:
predict_gnb = gnb.predict(x_label_test)

In [143]:
# classification report
gnb_report = classification_report(y_label_test, predict_gnb,target_names=le.classes_)
print(gnb_report)

              precision    recall  f1-score   support

     drizzle       0.53      0.35      0.42        89
         fog       0.52      0.59      0.55        80
        rain       0.95      0.87      0.91        87
        snow       0.92      0.97      0.95       135
         sun       0.38      0.56      0.45        41

    accuracy                           0.71       432
   macro avg       0.66      0.67      0.66       432
weighted avg       0.72      0.71      0.71       432



In [144]:
# confusion matrix
conf_matrix = confusion_matrix(y_label_test, predict_gnb)
conf_matrix_fig = px.imshow(conf_matrix, 
                            labels=dict(x="Predicted", y="True", color="Count"),
                            x=le.classes_, 
                            y=le.classes_,
                            color_continuous_scale='Blues',
                            title='Confusion Matrix')
conf_matrix_fig.update_layout(
    xaxis_title='Predicted Weather Condition',
    yaxis_title='True Weather Condition')
pio.show(conf_matrix_fig)

In [145]:
# knn 
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_label_train, y_label_train)

In [146]:
knn_score = knn.score(x_label_test, y_label_test)
knn_score

0.9675925925925926

In [147]:
predict_knn = knn.predict(x_label_test)

In [148]:
# classification report
knn_report = classification_report(y_label_test, predict_knn, target_names=le.classes_)
print(knn_report)

              precision    recall  f1-score   support

     drizzle       0.91      0.99      0.95        89
         fog       0.95      0.95      0.95        80
        rain       0.99      0.99      0.99        87
        snow       1.00      1.00      1.00       135
         sun       1.00      0.80      0.89        41

    accuracy                           0.97       432
   macro avg       0.97      0.95      0.96       432
weighted avg       0.97      0.97      0.97       432



In [149]:
# confusion matrix
conf_matrix = confusion_matrix(y_label_test, predict_knn)
conf_matrix_fig = px.imshow(conf_matrix, 
                            labels=dict(x="Predicted", y="True", color="Count"),
                            x=le.classes_, 
                            y=le.classes_,
                            color_continuous_scale='Blues',
                            title='Confusion Matrix')
conf_matrix_fig.update_layout(
    xaxis_title='Predicted Weather Condition',
    yaxis_title='True Weather Condition')
pio.show(conf_matrix_fig)

In [150]:
import pickle
pickle.dump(knn, open('../Artifacts/knn-model.pkl', 'wb'))

In [151]:
# dt 
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_label_train, y_label_train)

In [152]:
predict_dt = dt.predict(x_label_test)

In [153]:
dt_score = dt.score(x_label_test, y_label_test)
dt_score

0.9467592592592593

In [154]:
# classification report
dt_report = classification_report(y_label_test, predict_dt, target_names=le.classes_)
print(dt_report)

              precision    recall  f1-score   support

     drizzle       0.94      0.90      0.92        89
         fog       0.90      0.89      0.89        80
        rain       1.00      0.99      0.99        87
        snow       0.99      1.00      1.00       135
         sun       0.80      0.90      0.85        41

    accuracy                           0.95       432
   macro avg       0.93      0.94      0.93       432
weighted avg       0.95      0.95      0.95       432



In [155]:
# confussion matrix
conf_matrix = confusion_matrix(y_label_test, predict_dt)
conf_matrix_fig = px.imshow(conf_matrix, 
                            labels=dict(x="Predicted", y="True", color="Count"),
                            x=le.classes_, 
                            y=le.classes_,
                            color_continuous_scale='Blues',
                            title='Confusion Matrix')

conf_matrix_fig.update_layout(
    xaxis_title='Predicted Weather Condition',
    yaxis_title='True Weather Condition')
pio.show(conf_matrix_fig)

In [156]:
pickle.dump(dt, open('../Artifacts/dt-model.pkl', 'wb'))

In [157]:
# rf 
rf = RandomForestClassifier(random_state=42)
rf.fit(x_label_train, y_label_train)


In [158]:
rf_score = rf.score(x_label_test, y_label_test)
rf_score

0.9699074074074074

In [159]:
predict_rf = rf.predict(x_label_test)

In [160]:
# classification report
report_rf = classification_report(y_label_test, predict_rf, target_names=le.classes_)
print(report_rf)

              precision    recall  f1-score   support

     drizzle       0.98      0.92      0.95        89
         fog       0.91      0.96      0.93        80
        rain       1.00      0.99      0.99        87
        snow       0.99      1.00      1.00       135
         sun       0.95      0.95      0.95        41

    accuracy                           0.97       432
   macro avg       0.97      0.96      0.96       432
weighted avg       0.97      0.97      0.97       432



In [161]:
# confusion matrix
conf_matrix = confusion_matrix(y_label_test, predict_rf)
conf_matrix_fig = px.imshow(conf_matrix, 
                            labels=dict(x="Predicted", y="True", color="Count"),
                            x=le.classes_, 
                            y=le.classes_,
                            color_continuous_scale='Blues',
                            title='Confusion Matrix')
conf_matrix_fig.update_layout(
    xaxis_title='Predicted Weather Condition',
    yaxis_title='True Weather Condition')
pio.show(conf_matrix_fig)

In [162]:
pickle.dump(rf, open('../Artifacts/rf-model.pkl', 'wb'))

In [163]:
# xgboost 
xgb = XGBClassifier(random_state=42)
xgb.fit(x_label_train, y_label_train)

In [164]:
predict_xgb = xgb.predict(x_label_test)

In [165]:
xgb_score = xgb.score(x_label_test, y_label_test)
xgb_score

0.9699074074074074

In [166]:
# classification report
report_xgb = classification_report(y_label_test, predict_xgb, target_names=le.classes_)
print(report_xgb)

              precision    recall  f1-score   support

     drizzle       0.96      0.96      0.96        89
         fog       0.95      0.95      0.95        80
        rain       1.00      0.99      0.99        87
        snow       0.99      1.00      1.00       135
         sun       0.90      0.90      0.90        41

    accuracy                           0.97       432
   macro avg       0.96      0.96      0.96       432
weighted avg       0.97      0.97      0.97       432



In [167]:
# confusion matrix
conf_matrix = confusion_matrix(y_label_test, predict_xgb)
conf_matrix_fig = px.imshow(conf_matrix, 
                            labels=dict(x="Predicted", y="True", color="Count"),
                            x=le.classes_, 
                            y=le.classes_,
                            color_continuous_scale='Blues',
                            title='Confusion Matrix')
conf_matrix_fig.update_layout(
    xaxis_title='Predicted Weather Condition',
    yaxis_title='True Weather Condition')
pio.show(conf_matrix_fig)

In [168]:
pickle.dump(xgb, open('../Artifacts/xgb-model.pkl', 'wb'))

In [169]:
# catboost
catboost = CatBoostClassifier(random_state=42, verbose=0)
catboost.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x1a5e6d73ec0>

In [170]:
cb_score = catboost.score(x_test, y_test)
cb_score

np.float64(0.9722222222222222)

In [171]:
predict_cb = catboost.predict(x_test)

In [172]:
report_catboost = classification_report(y_test, predict_cb, target_names=le.classes_)
print(report_catboost)

              precision    recall  f1-score   support

     drizzle       0.96      0.98      0.97        89
         fog       0.96      0.95      0.96        80
        rain       1.00      0.99      0.99        87
        snow       0.99      1.00      1.00       135
         sun       0.90      0.88      0.89        41

    accuracy                           0.97       432
   macro avg       0.96      0.96      0.96       432
weighted avg       0.97      0.97      0.97       432



In [173]:
# confusion matrix
conf_matrix = confusion_matrix(y_test, predict_cb)
conf_matrix_fig = px.imshow(conf_matrix,
                            labels=dict(x="Predicted", y="True", color="Count"),
                            x=le.classes_, 
                            y=le.classes_,
                            color_continuous_scale='Blues',
                            title='Confusion Matrix')
conf_matrix_fig.update_layout(
    xaxis_title='Predicted Weather Condition',
    yaxis_title='True Weather Condition')
pio.show(conf_matrix_fig)

In [174]:

models = ['Logistic Regression', 'SVM', 'SGD Classifier', 'GaussianNB', 'KNN', 'Decision Tree', 'Random Forest', 'XGBoost', 'CatBoost']
scores = [lg_score, svm_score, sgd_score, gnb_score, knn_score, dt_score, rf_score, xgb_score, cb_score]
fig = go.Figure(data=[go.Bar(
    x=scores,
    y=models,
    orientation='h'
)])
fig.update_layout(
    title='Model Comparison',
    xaxis_title='Accuracy Score',
    yaxis_title='Models',
    title_x=0.5
)
fig.show()

In [175]:
# Cross-validation for CatBoost as it is the best model
scores = cross_val_score(catboost, x, y, cv=10, scoring='accuracy')

print("Cross-validation scores:", scores)
print("Average accuracy:", scores.mean())

Cross-validation scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Average accuracy: 1.0


In [176]:
pickle.dump(catboost, open('../Artifacts/xgb-model.pkl', 'wb'))