# Revising the dataset

In [1]:
import numpy as np
import pandas as pd 

In [2]:
dataset = pd.read_csv(r'dataset.csv')
dataset.head()

Unnamed: 0,temp_c,condition,wind_kph,pressure_mb,humidity,feelslike_c,heatindex_c
0,24.5,Clear,4.7,1004.0,41.0,25.1,25.1
1,24.2,Clear,4.7,1004.0,41.0,24.9,24.9
2,23.8,Clear,4.7,1004.0,41.0,24.8,24.8
3,23.5,Clear,4.7,1004.0,42.0,24.6,24.6
4,23.2,Clear,4.3,1004.0,43.0,24.6,24.6


In [3]:
dataset['condition'].unique()

array(['Clear', 'Sunny', 'Partly cloudy', 'Patchy rain possible',
       'Thundery outbreaks possible', 'Patchy light rain with thunder',
       'Light rain shower', 'Moderate rain',
       'Moderate or heavy rain shower', 'Light rain', 'Light drizzle',
       'Cloudy', 'Overcast', 'Mist', 'Fog'], dtype=object)

In [4]:
dataset['condition'].value_counts()

condition
Clear                             334
Sunny                             293
Light rain shower                 144
Partly cloudy                     141
Patchy rain possible               57
Overcast                           51
Cloudy                             33
Mist                               33
Thundery outbreaks possible        15
Moderate or heavy rain shower      15
Moderate rain                       9
Light rain                          9
Light drizzle                       9
Fog                                 6
Patchy light rain with thunder      3
Name: count, dtype: int64

# SMOTE Oversampling

In [5]:
x = dataset.drop('condition' , axis = 1)
y = dataset['condition']

In [6]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=20 , k_neighbors = 2)
x , y = sm.fit_resample(x , y)

In [7]:
y.value_counts()

condition
Clear                             334
Sunny                             334
Partly cloudy                     334
Patchy rain possible              334
Thundery outbreaks possible       334
Patchy light rain with thunder    334
Light rain shower                 334
Moderate rain                     334
Moderate or heavy rain shower     334
Light rain                        334
Light drizzle                     334
Cloudy                            334
Overcast                          334
Mist                              334
Fog                               334
Name: count, dtype: int64

# Encoding the condition parameter

In [8]:
from sklearn.preprocessing import LabelEncoder
condition_encoder = LabelEncoder()
dataset['condition'] = condition_encoder.fit_transform(dataset['condition'])
dataset.head()

Unnamed: 0,temp_c,condition,wind_kph,pressure_mb,humidity,feelslike_c,heatindex_c
0,24.5,0,4.7,1004.0,41.0,25.1,25.1
1,24.2,0,4.7,1004.0,41.0,24.9,24.9
2,23.8,0,4.7,1004.0,41.0,24.8,24.8
3,23.5,0,4.7,1004.0,42.0,24.6,24.6
4,23.2,0,4.3,1004.0,43.0,24.6,24.6


In [9]:
dataset['condition'].unique()

array([ 0, 13, 10, 12, 14, 11,  5,  8,  7,  4,  3,  1,  9,  6,  2])

In [10]:
condition_encoder.inverse_transform([ 0, 13, 10, 12, 14, 11,  5,  8,  7,  4,  3,  1,  9,  6,  2])

array(['Clear', 'Sunny', 'Partly cloudy', 'Patchy rain possible',
       'Thundery outbreaks possible', 'Patchy light rain with thunder',
       'Light rain shower', 'Moderate rain',
       'Moderate or heavy rain shower', 'Light rain', 'Light drizzle',
       'Cloudy', 'Overcast', 'Mist', 'Fog'], dtype=object)

# Pickling the encoder object

In [11]:
import pickle as pkl
#file1 = open('encoder_condition.pkl' , 'wb')
#pkl.dump(condition_encoder , file1)

# Train Test Split

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x , y , test_size=.3 , random_state=42)

# Classifier Model Evaluation

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report
models = [ DecisionTreeClassifier() , LogisticRegression() , KNeighborsClassifier() , GaussianNB() , SVC() ]

In [14]:
def model_evaluation(models):
    scores = []
    for model in models :
        print("\n=============================================================================\n")
        print(f"model : {model}")
        model.fit(X_train , y_train)
        y_pred = model.predict(X_test)
        report = classification_report(y_test , y_pred)
        score = accuracy_score(y_test , y_pred)
        scores.append(score)
        print(f"accuracy_score : {score}")
        print(f"classification_report : \n{report}")
    print("\n=============================================================================")
    return scores

In [15]:
import warnings
warnings.filterwarnings("ignore")
score = model_evaluation(models)



model : DecisionTreeClassifier()
accuracy_score : 0.9248170326014638
classification_report : 
                                precision    recall  f1-score   support

                         Clear       0.84      0.85      0.84       104
                        Cloudy       0.95      0.91      0.93       101
                           Fog       0.95      0.96      0.96       107
                 Light drizzle       0.98      1.00      0.99        96
                    Light rain       1.00      1.00      1.00       103
             Light rain shower       0.89      0.84      0.87       107
                          Mist       0.97      0.98      0.97        90
 Moderate or heavy rain shower       0.95      0.93      0.94        89
                 Moderate rain       1.00      1.00      1.00        92
                      Overcast       0.82      0.96      0.89       109
                 Partly cloudy       0.81      0.82      0.82        96
Patchy light rain with thunder       0.

In [16]:
result_dict = { "MODEL" : models , "SCORE" : score}
result = pd.DataFrame(result_dict)
result

Unnamed: 0,MODEL,SCORE
0,DecisionTreeClassifier(),0.924817
1,LogisticRegression(),0.51497
2,KNeighborsClassifier(),0.92016
3,GaussianNB(),0.520293
4,SVC(),0.089155


# GridSearchCV

In [17]:
from sklearn.model_selection import GridSearchCV
def model_evaluation(models,params):
    scores = []
    best_params = []
    best_score = []
    for i in range(len(models)) :
        print("\n=============================================================================\n")
        print(f"model : {models[i]}")
        Grid = GridSearchCV(estimator=models[i] , param_grid=params[i] , scoring='accuracy')
        Grid.fit(X_train , y_train)
        y_pred = Grid.predict(X_test)
        report = classification_report(y_test , y_pred)
        accuracy = accuracy_score(y_test , y_pred)
        scores.append(accuracy)
        best_params.append(Grid.best_params_)
        best_score.append(Grid.best_score_)
        print(f"best_params_ : {Grid.best_params_}")
        print(f"best_score_ : {Grid.best_score_}")
        print(f"accuracy_score : {accuracy}")
        print(f"classification_report : \n{report}")
    print("\n=============================================================================")
    return scores,best_params,best_score

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report
models = [ DecisionTreeClassifier() , LogisticRegression() , KNeighborsClassifier() ]
params=[ { "criterion" : ['gini', 'entropy', 'log_loss'],
          'splitter' : ['best', 'random'],
          'max_depth' : [ 2 , 3 , 5 , 10 ]} , 
        
        { 'penalty' : ['l1', 'l2', 'elasticnet'],
         'dual' : [True , False ] , 
         'solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']} ,
        
        { 'n_neighbors' : [5,7,10,15],
         'weights' :['uniform', 'distance'],
         'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute']}]

In [19]:
score , best_param , best_score = model_evaluation(models , params)



model : DecisionTreeClassifier()
best_params_ : {'criterion': 'log_loss', 'max_depth': 10, 'splitter': 'best'}
best_score_ : 0.8773880211826004
accuracy_score : 0.9001996007984032
classification_report : 
                                precision    recall  f1-score   support

                         Clear       0.85      0.78      0.81       104
                        Cloudy       0.82      0.79      0.80       101
                           Fog       0.99      0.99      0.99       107
                 Light drizzle       0.99      1.00      0.99        96
                    Light rain       1.00      1.00      1.00       103
             Light rain shower       0.72      0.78      0.75       107
                          Mist       0.99      1.00      0.99        90
 Moderate or heavy rain shower       0.94      0.96      0.95        89
                 Moderate rain       1.00      0.98      0.99        92
                      Overcast       0.92      0.87      0.90       109


In [20]:
result_dict_ = { "MODEL" : models , "SCORE" : score , "BEST_PARAMS_" : best_param , "BEST_SCORE_" : best_score}
result_ = pd.DataFrame(result_dict_)
result_

Unnamed: 0,MODEL,SCORE,BEST_PARAMS_,BEST_SCORE_
0,DecisionTreeClassifier(),0.9002,"{'criterion': 'log_loss', 'max_depth': 10, 'sp...",0.877388
1,LogisticRegression(),0.596806,"{'dual': False, 'penalty': 'l2', 'solver': 'ne...",0.579987
2,KNeighborsClassifier(),0.94145,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.932132


In [21]:
final_result = pd.concat([result , result_] , ignore_index=True)
final_result

Unnamed: 0,MODEL,SCORE,BEST_PARAMS_,BEST_SCORE_
0,DecisionTreeClassifier(),0.924817,,
1,LogisticRegression(),0.51497,,
2,KNeighborsClassifier(),0.92016,,
3,GaussianNB(),0.520293,,
4,SVC(),0.089155,,
5,DecisionTreeClassifier(),0.9002,"{'criterion': 'log_loss', 'max_depth': 10, 'sp...",0.877388
6,LogisticRegression(),0.596806,"{'dual': False, 'penalty': 'l2', 'solver': 'ne...",0.579987
7,KNeighborsClassifier(),0.94145,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.932132


In [23]:
final_result.iloc[7]['BEST_PARAMS_']

{'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'distance'}