In [None]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the raw weather observations; the relative path is resolved against
# the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="weather_classification_data.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 1.1+ MB


In [6]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

0

In [7]:
# Missing-value count per column.
df.isna().sum()

Temperature             0
Humidity                0
Wind Speed              0
Precipitation (%)       0
Cloud Cover             0
Atmospheric Pressure    0
UV Index                0
Season                  0
Visibility (km)         0
Location                0
Weather Type            0
dtype: int64

In [8]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km)
count,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0
mean,19.127576,68.710833,9.832197,53.644394,1005.827896,4.005758,5.462917
std,17.386327,20.194248,6.908704,31.946541,37.199589,3.8566,3.371499
min,-25.0,20.0,0.0,0.0,800.12,0.0,0.0
25%,4.0,57.0,5.0,19.0,994.8,1.0,3.0
50%,21.0,70.0,9.0,58.0,1007.65,3.0,5.0
75%,31.0,84.0,13.5,82.0,1016.7725,7.0,7.5
max,109.0,109.0,48.5,109.0,1199.21,14.0,20.0


In [9]:
# Count / unique / top / freq summary of the text (object) columns.
df.describe(include="object")

Unnamed: 0,Cloud Cover,Season,Location,Weather Type
count,13200,13200,13200,13200
unique,4,4,3,4
top,overcast,Winter,inland,Rainy
freq,6090,5610,4816,3300


In [10]:
# Print the distinct labels of every text column so inconsistent spellings
# or casing stand out before cleaning.
for column_name, column in df.select_dtypes(include="object").items():
    print(column_name, " : ", column.unique())

Cloud Cover  :  ['partly cloudy' 'clear' 'overcast' 'cloudy']
Season  :  ['Winter' 'Spring' 'Summer' 'Autumn']
Location  :  ['inland' 'mountain' 'coastal']
Weather Type  :  ['Rainy' 'Cloudy' 'Sunny' 'Snowy']


In [11]:
# Normalise every text column in place: trim surrounding whitespace, then
# upper-case the first character and lower-case the rest.
text_frame = df.select_dtypes(include="object")
for label in text_frame.columns:
    trimmed = df[label].str.strip()
    df[label] = trimmed.str.capitalize()

In [12]:
# For each text column, print the distinct temperatures observed within
# each of its groups.
for group_key in df.select_dtypes(include="object").columns:
    temperatures_by_group = df.groupby(group_key)["Temperature"]
    print(temperatures_by_group.unique())

Cloud Cover
Clear            [30.0, 38.0, 28.0, 24.0, 43.0, 13.0, 33.0, 29....
Cloudy           [29.0, 41.0, 13.0, 38.0, 49.0, -2.0, 5.0, 10.0...
Overcast         [27.0, 32.0, -2.0, 3.0, -10.0, 33.0, -7.0, 26....
Partly cloudy    [14.0, 39.0, 3.0, 35.0, 12.0, 10.0, 30.0, 38.0...
Name: Temperature, dtype: object
Season
Autumn    [38.0, 12.0, 33.0, 24.0, 35.0, 27.0, 42.0, 22....
Spring    [39.0, 30.0, 38.0, 35.0, 43.0, 28.0, -9.0, 21....
Summer    [32.0, 10.0, 13.0, 33.0, 26.0, 17.0, 36.0, 22....
Winter    [14.0, 27.0, -2.0, 3.0, 28.0, -10.0, 24.0, -7....
Name: Temperature, dtype: object
Location
Coastal     [38.0, 28.0, 24.0, 33.0, 40.0, 35.0, 27.0, 22....
Inland      [14.0, 39.0, 32.0, -2.0, 3.0, 38.0, -10.0, 30....
Mountain    [30.0, 27.0, 3.0, 35.0, 12.0, 10.0, 43.0, -10....
Name: Temperature, dtype: object
Weather Type
Cloudy    [39.0, 32.0, 35.0, 12.0, 10.0, 33.0, 26.0, 38....
Rainy     [14.0, 27.0, 17.0, 30.0, 28.0, 15.0, 11.0, 26....
Snowy     [-2.0, 3.0, -10.0, -7.0, 4.0, 2.0, -

In [13]:
# Rows where precipitation or humidity exceeds 100.
above_100 = df["Precipitation (%)"].gt(100) | df["Humidity"].gt(100)
df[above_100]

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
9,28.0,74,8.5,107.0,Clear,1012.13,8,Winter,7.5,Coastal,Sunny
19,13.0,102,12.0,72.0,Clear,1012.25,4,Summer,8.0,Inland,Sunny
24,38.0,83,7.0,101.0,Partly cloudy,1017.94,4,Spring,8.5,Mountain,Cloudy
37,2.0,105,19.0,109.0,Overcast,991.68,7,Winter,3.5,Mountain,Snowy
56,41.0,85,9.0,101.0,Overcast,1017.89,3,Summer,7.5,Mountain,Cloudy
...,...,...,...,...,...,...,...,...,...,...,...
13090,9.0,76,1.0,102.0,Overcast,982.64,13,Winter,0.5,Inland,Snowy
13113,42.0,106,32.5,103.0,Partly cloudy,996.45,14,Spring,1.0,Mountain,Rainy
13124,46.0,101,16.0,80.0,Partly cloudy,1015.06,11,Autumn,1.5,Inland,Rainy
13136,56.0,74,0.0,103.0,Partly cloudy,1012.40,10,Summer,5.5,Inland,Cloudy


In [14]:
# Keep only the rows where both precipitation and humidity lie within the
# closed interval [0, 100].
precipitation_ok = df["Precipitation (%)"].between(0, 100)
humidity_ok = df["Humidity"].between(0, 100)
data = df[precipitation_ok & humidity_ok]

In [15]:
# Preview the range-filtered frame.
data.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,Partly cloudy,1010.82,2,Winter,3.5,Inland,Rainy
1,39.0,96,8.5,71.0,Partly cloudy,1011.43,7,Spring,10.0,Inland,Cloudy
2,30.0,64,7.0,16.0,Clear,1018.72,5,Spring,5.5,Mountain,Sunny
3,38.0,83,1.5,82.0,Clear,1026.25,7,Spring,1.0,Coastal,Sunny
4,27.0,74,17.0,66.0,Overcast,990.67,1,Winter,2.5,Mountain,Rainy


In [16]:
# Strip stray whitespace from the column labels.
# The labels are derived from `data` itself rather than copied positionally
# from `df`, so the two frames cannot drift out of sync, and `rename`
# returns a new frame instead of mutating the filtered slice in place.
data = data.rename(columns=str.strip)

In [17]:
# Show the column labels of `data`.
data.columns

Index(['Temperature', 'Humidity', 'Wind Speed', 'Precipitation (%)',
       'Cloud Cover', 'Atmospheric Pressure', 'UV Index', 'Season',
       'Visibility (km)', 'Location', 'Weather Type'],
      dtype='object')

In [18]:
# Summarise the text columns for the observations whose UV index is 0.
zero_uv = data["UV Index"].eq(0)
data.loc[zero_uv].describe(include="object")

Unnamed: 0,Cloud Cover,Season,Location,Weather Type
count,2048,2048,2048,2048
unique,4,4,3,4
top,Overcast,Winter,Mountain,Snowy
freq,1511,1501,902,1307


In [19]:
# Drop the rows labelled "Sunny" with a UV index of 0.
# Written as the De Morgan equivalent: keep a row when it has a non-zero UV
# index OR is not labelled "Sunny".
has_uv = data["UV Index"] != 0
not_sunny = data["Weather Type"] != "Sunny"
clean_data = data[has_uv | not_sunny]

In [20]:
# (rows, columns) remaining after the cleaning filters.
clean_data.shape

(12439, 11)

### Data visualization

Average temperatures by season

In [21]:
# Mean temperature per season, returned as a two-column frame
# (Season, Temperature).
Avg_temperrature = clean_data.groupby("Season", as_index=False)["Temperature"].mean()

In [22]:
# Display the per-season mean temperature table.
Avg_temperrature

Unnamed: 0,Season,Temperature
0,Autumn,25.11026
1,Spring,25.111874
2,Summer,25.204213
3,Winter,9.718041


In [23]:
# Bar chart of the mean temperature per season. The title lets the figure
# stand on its own when the notebook is skimmed or the figure is exported.
fig = px.bar(
    Avg_temperrature,
    x="Season",
    y="Temperature",
    title="Average temperature by season",
)
fig.show()

### Most common weather by location

In [24]:
# Most frequent weather type per location.
# `Series.mode()` returns a Series holding *every* modal value, so a tie
# would hand `agg` a multi-element result (an array-valued cell or an error,
# depending on the pandas version). Taking the first entry always yields a
# single label; `mode()` returns its values sorted, so ties resolve
# alphabetically.
(
    clean_data.groupby("Location")["Weather Type"]
    .agg(lambda weather: weather.mode().iat[0])
    .reset_index()
)

Unnamed: 0,Location,Weather Type
0,Coastal,Rainy
1,Inland,Snowy
2,Mountain,Snowy


In [25]:
# One bar chart per text column: bar height is the mean temperature of each
# category and bar colour is its mean humidity. Each figure carries a title
# naming the grouping column, so the charts produced by the loop can be told
# apart.
for column_name in clean_data.select_dtypes(include="object").columns:
    category_summary = (
        clean_data.groupby(column_name)
        .agg(Avg_Temperature=("Temperature", "mean"), Avg_Humidity=("Humidity", "mean"))
        .reset_index()
    )
    fig = px.bar(
        category_summary,
        x=column_name,
        y="Avg_Temperature",
        color="Avg_Humidity",
        title=f"Average temperature by {column_name} (colour = average humidity)",
    )
    fig.show()
    

In [26]:
# Relative frequency of each target class (class-balance check).
clean_data["Weather Type"].value_counts(normalize=True)

Weather Type
Snowy     0.251628
Rainy     0.250502
Cloudy    0.250261
Sunny     0.247608
Name: proportion, dtype: float64

### Split data into train and test sets

In [27]:
# Separate the features from the target, then hold out 20% of the rows for
# testing. The split is stratified on the target and seeded so it is
# repeatable.
X = clean_data.drop(columns="Weather Type")
y = clean_data["Weather Type"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [28]:
# Preview the first ten training rows.
X_train.head(10)

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location
6463,3.0,65,19.5,78.0,Overcast,981.31,1,Winter,1.5,Mountain
6396,-9.0,66,18.5,62.0,Overcast,985.51,1,Winter,3.5,Mountain
12450,1.0,99,18.5,94.0,Overcast,985.95,0,Winter,2.5,Inland
3465,28.0,32,5.0,2.0,Clear,1028.19,10,Summer,9.5,Inland
12097,30.0,69,20.0,80.0,Overcast,995.21,3,Autumn,3.5,Inland
12527,33.0,93,18.5,93.0,Overcast,1013.71,2,Summer,3.0,Mountain
8950,31.0,76,14.0,20.0,Overcast,1010.76,3,Autumn,7.5,Coastal
59,16.0,84,18.5,54.0,Overcast,996.12,0,Spring,3.0,Coastal
2623,27.0,87,14.0,58.0,Overcast,1016.41,0,Spring,2.5,Inland
5033,40.0,53,4.5,67.0,Partly cloudy,1143.82,11,Summer,10.5,Coastal


In [29]:
# Preview the training labels; their index matches X_train's.
y_train.head()

6463     Snowy
6396     Snowy
12450    Snowy
3465     Sunny
12097    Rainy
Name: Weather Type, dtype: object

### Data preprocessing


In [30]:
# Scaler with default settings; it is fitted on the training features in a later cell.
normalizer = MinMaxScaler()


In [31]:
# Dtype of every column in `df`.
# NOTE(review): this inspects `df`, not the X_train / X_test frames that are
# scaled below — confirm that is intended.
df.dtypes

Temperature             float64
Humidity                  int64
Wind Speed              float64
Precipitation (%)       float64
Cloud Cover              object
Atmospheric Pressure    float64
UV Index                  int64
Season                   object
Visibility (km)         float64
Location                 object
Weather Type             object
dtype: object

In [32]:
# Preview the held-out features before scaling.
X_test.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location
3944,20.0,38,7.0,4.0,Clear,1027.98,8,Winter,8.5,Mountain
1744,41.0,28,5.0,5.0,Clear,1014.67,7,Winter,9.5,Mountain
10425,44.0,67,1.5,17.0,Partly cloudy,1021.59,6,Spring,7.5,Mountain
3048,32.0,60,2.5,0.0,Clear,1020.69,10,Spring,9.0,Inland
6504,7.0,37,14.5,84.0,Cloudy,816.07,1,Spring,2.0,Mountain


In [33]:
# Min-max scale the numeric feature columns.
num_columns = X_train.select_dtypes(include=["float64", "int64"]).columns.tolist()

# train_test_split returns frames derived from `X`; take explicit copies so
# the column assignments below act on independent frames and cannot raise
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training split only, then apply the same fitted scaler to the
# test split so no test-set statistics leak into the scaling.
X_train[num_columns] = normalizer.fit_transform(X_train[num_columns])
X_test[num_columns] = normalizer.transform(X_test[num_columns])

# NOTE: this cell overwrites X_train / X_test, so running it a second time
# would rescale already-scaled data. Re-run from the split cell instead.

In [34]:
# Preview the training features after scaling the numeric columns.
X_train.head(10)

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location
6463,0.203008,0.5625,0.402062,0.78,Overcast,0.454008,0.071429,Winter,0.075,Mountain
6396,0.112782,0.575,0.381443,0.62,Overcast,0.464532,0.071429,Winter,0.175,Mountain
12450,0.18797,0.9875,0.381443,0.94,Overcast,0.465634,0.0,Winter,0.125,Inland
3465,0.390977,0.15,0.103093,0.02,Clear,0.571475,0.714286,Summer,0.475,Inland
12097,0.406015,0.6125,0.412371,0.8,Overcast,0.488837,0.214286,Autumn,0.175,Inland
12527,0.428571,0.9125,0.381443,0.93,Overcast,0.535193,0.142857,Summer,0.15,Mountain
8950,0.413534,0.7,0.28866,0.2,Overcast,0.527801,0.214286,Autumn,0.375,Coastal
59,0.300752,0.8,0.381443,0.54,Overcast,0.491117,0.0,Spring,0.15,Coastal
2623,0.383459,0.8375,0.28866,0.58,Overcast,0.541958,0.0,Spring,0.125,Inland
5033,0.481203,0.4125,0.092784,0.67,Partly cloudy,0.861209,0.785714,Summer,0.525,Coastal


In [36]:
from category_encoders.one_hot import OneHotEncoder

In [37]:
# One-hot encode the text feature columns.
objects_colums = X_train.select_dtypes(include="object").columns.tolist()

# category_encoders documents 'error', 'return_nan', 'value' and 'indicator'
# for handle_unknown; "ignore" is scikit-learn's spelling and is not one of
# them. 'value' is the documented option that encodes a category unseen
# during fit as all zeros.
oneEnco = OneHotEncoder(cols=objects_colums, handle_unknown="value", return_df=True)

# Learn the category mapping from the training split only, then apply the
# same mapping to both splits.
oneEnco.fit(X_train)
X_train = oneEnco.transform(X_train)
X_test = oneEnco.transform(X_test)

In [38]:
# Preview the training features after one-hot encoding.
X_train.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover_1,Cloud Cover_2,Cloud Cover_3,Cloud Cover_4,Atmospheric Pressure,UV Index,Season_1,Season_2,Season_3,Season_4,Visibility (km),Location_1,Location_2,Location_3
6463,0.203008,0.5625,0.402062,0.78,1,0,0,0,0.454008,0.071429,1,0,0,0,0.075,1,0,0
6396,0.112782,0.575,0.381443,0.62,1,0,0,0,0.464532,0.071429,1,0,0,0,0.175,1,0,0
12450,0.18797,0.9875,0.381443,0.94,1,0,0,0,0.465634,0.0,1,0,0,0,0.125,0,1,0
3465,0.390977,0.15,0.103093,0.02,0,1,0,0,0.571475,0.714286,0,1,0,0,0.475,0,1,0
12097,0.406015,0.6125,0.412371,0.8,1,0,0,0,0.488837,0.214286,0,0,1,0,0.175,0,1,0


### Model Training


In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
# Baseline model: logistic regression with default hyper-parameters, fitted
# on the preprocessed training split.
logistic = LogisticRegression()
logistic.fit(X=X_train, y=y_train)


In [41]:
# Predicted labels from the logistic model for the training and test
# splits, in that order.
y_train_pre, y_test_pre = (logistic.predict(split) for split in (X_train, X_test))

In [42]:
# Display the logistic model's predicted test-set labels.
y_test_pre


array(['Sunny', 'Sunny', 'Cloudy', ..., 'Sunny', 'Rainy', 'Rainy'],
      dtype=object)

In [43]:
from sklearn.metrics import accuracy_score , classification_report

# Compare train and test accuracy of the logistic model; a large gap between
# the two would point to over-fitting.
train_accuracy = accuracy_score(y_train, y_train_pre)
test_accuracy = accuracy_score(y_test, y_test_pre)
print("Train accuracy : ", train_accuracy)
print("Test accuracy : ", test_accuracy)



Train accuracy :  0.8731785750175862
Test accuracy :  0.8786173633440515


In [44]:
# Per-class precision / recall / F1 of the logistic model on the test split.
print(classification_report(y_test , y_test_pre))

              precision    recall  f1-score   support

      Cloudy       0.85      0.85      0.85       623
       Rainy       0.87      0.83      0.85       623
       Snowy       0.88      0.93      0.90       626
       Sunny       0.92      0.90      0.91       616

    accuracy                           0.88      2488
   macro avg       0.88      0.88      0.88      2488
weighted avg       0.88      0.88      0.88      2488



In [45]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (depth <= 4).
# random_state is fixed because scikit-learn permutes the features at every
# split, so an unseeded tree can differ from run to run when candidate
# splits tie.
D_tree = DecisionTreeClassifier(max_depth=4, random_state=42)
D_tree.fit(X_train, y_train)

# Predicted labels for both splits, used by the accuracy cells that follow.
y_p_train = D_tree.predict(X_train)
y_p_test = D_tree.predict(X_test)


In [46]:
# Train vs. test accuracy of the depth-4 decision tree.
for split_label, y_true, y_hat in (
    ("Train accuracy : ", y_train, y_p_train),
    ("Test accuracy : ", y_test, y_p_test),
):
    print(split_label, accuracy_score(y_true, y_hat))

Train accuracy :  0.9039292533413728
Test accuracy :  0.9047427652733119


In [47]:
# Per-class metrics of the depth-4 tree on the test split.
print(classification_report(y_test , y_p_test))

              precision    recall  f1-score   support

      Cloudy       0.98      0.83      0.90       623
       Rainy       0.86      0.92      0.89       623
       Snowy       0.94      0.93      0.94       626
       Sunny       0.86      0.93      0.89       616

    accuracy                           0.90      2488
   macro avg       0.91      0.90      0.90      2488
weighted avg       0.91      0.90      0.90      2488



In [48]:
# Per-class metrics of the depth-4 tree on the training split, for comparison with the test report.
print(classification_report(y_train , y_p_train))

              precision    recall  f1-score   support

      Cloudy       0.97      0.83      0.90      2490
       Rainy       0.86      0.93      0.89      2493
       Snowy       0.95      0.92      0.93      2504
       Sunny       0.85      0.93      0.89      2464

    accuracy                           0.90      9951
   macro avg       0.91      0.90      0.90      9951
weighted avg       0.91      0.90      0.90      9951



In [49]:
# Candidate values for the decision-tree hyper-parameter search.
# The order within each list is kept as written.
parameters = dict(
    max_depth=[4, 2, 7, 8, 10, 3],
    min_samples_leaf=[6, 2, 7, 3, 5, 4],
    min_samples_split=[4, 2, 7, 8, 10, 3],
)

In [50]:
# Randomised hyper-parameter search over `parameters` for the decision tree:
# 60 sampled candidates, each scored by 5-fold cross-validated accuracy.
# random_state pins which candidates are drawn; without it the sampled
# combinations — and therefore best_params_ — change on every run.
random_search = RandomizedSearchCV(
    D_tree,
    param_distributions=parameters,
    n_iter=60,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)


In [51]:
# Hyper-parameter combination with the best cross-validated accuracy.
random_search.best_params_

{'min_samples_split': 4, 'min_samples_leaf': 4, 'max_depth': 7}

In [52]:
# Take the best estimator found by the search and predict labels for both
# splits with it (test first, then train).
model = random_search.best_estimator_
y_predit_test, y_predit_tarin = model.predict(X_test), model.predict(X_train)

In [None]:
# Test-split accuracy of the tuned tree.
accuracy_score(y_test, y_predit_test)

0.9196141479099679

In [None]:
# Training-split accuracy of the tuned tree, for comparison with the test accuracy.
accuracy_score(y_train , y_predit_tarin)

0.9335745151241082

In [None]:
# Per-class metrics of the tuned tree on the test split.
print(classification_report(y_test , y_predit_test))

              precision    recall  f1-score   support

      Cloudy       0.83      0.94      0.88       623
       Rainy       0.94      0.90      0.92       623
       Snowy       0.96      0.93      0.94       626
       Sunny       0.97      0.91      0.94       616

    accuracy                           0.92      2488
   macro avg       0.92      0.92      0.92      2488
weighted avg       0.92      0.92      0.92      2488



In [None]:
# Per-class metrics of the tuned tree on the training split.
print(classification_report(y_train , y_predit_tarin))

              precision    recall  f1-score   support

      Cloudy       0.84      0.97      0.90      2490
       Rainy       0.96      0.91      0.94      2493
       Snowy       0.97      0.94      0.95      2504
       Sunny       0.98      0.92      0.95      2464

    accuracy                           0.93      9951
   macro avg       0.94      0.93      0.93      9951
weighted avg       0.94      0.93      0.93      9951

