<a href="https://colab.research.google.com/github/Deeyadav2001/Weather-Prediction/blob/main/Weather.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

import missingno as msno
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the CSV from the mounted Drive into `data` and preview its first rows.
data= pd.read_csv("/content/drive/MyDrive/Ind_2_sol/dataAUS.csv")
data.head()

In [None]:
# Parse the date column once, store the parsed values back, and derive the
# day-of-month and year features from them.
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates
data['Day'] = parsed_dates.dt.day
data['Year'] = parsed_dates.dt.year

In [None]:
# The raw date column is no longer needed once Day and Year have been derived.
data.drop(columns=['Date'], inplace=True)

In [None]:
# Split the columns by dtype. select_dtypes is the public counterpart of the
# private DataFrame._get_numeric_data(), which is not part of the supported API.
numerical_columns = data.select_dtypes(include=['number', 'bool']).columns
print('Numerical columns:', list(numerical_columns))
print('\n')

# CATEGORICAL COLUMNS
# Keep the DataFrame's own column order: a set difference yields an arbitrary
# order that can differ from one run to the next.
categorical_columns = [col for col in data.columns if col not in numerical_columns]
print('Categorical colums: ',categorical_columns)

In [None]:
plt.figure(figsize=(18,8))

# Compute the correlation matrix once, on numeric columns only: recent pandas
# raises when corr() meets string columns instead of silently skipping them.
corr_matrix = data.select_dtypes(include='number').corr()

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
# The builtin bool replaces the np.bool alias, which NumPy 1.24 removed.
mask_1 = np.triu(np.ones_like(corr_matrix, dtype=bool))
heatmap = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', mask=mask_1)
heatmap.set_title('Correlation heatmap',fontdict={'fontsize':16})

In [None]:
# One panel per variable: (column, title, colour, histogram bins).
# 'auto' lets seaborn pick the bin count where none was specified.
panel_specs = [
    ('MaxTemp', 'Maximum Temperature variation over the years', 'indigo', 25),
    ('MinTemp', 'Minimum Temperature variation over the years', 'blue', 'auto'),
    ('Sunshine', 'Sunshine over the years', 'green', 50),
    ('Evaporation', 'Evaporation over the years', 'red', 50),
]

# Set the style before the axes are created; style settings such as the grid
# are read when an Axes is constructed, not afterwards.
sns.set_style('whitegrid')
fig, axes = plt.subplots(2, 2, figsize=(20, 20))

for ax, (column, title, colour, bins) in zip(axes.flat, panel_specs):
    # histplot(kde=True, stat='density') is seaborn's documented replacement for
    # the deprecated distplot; cut=3 matches distplot's default KDE extent.
    sns.histplot(data=data, x=column, bins=bins, color=colour, kde=True,
                 stat='density', kde_kws={'cut': 3}, ax=ax)
    ax.set_title(title, fontweight='bold', fontsize=20)
    ax.set_xlabel(column, fontweight='bold', fontsize=16)


# Summary statistics for the plotted variables ('\033[1m' switches the terminal to bold).
print('\033[1m'+'The mean max. temp. is: ', data['MaxTemp'].mean())
print('\033[1m'+'The mean min. temp. is: ', data['MinTemp'].mean())
print('\033[1m'+'The mean sunshine over the years is: ', data['Sunshine'].mean())
print('\033[1m'+'The mean evaporation over the years is: ',data['Evaporation'].mean())

In [None]:
# Impute missing values with each column's mean.
# The result is assigned back rather than calling fillna(inplace=True) on
# data[col]: that form mutates a temporary Series, and under pandas
# Copy-on-Write the change never reaches `data`.
for column in ['MinTemp', 'MaxTemp', 'Evaporation', 'Sunshine']:
    data[column] = data[column].fillna(data[column].mean())

In [None]:
# Distribution of daily rainfall, followed by its mean.
plt.figure(figsize=(18,6))
# histplot(kde=True, stat='density') is seaborn's documented replacement for the
# deprecated distplot; cut=3 matches distplot's default KDE extent.
sns.histplot(data=data, x='Rainfall', bins=50, color='black', kde=True,
             stat='density', kde_kws={'cut': 3})
plt.title('Rainfall over the years',fontweight='bold',fontsize=16)
plt.show()


print('\033[1m'+'The mean rainfall over the years is: ',data['Rainfall'].mean())

In [None]:
# Impute the remaining numeric columns with their own means.
# The result is assigned back rather than calling fillna(inplace=True) on
# data[col]: that form mutates a temporary Series, and under pandas
# Copy-on-Write the change never reaches `data`.
mean_imputed_columns = [
    'Rainfall',
    'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm',
    'Temp9am', 'Temp3pm',
    'WindGustSpeed',
]
for column in mean_imputed_columns:
    data[column] = data[column].fillna(data[column].mean())

In [None]:
# Fill missing wind directions with a fixed direction per column.
# NOTE(review): the constants are presumably each column's most frequent value;
# confirm against the data (data[col].mode()).
# Assigned back rather than filled in place on data[col], which would only
# mutate a temporary Series under pandas Copy-on-Write.
wind_direction_defaults = {'WindGustDir': 'W', 'WindDir9am': 'N', 'WindDir3pm': 'SE'}
for column, default_direction in wind_direction_defaults.items():
    data[column] = data[column].fillna(default_direction)

In [None]:
# Annotated correlation heatmap restricted to the numeric columns.
plt.figure(figsize=(18, 10))
numeric_corr = data[numerical_columns].corr()
sns.heatmap(numeric_corr, annot=True)

In [None]:
# Replace IQR outliers in every float64 column with that column's median.
# A value is an outlier when it lies more than 1.5 * IQR outside [q1, q3].
# NOTE(review): when a column's IQR is 0, every value different from the
# quartile counts as an outlier and is overwritten; check skewed columns.
for i in data:
    if data[i].dtype != 'float64':
        continue
    values = data[i]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    Lower_tail = q1 - 1.5 * iqr
    Upper_tail = q3 + 1.5 * iqr
    # Series.median() skips NaN, whereas np.median would return NaN and turn
    # every outlier into a missing value.
    med = values.median()
    # One vectorised pass replaces the per-value Series.replace() calls, which
    # rescanned the whole column for each outlier found.
    is_outlier = (values > Upper_tail) | (values < Lower_tail)
    data[i] = values.mask(is_outlier, med)

**Converting Categorical Data to Numerical Data**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column in place. One encoder instance is
# refit per column, so afterwards it only holds the classes of the last column.
categorical_columns=['RainTomorrow', 'WindDir3pm', 'WindGustDir', 'WindDir9am', 'RainToday', 'Location']
label_encoder = LabelEncoder()
for column_name in categorical_columns:
    encoded_values = label_encoder.fit_transform(data[column_name])
    data[column_name] = encoded_values

In [None]:
# Separate the target column from the feature matrix.
y = data['RainTomorrow']
X = data.drop(columns=['RainTomorrow'])

**Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Report the split sizes, then preview the training features.
# The preview is the cell's last expression because a bare expression earlier
# in a cell is evaluated but never displayed.
print('Shape of X_train:', X_train.shape)
print('Shape of X_test:', X_test.shape)
X_train.head(2)

## Model Building and Evaluation

### Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier with the solver's iteration limit set to 1000.
lr=LogisticRegression(max_iter=1000)

In [None]:
# Final clean-up of `data`: turn +/-inf into NaN, then fill NaN with column means.
# NOTE(review): X, y and the train/test split were already built from `data` in
# earlier cells, so this cell does not change what the models are trained on;
# move it above the split if that is the intent.
# drop=True stops the old index being added as an extra 'index' column.
data = data.reset_index(drop=True)
data.replace([np.inf, -np.inf], np.nan, inplace=True)
# data.mean has to be called: passing the bound method itself hands fillna a
# function object instead of the per-column means.
data.fillna(data.mean(numeric_only=True), inplace=True)

In [None]:
# Train the logistic regression on the training split and predict the held-out
# rows; fit() returns the estimator, so the two calls can be chained.
predict = lr.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# scikit-learn expects (y_true, y_pred); passing them reversed swaps precision
# with recall and reports support for the predictions instead of the truth.
print(classification_report(y_test, predict))

In [None]:
# Ground truth first, so rows are actual classes and columns are predicted ones
# (reversing the arguments transposes the matrix).
print(confusion_matrix(y_test, predict))

In [None]:
# Score of the fitted logistic regression on the held-out test split.
lr.score(X_test,y_test)

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seed the tree so repeated runs give the same model and scores, in line with
# the seeded train/test split and random forest elsewhere in this notebook.
dtree=DecisionTreeClassifier(random_state=0)
dtree.fit(X_train,y_train)

In [None]:
# Decision-tree predictions for the held-out test rows.
prediction=dtree.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# scikit-learn expects (y_true, y_pred); passing them reversed swaps precision
# with recall and reports support for the predictions instead of the truth.
print(classification_report(y_test, prediction))

In [None]:
# Ground truth first, so rows are actual classes and columns are predicted ones
# (reversing the arguments transposes the matrix).
print(confusion_matrix(y_test, prediction))

In [None]:
# Score of the fitted decision tree on the held-out test split.
dtree.score(X_test,y_test)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 100 trees with a fixed seed, trained on the training split.
rf = RandomForestClassifier(n_estimators = 100, random_state = 0)
rf.fit(X_train,y_train)

In [None]:
# Test-split score of the random forest. NOTE(review): despite the variable
# name, `rf` is a RandomForestClassifier, not a regressor.
RandomForestRegressorScore = rf.score(X_test,y_test)

In [None]:
# Evaluate the random forest's own predictions: `prediction` holds the decision
# tree's output, so using it here repeated the tree's report under this heading.
# Arguments follow scikit-learn's (y_true, y_pred) order.
print(classification_report(y_test, rf.predict(X_test)))

In [None]:
# Confusion matrix for the random forest's own predictions (`prediction` holds
# the decision tree's output). Ground truth first: rows are actual classes.
print(confusion_matrix(y_test, rf.predict(X_test)))

In [None]:
# Score of the fitted random forest on the held-out test split.
rf.score(X_test,y_test)

### XGBoost Classifier

In [None]:
from xgboost import  XGBClassifier
# Gradient-boosted classifier created with the library's default settings.
xgb_model=XGBClassifier()

In [None]:
# Train the XGBoost classifier on the training split.
xgb_model.fit(X_train,y_train)

In [None]:
# XGBoost predictions for the held-out test rows.
predictions_xgb=xgb_model.predict(X_test)

In [None]:
# Score of the fitted XGBoost classifier on the held-out test split.
xgb_model.score(X_test,y_test)

### KNeighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Fit a 4-nearest-neighbours classifier and predict the test split.
# fit() returns the estimator itself, so `knn` is bound to the fitted model.
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)
predictions = knn.predict(X_test)

In [None]:
# Print both metrics: only a cell's last expression is displayed, so the bare
# classification_report(...) call was computed and then thrown away.
# Arguments follow scikit-learn's (y_true, y_pred) order.
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

In [None]:
# Score of the fitted k-nearest-neighbours classifier on the held-out test split.
knn.score(X_test,y_test)