In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import os


In [2]:
# Print the full path of every file found under /kaggle/input.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Read the Seattle weather CSV into a DataFrame and preview its first rows.
data = pd.read_csv(
    '/kaggle/input/did-it-rain-in-seattle-19482017/seattleWeather_1948-2017.csv'
)
data.head()

In [3]:
from datetime import datetime

# Parse the DATE strings once so that day, month and year can be read from
# the resulting datetime values.
parsed_dates = pd.to_datetime(data["DATE"], format="%Y-%m-%d")

# Remove PRCP and the raw DATE column, add the three calendar columns, and
# put the columns in their final order with RAIN last.
data = (
    data
    .drop(columns=["PRCP", "DATE"])
    .assign(
        DAY=parsed_dates.dt.day,
        MONTH=parsed_dates.dt.month,
        YEAR=parsed_dates.dt.year,
    )
    .loc[:, ["DAY", "MONTH", "YEAR", "TMAX", "TMIN", "RAIN"]]
)
data.head()

In [4]:
# Every column except the last one is a feature; the last column is the target.
*feature_cols, target_col = data.columns
x = data[feature_cols].values
y = data[target_col].values

In [5]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct labels in y, then replace y with their encoded
# form. The fitted encoder stays available as `le`.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
y

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the test features.
ss = StandardScaler().fit(x_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

In [8]:
from sklearn import metrics

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score

In [9]:
# Build one instance of every classifier to compare, keyed by the display name
# used in the printed results and in the accuracy chart.
#
# Every estimator that draws random numbers gets an explicit random_state so
# that re-running the notebook reproduces the same scores (the train/test
# split is already seeded with 42).
seed = 42
all_classifiers = {
    'Gradient Boost': GradientBoostingClassifier(random_state=seed),
    'Ada Boost': AdaBoostClassifier(random_state=seed),
    'Random Forest': RandomForestClassifier(
        n_estimators=50,
        min_samples_leaf=2,
        min_samples_split=4,
        max_depth=6,
        random_state=seed,
    ),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=seed),
    'KNN': KNeighborsClassifier(),
    'Gaussian NB': GaussianNB(),
    # Key corrected from the misspelled 'Beroulli  NB'.
    'Bernoulli NB': BernoulliNB(),
    'SVC': SVC(probability=True, random_state=seed),
}

# Fit each classifier on the scaled training data and report progress.
for Name, classifier in all_classifiers.items():
    classifier.fit(x_train, y_train)
    print(Name + " trained.")

In [10]:
# Score every trained classifier on the held-out test set.
# ML_name and ML_accuracy are parallel lists (same order) used by the chart
# in the next cell.
ML_name = []
ML_accuracy = []
for Name, classifier in all_classifiers.items():
    y_pred = classifier.predict(x_test)
    # Compute the score once and reuse it for both the list and the printout.
    accuracy = metrics.accuracy_score(y_test, y_pred)
    ML_name.append(Name)
    ML_accuracy.append(accuracy)
    # The value is classification accuracy, not R^2, so label it as such.
    print(Name + " accuracy: {:.5f}".format(accuracy))

In [11]:
# Horizontal bar chart comparing the test accuracy of every classifier.
# plt.rcParams is used directly instead of importing rcParams from pylab.
plt.rcParams['figure.figsize'] = (8, 4)
plt.barh(ML_name, ML_accuracy, color='purple')
plt.xlabel('Accuracy Score', fontsize=14)
plt.ylabel('Machine Learning Algorithms', fontsize=14)

# Zoom the x-axis onto the range of the observed scores so small differences
# stay visible. The limits are derived from the data rather than hard-coded,
# so no bar is hidden or truncated when a score falls outside a fixed window.
if ML_accuracy:
    lowest, highest = min(ML_accuracy), max(ML_accuracy)
    margin = max(0.1 * (highest - lowest), 0.01)
    plt.xlim([max(0.0, lowest - margin), min(1.0, highest + margin)])
plt.show()
#code is taken from here: https://www.kaggle.com/ehsaner/predicting-seattle-rain-part-2

## Graphs

In [12]:
# Line plot of TMIN and TMAX against YEAR on a shared axis.
plt.figure(figsize=(15,6))
sns.set_theme(style="whitegrid")
# Each series gets a label so the legend tells the two lines apart; without
# labels the plot gives no indication of which line is which.
sns.lineplot(x=data.YEAR, y=data.TMIN, label="TMIN")
sns.lineplot(x=data.YEAR, y=data.TMAX, label="TMAX")
plt.xlabel('Years', fontsize=14, fontfamily='monospace')
plt.ylabel('Temperature -  Fahrenheit', fontsize=14, fontfamily='monospace')
plt.legend()
# The trailing semicolon suppresses the text repr of the returned object.
plt.title('Maximum and Minimum Temperature through the years', fontsize=20, fontfamily='monospace');

# Inspiration and code from this notebook: https://www.kaggle.com/danielivanovski/catboost-100-accuracy

In [13]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px


In [14]:
from datetime import datetime

# Print the full path of every file found under /kaggle/input.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Read the CSV from disk again; this frame keeps the PRCP column.
raw = pd.read_csv(
    '/kaggle/input/did-it-rain-in-seattle-19482017/seattleWeather_1948-2017.csv'
)

# Parse DATE, then swap the raw DATE column for DAY, MONTH and YEAR columns.
dates = pd.to_datetime(raw["DATE"], format="%Y-%m-%d")
data = raw.drop(columns=["DATE"]).assign(
    DAY=dates.dt.day,
    MONTH=dates.dt.month,
    YEAR=dates.dt.year,
)

In [15]:
# Per-year mean of TMIN, TMAX and PRCP; the resulting index is YEAR.
df = data.groupby("YEAR")[["TMIN", "TMAX", "PRCP"]].mean()

# Bubble chart with one point per year: position from the mean temperatures,
# bubble size from mean PRCP, colour from the year.
fig = px.scatter(
    df,
    x='TMIN',
    y="TMAX",
    size='PRCP',
    color=df.index,
    title="Seattle Average Temperature v. Precipitation by Year, 1948-2017",
)

fig.update_layout(
    xaxis={'title': 'Min. Temperature (F)'},
    yaxis={'title': 'Max. Temperature (F)'},
)

fig.show()
## Code FROM the notebook: https://www.kaggle.com/chalseo/seattle-historical-precipitation-1948-2010

In [16]:
# One subplot with a secondary y-axis enabled.
fig4 = make_subplots(specs=[[{"secondary_y": True}]])

# Series to draw, all taken from the yearly-mean frame `df`.
years = df.index
precipitation = df['PRCP']
min_temp = df['TMIN']
max_temp = df['TMAX']

# Precipitation goes on the secondary axis.
# NOTE(review): the labels below say "%" for PRCP; confirm the column's unit
# against the dataset documentation.
fig4.add_trace(
    go.Scatter(x=years, y=precipitation, name="Precip. %", line_color='black'),
    secondary_y=True,
)

# Both temperature series go on the primary axis, added in the same order as
# before (minimum first, then maximum).
for series, label, colour in (
    (min_temp, "Min Temp F", 'paleturquoise'),
    (max_temp, "Max Temp F", 'teal'),
):
    fig4.add_trace(
        go.Scatter(
            x=years,
            y=series,
            name=label,
            fill='tonexty',
            mode='lines',
            line=dict(color=colour),
        ),
        secondary_y=False,
    )

# Chart title and axis titles.
fig4.update_layout(title_text="Seattle Average Precipitation & Temperature by Year, 1948-2017")
fig4.update_xaxes(title_text="Year")
fig4.update_yaxes(title_text="Precipitation %", secondary_y=True)
fig4.update_yaxes(title_text="Temperature (F)", secondary_y=False)

fig4.show()
## Code FROM the notebook: https://www.kaggle.com/chalseo/seattle-historical-precipitation-1948-2010

In [18]:
# Grid of pairwise plots for the columns of `data`, coloured by the RAIN column.
sns.pairplot(data=data, hue="RAIN")

In [19]:
# 3D scatter of TMIN, TMAX and PRCP, with each point coloured by RAIN.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(projection="3d")

ax.scatter(
    xs=data["TMIN"],
    ys=data["TMAX"],
    zs=data["PRCP"],
    c=data["RAIN"],
)
ax.set(xlabel="TMIN", ylabel="TMAX", zlabel="PRCP")
plt.show()

# I took this part of the code from here: https://www.kaggle.com/bcghost/pe05-shanshanyu as I liked the 3D visualization