In [64]:
import pandas as pd
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [65]:
# Load every CSV in the data folder and stack them into a single DataFrame.
import glob
# glob returns paths in arbitrary order; sorting keeps the row order of the
# merged frame the same from run to run.
all_files = sorted(glob.glob("/Users/abhinandanminajagi/Desktop/Dhriti/WeatherForcast/DataSets/*.csv"))
# Fail with a clear message rather than pandas' generic
# "No objects to concatenate" when the folder is empty or the path is wrong.
if not all_files:
    raise FileNotFoundError("No CSV files matched the DataSets/*.csv pattern")
li = [pd.read_csv(file) for file in all_files]
ds = pd.concat(li)

In [66]:
# Restrict the frame to the columns used for modelling. The columns left out
# had a lot of NaN values, so they are not considered.
features = ds.columns
wanted_columns = {
    'Location', 'MinTemp', 'MaxTemp', 'Rainfall',
    'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm',
    'RainToday', 'RainTomorrow',
}
# Iterate over the frame's own columns so the original column order is kept.
to_keep = [name for name in features if name in wanted_columns]
ds = ds[to_keep]
ds.columns

Index(['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustDir',
       'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am',
       'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am',
       'Pressure3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [67]:
# Shuffle the rows. A fixed random_state makes the shuffled order, and so
# every result computed from it, repeatable across runs.
from sklearn.utils import shuffle
ds = shuffle(ds, random_state=0)

In [68]:
# Preview a handful of rows instead of dumping the whole frame into the output.
ds.head(10)

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
407,Portland,15.5,18.6,24.0,SE,31.0,SSE,SE,11.0,20.0,94.0,68.0,1013.9,1015.1,16.3,18.5,Yes,No
570,AliceSprings,2.3,25.7,0.0,N,26.0,,SW,0.0,7.0,66.0,23.0,1021.3,1016.0,10.9,25.2,No,No
1842,PerthAirport,16.7,25.5,0.0,SW,33.0,E,SW,13.0,20.0,72.0,51.0,1023.4,1019.6,19.4,24.5,No,No
551,Melbourne,14.5,21.4,0.0,SSE,44.0,SSW,SSE,24.0,28.0,57.0,56.0,1019.3,1019.5,16.5,19.2,No,No
1184,Watsonia,3.6,18.6,0.2,NNE,24.0,ENE,N,2.0,13.0,99.0,53.0,1024.7,1021.1,6.6,17.6,No,No
2795,Watsonia,11.8,20.4,0.0,WSW,28.0,W,SW,13.0,13.0,99.0,69.0,1009.7,1008.8,13.2,18.3,No,No
1914,MountGambier,13.6,17.5,9.8,NNW,52.0,N,NNW,20.0,20.0,88.0,70.0,1003.4,1000.5,14.8,15.5,Yes,Yes
543,Portland,5.9,10.1,0.0,NNW,54.0,NNW,NNW,28.0,30.0,68.0,89.0,1013.6,1008.9,8.1,9.4,No,Yes
730,Williamtown,16.0,34.8,0.0,,,NW,SE,15.0,22.0,71.0,35.0,1003.9,999.7,23.7,33.3,No,Yes
1683,Penrith,18.9,26.4,2.4,WSW,76.0,W,W,20.0,31.0,56.0,47.0,,,21.8,18.8,Yes,No


In [69]:
# Split the frame by column position into model inputs (X) and target (y):
# the first 17 columns are the inputs, the column right after them is the target.
n_features = 17
X = ds.iloc[:, :n_features].values
y = ds.iloc[:, n_features].values

In [72]:
# Encode the text-valued columns of X, and the target y, as integer codes.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Positions in X of the text columns. Going by the column listing shown for the
# selected frame these are 4=WindGustDir, 0=Location, 6=WindDir9am,
# 7=WindDir3pm and 16=RainToday.
categorical_columns = [4, 0, 6, 7, 16]

# One encoder per column, so that each fitted mapping remains available
# afterwards (e.g. for inverse_transform) instead of being overwritten by a
# later fit on a different column.
# NOTE: astype(str) turns a missing value into the literal string 'nan', so
# missing entries are encoded as a category of their own.
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    X[:, column] = encoder.fit_transform(X[:, column].astype(str))
    label_encoders[column] = encoder

# Expose the encoders under their individual names as well.
(labelencoder_X_1, labelencoder_X_2, labelencoder_X_3,
 labelencoder_X_4, labelencoder_X_5) = (
    label_encoders[column] for column in categorical_columns
)

# The target is encoded the same way (same 'nan' caveat as above).
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y.astype(str))

In [74]:
# Replace missing values in every column of X with that column's mean.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22, so use its
# replacement when available and fall back to the old class on installations
# that predate sklearn.impute.
try:
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='mean')  # missing_values defaults to NaN
except ImportError:
    from sklearn.preprocessing import Imputer
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
# NOTE(review): the means are learned from all rows of X, i.e. before any
# train/test split is made, so held-out rows contribute to the fill values.
X[:, 0:17] = imputer.fit_transform(X[:, 0:17])



In [75]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [76]:
# Standardise the features. The scaler is fitted on the training rows only;
# the same fitted transformation is then applied to both training and test rows.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)



In [77]:
import keras
from keras.models import Sequential
from keras.layers import Dense

In [78]:
# Initialise the ANN as an empty Keras Sequential model.
classifier = Sequential()

In [79]:
# Input layer plus first hidden layer: 17 input features feeding 9 ReLU units,
# weights drawn from a uniform initialiser.
# Written with the Keras 2 argument names (units, kernel_initializer) in place
# of the Keras 1 spellings output_dim and init.
classifier.add(Dense(units=9, kernel_initializer='uniform', activation='relu', input_dim=17))

  


In [80]:
# Second hidden layer: 9 ReLU units, weights drawn from a uniform initialiser.
# Written with the Keras 2 argument names (units, kernel_initializer) in place
# of the Keras 1 spellings output_dim and init.
classifier.add(Dense(units=9, kernel_initializer='uniform', activation='relu'))

  


In [81]:
# Output layer: a single sigmoid unit, so the model emits one value in (0, 1)
# per row.
# Written with the Keras 2 argument names (units, kernel_initializer) in place
# of the Keras 1 spellings output_dim and init.
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

  


In [82]:
# Configure training: Adam optimiser, binary cross-entropy loss, and accuracy
# tracked as the reported metric.
classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [83]:
# Train the ANN on the training set: batches of 10 rows, 10 passes over the data.
# `epochs` is the Keras 2 name of the Keras 1 `nb_epoch` argument.
classifier.fit(X_train, y_train, batch_size=10, epochs=10)

  


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a3f1e1550>

In [84]:
# Predict on the test set and threshold the sigmoid output in one step:
# values above 0.5 become True, everything else False.
y_pred = classifier.predict(X_test) > 0.5

In [85]:
# Accuracy on the held-out test set, as a percentage.
# accuracy_score takes the true labels first and the predictions second;
# the function itself is already imported in the notebook's first cell.
print(accuracy_score(y_test, y_pred) * 100)

84.36822463124223
