<a href="https://colab.research.google.com/github/BartekMasiak/Python-in-ML-DS-Seul-Bike-Sharing-Dateset/blob/main/klasyfikator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The dataset path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# importing libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import statistics

from sklearn import svm, tree
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression

"""
Part 1. -- Importing data and preparing it for further processing.
"""

# Location of the Seoul bike-sharing CSV on the mounted Google Drive.
DATAFILE = '/content/drive/MyDrive/SeoulBikeData.csv'

# Read the raw table; the file is decoded with the 'unicode_escape' codec.
data = pd.read_csv(DATAFILE, encoding='unicode_escape')


# Parse the day/month/year strings of the Date column into datetimes.
data['Date'] = pd.to_datetime(data['Date'], format="%d/%m/%Y")


# cleaning data
def cleanse(x):
    """Turn a raw column label into an identifier-friendly name.

    Everything from the first '(' to the end of the label is removed,
    surrounding whitespace is trimmed, and the remaining spaces are replaced
    with underscores.
    """
    without_suffix = re.sub(r'\(.*$','',x)
    trimmed = without_suffix.strip()
    return trimmed.replace(' ','_')
# Normalise the column labels (drop "(unit)" suffixes, spaces -> underscores).
data.columns = data.columns.map(cleanse)


# deleting dew point & solar radiation
data = data.drop(columns=['Dew_point_temperature', 'Solar_Radiation'])


# changing categorical data into numerical
# Assigning the mapped column back (instead of `replace(..., inplace=True)` on a
# column selection) keeps this working under pandas Copy-on-Write. The cast to
# int64 fails loudly if a category that is not listed here ever shows up.
data['Holiday'] = data['Holiday'].map({'Holiday': 1, 'No Holiday': 0}).astype('int64')
data['Seasons'] = data['Seasons'].map(
    {'Spring': 1, 'Summer': 2, 'Autumn': 3, 'Winter': 4}
).astype('int64')

# deleting duplicates
data = data.drop_duplicates()

# changing date format to numerical
data = data.assign(Date=pd.to_numeric(pd.to_datetime(data['Date'])))


# deleting non functioning days
# Every row of a date that has at least one "No" entry is removed.
non_functioning_days = data.query('Functioning_Day=="No"').Date.unique()
data = data[~data.Date.isin(non_functioning_days)].drop(columns='Functioning_Day')


# merging Rainfall and Snowfall into one feature - Precipitation
# Snowfall is weighted by 10 before being added to Rainfall.
precipitation_amount = data['Rainfall'] + 10*data['Snowfall']

# Making Precipitation binary. We assume that if Precipitation is higher than 1
# we consider a day to be rainy/snowy.
data = (
    data
    .drop(columns=['Rainfall', 'Snowfall'])
    .assign(Precipitation=precipitation_amount > 1)
)


# comment or uncomment the line below depending on whether you need the data
# without or with Rented Bike Count
#data = data.drop(columns='Rented_Bike_Count')


# splitting data not normalized
# A fixed seed makes the split reproducible across kernel restarts.
SPLIT_SEED = 42

# Half of the rows are used for training. The other half is held out and then
# divided 40% / 60% into test and validation, so the three sets are disjoint
# (previously test/validation were re-sampled from the training rows).
train_data, holdout_data = train_test_split(
    data, test_size=0.5, random_state=SPLIT_SEED
)
test_data, val_data = train_test_split(
    holdout_data, test_size=0.6, random_state=SPLIT_SEED
)

# normalization
# The scaler is fitted on the training rows only, so the min/max of the held-out
# rows cannot leak into the features, and is then applied to the whole frame.
minmax_scaler = MinMaxScaler().set_output(transform="pandas")
minmax_scaler.fit(train_data)
data_normalized = minmax_scaler.transform(data)

# splitting data normalized
# The same row labels as the non-normalized split are reused, so both variants
# are trained and evaluated on exactly the same observations (previously the
# "normalized" test/validation sets were taken from the non-normalized frame).
train_data_normalized = data_normalized.loc[train_data.index]
test_data_normalized = data_normalized.loc[test_data.index]
val_data_normalized = data_normalized.loc[val_data.index]

"""
Part 2. -- Building model & outcomes assessment.
"""

# Choosing classifier: set `clf_name` to one of the keys of the table below.
# A lookup table replaces the commented-out alternatives, so switching models
# is a one-line change and every option stays visible and importable.

# Seed for the estimators that use randomness, so repeated runs give the same
# fitted model.
CLASSIFIER_SEED = 42

available_classifiers = {
    'SVM': svm.SVC(kernel='rbf'),
    'MLP': MLPClassifier(random_state=CLASSIFIER_SEED),
    'K Neighbors': KNeighborsClassifier(n_neighbors=3),
    'Decision Tree': tree.DecisionTreeClassifier(max_depth=3, random_state=CLASSIFIER_SEED),
}

clf_name = 'Decision Tree'
clf = available_classifiers[clf_name]

print("Classifier type: ", clf_name, "\n")




""" Implementing classifier for not normalized data."""
# Separate the 'Precipitation' target from the feature columns in each split.
Y_train = train_data['Precipitation']
X_train = train_data.drop(columns='Precipitation')

Y_test = test_data['Precipitation']
X_test = test_data.drop(columns='Precipitation')

Y_val = val_data['Precipitation']
X_val = val_data.drop(columns='Precipitation')

# Fit on the training split, then predict every split with the fitted model.
clf.fit(X_train, Y_train)
y_pred_train, y_pred_test, y_pred_val = map(clf.predict, (X_train, X_test, X_val))

# Accuracy per split; the confusion matrix is computed on the test split.
acc_train = accuracy_score(Y_train, y_pred_train)
acc_test = accuracy_score(Y_test, y_pred_test)
acc_val = accuracy_score(Y_val, y_pred_val)
CM = confusion_matrix(Y_test, y_pred_test)


# Report
print("----- NOT NORMALIZED DATA -----")
print("\nAccuracies(train/test/validation):")
print(acc_train, acc_test, acc_val, sep="\n")

print("\nConfusion Matrix:")
print(CM)

# 5-fold cross validation on the training split; each metric is averaged
# over the folds before printing.
scores = cross_validate(
    clf,
    X_train,
    Y_train,
    cv=5,
    scoring=['accuracy','f1','recall','precision'],
)
print("\nCross validation:")
print("CV: ", 5)
for metric_label, metric_key in (
    ("Accuracy: ", 'test_accuracy'),
    ("F1: ", 'test_f1'),
    ("Recall: ", 'test_recall'),
    ("Precision: ", 'test_precision'),
):
    print(metric_label, statistics.mean(scores[metric_key]))





""" Implementing classifier for normalized data."""
# Separate the 'Precipitation' target from the feature columns in each
# normalized split.
Y_train_normalized = train_data_normalized['Precipitation']
X_train_normalized = train_data_normalized.drop(columns='Precipitation')

Y_test_normalized = test_data_normalized['Precipitation']
X_test_normalized = test_data_normalized.drop(columns='Precipitation')

Y_val_normalized = val_data_normalized['Precipitation']
X_val_normalized = val_data_normalized.drop(columns='Precipitation')

# Refit the same estimator object on the normalized training split, then
# predict every normalized split.
clf.fit(X_train_normalized, Y_train_normalized)
y_pred_train_normalized, y_pred_test_normalized, y_pred_val_normalized = map(
    clf.predict,
    (X_train_normalized, X_test_normalized, X_val_normalized),
)

# Accuracy per split; the confusion matrix is computed on the test split.
acc_train_normalized = accuracy_score(Y_train_normalized, y_pred_train_normalized)
acc_test_normalized = accuracy_score(Y_test_normalized, y_pred_test_normalized)
acc_val_normalized = accuracy_score(Y_val_normalized, y_pred_val_normalized)
CM_normalized = confusion_matrix(Y_test_normalized, y_pred_test_normalized)

# Report
print("\n\n----- NORMALIZED DATA -----")
print("\nAccuracies(train/test/validation):")
print(acc_train_normalized, acc_test_normalized, acc_val_normalized, sep="\n")

print("\nConfusion Matrix:")
print(CM_normalized)

# 5-fold cross validation on the normalized training split; each metric is
# averaged over the folds before printing.
scores = cross_validate(
    clf,
    X_train_normalized,
    Y_train_normalized,
    cv=5,
    scoring=['accuracy','f1','recall','precision'],
)
print("\nCross validation:")
print("CV: ", 5)
for metric_label, metric_key in (
    ("Accuracy: ", 'test_accuracy'),
    ("F1: ", 'test_f1'),
    ("Recall: ", 'test_recall'),
    ("Precision: ", 'test_precision'),
):
    print(metric_label, statistics.mean(scores[metric_key]))

# NOTE(review): `objects`, `y_pos` and `performance` are not referenced anywhere
# else in the visible part of this notebook and no figure is drawn from them.
# They look like leftover plotting scaffolding -- confirm nothing downstream
# needs them before deleting.
objects = ('True', 'False')
y_pos = np.arange(len(objects))
performance = [10000,8,6,4,2,1]


# Last expression of the cell: displays the first rows of the prepared frame.
data.head()

Classifier type:  Decision Tree 

----- NOT NORMALIZED DATA -----

Accuracies(train/test/validation):
0.9460227272727273
0.9437537004144464
0.947534516765286

Confusion Matrix:
[[1559    8]
 [  87   35]]

Cross validation:
CV:  5
Accuracy:  0.9379741439748731
F1:  0.46514656974475777
Recall:  0.3746031746031746
Precision:  0.6734434084749339


----- NORMALIZED DATA -----

Accuracies(train/test/validation):
0.9360795454545454
0.8235642391947898
0.8177514792899409

Confusion Matrix:
[[1325  229]
 [  69   66]]

Cross validation:
CV:  5
Accuracy:  0.9332370509548782
F1:  0.44667802135446655
Recall:  0.37072261072261076
Precision:  0.6544173218086261


Unnamed: 0,Date,Rented_Bike_Count,Hour,Temperature,Humidity,Wind_speed,Visibility,Seasons,Holiday,Precipitation
0,1512086400000000000,254,0,-5.2,37,2.2,2000,4,0,False
1,1512086400000000000,204,1,-5.5,38,0.8,2000,4,0,False
2,1512086400000000000,173,2,-6.0,39,1.0,2000,4,0,False
3,1512086400000000000,107,3,-6.2,40,0.9,2000,4,0,False
4,1512086400000000000,78,4,-6.0,36,2.3,2000,4,0,False
