<a href="https://colab.research.google.com/github/BartekMasiak/Python-in-ML-DS-Seul-Bike-Sharing-Dateset/blob/main/test_TPR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# importing libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import statistics

from sklearn import svm, tree
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression

"""
Part 1. -- Importing data and preparing it for further processing.
"""
TPR_normalized = []
TPR_not_normalized = []

for i in range(100):
  # importing dataset
  DATAFILE='/content/drive/MyDrive/SeoulBikeData.csv'
  data = pd.read_csv(DATAFILE, encoding='unicode_escape')


  # changing date format
  data.Date = pd.to_datetime( data.Date, format="%d/%m/%Y")
  data.head()

  # cleaning data
  def cleanse(x):
      return re.sub(r'\(.*$','',x).strip().replace(' ','_')
  data.columns = data.columns.to_series().apply(cleanse)


  # deleting dew point & solar radiation
  data = data.drop(columns='Dew_point_temperature')
  data = data.drop(columns='Solar_Radiation')


  # changing categorical data into numerical
  data['Holiday'].replace(['Holiday', 'No Holiday'], [1, 0], inplace=True)
  data['Seasons'].replace(['Spring', 'Summer', 'Autumn', 'Winter'], [1, 2, 3, 4], inplace=True)

  # deleting duplicates
  data = data.drop_duplicates()

  # changing date format to numerical
  data['Date'] = pd.to_numeric(pd.to_datetime(data['Date']))


  # deleting non functioning days
  non_functioning_days = data.query('Functioning_Day=="No"').Date.unique()
  data = data[ ~data.Date.isin(non_functioning_days)]
  data.drop(columns='Functioning_Day', inplace=True)


  # merging Rainfall and Snowfall into one feature - Precipitation
  data['Precipitation'] = data['Rainfall'] + data['Snowfall']
  data['Precipitation'].copy()
  data = data.drop(columns=['Rainfall','Snowfall'])

  """
  Making Precipitation binary. We assume that if Precipitation is higher than 1 we
  consider a day to be rainy/snowy.
  """
  data['Precipitation'] = data['Precipitation'] > 1


  # comment or uncomment line belowe whether you need data without or with Rentet Bike Count
  #data = data.drop(columns='Rented_Bike_Count')


  # normalization
  minmax_scaler = MinMaxScaler().set_output(transform="pandas")
  data_normalized = minmax_scaler.fit_transform(data)

  # splitting data not normalized
  train_data, test_data = train_test_split(data, test_size=0.5)
  test_data, val_data = train_test_split(train_data, test_size=0.6)

  # splitting data normalized
  train_data_normalized, test_data_normalized = train_test_split(data_normalized, test_size=0.5)
  test_data_normalized, val_data_normalized = train_test_split(train_data, test_size=0.6)

  """
  Part 2. -- Buliding model & outcomes assesment.
  """

  # Choosing classifier. (uncomment needed)


  #clf = svm.SVC(kernel='rbf')
  #clf_name = 'SVM'

  #clf = MLPClassifier()
  #clf_name = 'MLP'

  clf = KNeighborsClassifier(n_neighbors=3)
  clf_name = 'K Neighbors'

  #clf = tree.DecisionTreeClassifier(max_depth=4)
  #clf_name = 'Decision Tree'


  """ Implementing classifier for not normalized data."""
  # Dividing data:
  X_train = train_data.drop('Precipitation', axis = 1)
  Y_train = train_data['Precipitation']

  X_test = test_data.drop('Precipitation', axis = 1)
  Y_test = test_data['Precipitation']

  X_val = val_data.drop('Precipitation', axis = 1)
  Y_val = val_data['Precipitation']

  # Building classifier & calculating predicted values:
  clf.fit(X_train, Y_train)
  y_pred_train = clf.predict(X_train)
  y_pred_test = clf.predict(X_test)
  y_pred_val = clf.predict(X_val)

  # Calculating confusion matrix:
  CM = confusion_matrix(Y_test, y_pred_test)

  TPR_not_normalized.append(CM[1][1]/(CM[1][1]+CM[1][0]))




  """ Implementing classifier for normalized data."""
  # Dividing data:
  X_train_normalized = train_data_normalized.drop('Precipitation', axis = 1)
  Y_train_normalized = train_data_normalized['Precipitation']

  X_test_normalized = test_data_normalized.drop('Precipitation', axis = 1)
  Y_test_normalized = test_data_normalized['Precipitation']

  X_val_normalized = val_data_normalized.drop('Precipitation', axis = 1)
  Y_val_normalized = val_data_normalized['Precipitation']

  # Building classifier & calculating predicted values:
  clf.fit(X_train_normalized, Y_train_normalized)
  y_pred_train_normalized = clf.predict(X_train_normalized)
  y_pred_test_normalized = clf.predict(X_test_normalized)
  y_pred_val_normalized = clf.predict(X_val_normalized)

  # Calculating confusion matrix:
  CM_normalized = confusion_matrix(Y_test_normalized, y_pred_test_normalized)
  TPR_normalized.append(CM_normalized[1][1]/(CM_normalized[1][1]+CM_normalized[1][0]))

print("----- NOT NORMALIZED ------")
print("TPR MEAN:", statistics.mean(TPR_not_normalized))

print("----- NORMALIZED ------")
print("TPR MEAN:", statistics.mean(TPR_normalized))






A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns='Functioning_Day', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Precipitation'] = data['Rainfall'] + data['Snowfall']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns='Functioning_Day', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentat

----- NOT NORMALIZED ------
TPR MEAN: 0.7718600623371461
----- NORMALIZED ------
TPR MEAN: 0.0
