In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import datetime as dt
import os
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn import metrics

%matplotlib inline

In [2]:
# from google.colab import files
# uploaded = files.upload()

In [4]:
bikes = pd.read_csv('Data/train.csv')

In [5]:
bikes.head()

Unnamed: 0,ID,Date,y,Hour,Temperature(�C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(�C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


In [6]:
bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5760 entries, 0 to 5759
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         5760 non-null   int64  
 1   Date                       5760 non-null   object 
 2   y                          5760 non-null   int64  
 3   Hour                       5760 non-null   int64  
 4   Temperature(�C)            5760 non-null   float64
 5   Humidity(%)                5760 non-null   int64  
 6   Wind speed (m/s)           5760 non-null   float64
 7   Visibility (10m)           5760 non-null   int64  
 8   Dew point temperature(�C)  5760 non-null   float64
 9   Solar Radiation (MJ/m2)    5760 non-null   float64
 10  Rainfall(mm)               5760 non-null   float64
 11  Snowfall (cm)              5760 non-null   float64
 12  Seasons                    5760 non-null   object 
 13  Holiday                    5760 non-null   objec

In [7]:
def change_col_name(df, col_names_dic):
    df.rename(columns=col_names_dic , inplace=True)

col_names_dic = {'Temperature(�C)':'Temperature(C)',
                 'Dew point temperature(�C)':'Dew point temperature(C)'}
change_col_name(bikes,col_names_dic)

In [8]:
bikes['Date'] = pd.to_datetime(bikes['Date'])
bikes['Date']=bikes['Date'].map(dt.datetime.toordinal)

In [9]:
#Encoding string data 

le = LabelEncoder()

def encode_columns(dataframe, lst_of_col):
    for col in lst_of_col:
        dataframe[col] = le.fit_transform(dataframe[col].values)

lst_of_col = ['Seasons',
              'Holiday', 
              'Functioning Day' ]

encode_columns(bikes, lst_of_col)

In [10]:
bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5760 entries, 0 to 5759
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        5760 non-null   int64  
 1   Date                      5760 non-null   int64  
 2   y                         5760 non-null   int64  
 3   Hour                      5760 non-null   int64  
 4   Temperature(C)            5760 non-null   float64
 5   Humidity(%)               5760 non-null   int64  
 6   Wind speed (m/s)          5760 non-null   float64
 7   Visibility (10m)          5760 non-null   int64  
 8   Dew point temperature(C)  5760 non-null   float64
 9   Solar Radiation (MJ/m2)   5760 non-null   float64
 10  Rainfall(mm)              5760 non-null   float64
 11  Snowfall (cm)             5760 non-null   float64
 12  Seasons                   5760 non-null   int32  
 13  Holiday                   5760 non-null   int32  
 14  Function

In [11]:
bikes.head()

Unnamed: 0,ID,Date,y,Hour,Temperature(C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,0,736341,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,3,1,1
1,1,736341,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,3,1,1
2,2,736341,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,3,1,1
3,3,736341,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,3,1,1
4,4,736341,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,3,1,1


In [12]:
bikes.columns

Index(['ID', 'Date', 'y', 'Hour', 'Temperature(C)', 'Humidity(%)',
       'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day'],
      dtype='object')

### **Building Decision Tree Model**

In [13]:
feature_cols = ['ID', 'Date', 'Hour', 'Temperature(C)', 'Humidity(%)',
       'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day']
X = bikes[feature_cols]
y = bikes['y']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test


In [15]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [22]:
#RMSLE = np.sqrt(1/len(######y#######)* np.sum(np.log(y_pred+1)-np.log(y_test+1))**2)

RMSLE = np.sqrt(1/len(y_test)* np.sum(np.log(y_pred+1)-np.log(y_test+1))**2)
print(RMSLE)

0.28149232360533183


In [17]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.05150462962962963


In [18]:
import sklearn.metrics

In [21]:
msle = sklearn.metrics.mean_squared_log_error(y_test, y_pred)
rmsle = np.sqrt(msle)
print("RMSLE = ",rmsle)

RMSLE =  0.6615557960360181


# submission file

In [23]:
# df_test = pd.read_csv("Data/test.csv") 

# #########################################################
# #   لازم تعملى encode للكولمز زى ما انتى عاملاها فى ال train 
# ###############################################df_test = encode_columns(df_test)




# X_test = df_test[features_selected_test]

# y_test_predicted = model.predict(X_test)

# df_test['y'] = np.round(y_test_predicted)

# df_test[['ID','y']]