In [4]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler,normalize
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,KFold,cross_val_score,RepeatedKFold
from sklearn.metrics import r2_score,accuracy_score,mean_absolute_error,mean_squared_error

In [2]:
def datapreprocess(df):
    df['day']=df['datetime'].apply(lambda x:datetime.strptime(x,'%m/%d/%Y %H:%M').day)
    df['month']=df['datetime'].apply(lambda x:datetime.strptime(x,'%m/%d/%Y %H:%M').month)
    df['year']=df['datetime'].apply(lambda x:datetime.strptime(x,'%m/%d/%Y %H:%M').year)
    df['hour']=df['datetime'].apply(lambda x:datetime.strptime(x,'%m/%d/%Y %H:%M').hour)
    df.drop('datetime',axis=1,inplace=True)
    for col in ['season', 'holiday', 'workingday', 'month','hour','weather','day']:
        df[col] = df[col].astype('category')
    weather_map={'Clear + Few clouds':'CF','Light Snow, Light Rain':'LSLR','Mist + Cloudy':'MC','Heavy Rain + Thunderstorm':'HRT'}
    df['weather'] = df['weather'].apply(lambda x : weather_map.get(x.strip()))
    

In [5]:
def catgoricalencoding(df):
    cols_for_label_encoding=['season','weather','year','month','hour','day']
    dataset = pd.get_dummies(columns=cols_for_label_encoding, data=df)
    return dataset

In [23]:
def get_outlier(df):
    features = df.columns
    outliers  = []
    for i, feature in enumerate(features):
        if ( (df[feature].dtype == 'float64')):
            # Calculate Q1 (25th percentile of the data) for the given feature
            Q1 = np.percentile(df[feature], 25)
            # Calculate Q3 (75th percentile of the data) for the given feature
            Q3 = np.percentile(df[feature], 75)
            # Use the interquartile range to calculate an outlier step
            step = 1.5 * (Q3 - Q1)
            feature_outliers = df[~((df[feature] >= Q1 - step) & (df[feature] <= Q3 + step))]
            outliers.extend(list(feature_outliers.index.values))
            print('Feature Name: {}, No. of outliers: {}\n'.format(feature, len(feature_outliers.index)))
    
    multi_feature_outliers = (Counter(outliers) - Counter(set(outliers))).keys()
    #print(outliers)
    return outliers

In [24]:
train=pd.read_csv('train.csv')
datapreprocess(train)
train_laebl=pd.read_csv('train_label.csv',header=None)
train_laebl.rename(columns={0:'Total_bookings'},inplace=True)
train['Total_bookings']=train_laebl['Total_bookings']


In [25]:
from collections import Counter
multi_feature_outliers = get_outlier(train)

Feature Name: temp, No. of outliers: 0

Feature Name: atemp, No. of outliers: 0

Feature Name: windspeed, No. of outliers: 182



In [26]:
train=train.drop(train.index[list(multi_feature_outliers)]).reset_index(drop=True)

In [27]:
dataset=catgoricalencoding(train)

Y=dataset['Total_bookings']

dataset.drop(['atemp','weather_HRT','Total_bookings'],axis=1,inplace=True)

In [28]:
def scaleData(dataset):
    standardScaler = StandardScaler()
    scaled_data = standardScaler.fit_transform(dataset[['humidity','temp','windspeed']])
    scaled_data_df=pd.DataFrame(scaled_data,columns=['humidity','temp','windspeed'])
    dataset[['humidity','temp','windspeed']] = scaled_data_df[['humidity','temp','windspeed']]
    return dataset
dataseta=scaleData(dataset)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [29]:
X_train, X_test,y_train, y_test = train_test_split(dataset,Y,test_size=0.3,random_state=2)

In [30]:
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0) 

regressor.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [31]:
regressor.score(X_train,y_train)

0.9867191556470022

In [32]:
y_predict=regressor.predict(X_test)

In [33]:
r2_score(y_test,y_predict) , mean_absolute_error(y_test,y_predict)

(0.9047498575876904, 35.8492337763878)

CHECK ON TEST DATA

In [37]:
test=pd.read_csv('test.csv')
test_label=pd.read_csv('test_label.csv',header=None)
datapreprocess(test)
test_data=catgoricalencoding(test)
test_data.drop(['atemp'],axis=1,inplace=True)
test_data=scaleData(test_data)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [38]:
bookng_predict_test=regressor.predict(test_data)

In [39]:
r2_score(test_label,bookng_predict_test) , mean_absolute_error(test_label,bookng_predict_test)

(0.9120482605935244, 35.13947658402204)