In [1]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

import os

# MongoDB connection string used for both reads and writes.
# It is taken from the MONGO_URI environment variable so that real credentials
# do not have to be stored in the notebook; when the variable is unset the
# original development URI is used, so existing setups keep working.
mongo_uri = os.environ.get(
    "MONGO_URI",
    "mongodb://root:example@mongo/test.coll?authSource=admin",
)

# Build (or reuse) a local Spark session named "jupyter" and request the
# MongoDB Spark connector package. The same URI is configured as the
# connector's input and output location.
spark = (
    SparkSession.builder
    .appName("jupyter")
    .master("local")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1")
    .config("spark.mongodb.input.uri", mongo_uri)
    .config("spark.mongodb.output.uri", mongo_uri)
    .getOrCreate()
)

# Last expression of the cell: display the session summary
spark

In [2]:
%matplotlib inline

import ast
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

    
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold

from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score

# sklearn :: evaluation metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Select seaborn's 'whitegrid' style so that charts drawn after this point use it
sns.set_style('whitegrid')

# pandas display settings:
#   - show at most 24 columns when a frame is rendered
#   - render floats with exactly three decimals instead of scientific notation
pd.options.display.max_columns = 24
pd.options.display.float_format = lambda number: '%.3f' % number

In [3]:
# Load the dataset from 'final_part_1.csv' (path is relative to the working directory)
X = pd.read_csv('final_part_1.csv')

In [4]:

# Preview the first five rows of the loaded frame
X.head(5)

Unnamed: 0,date,count,day,night,DI,LU,MA,ME,JE,VE,SA,1,...,5,6,7,8,9,10,11,12,11-12,13-14-15,16-17-18,19
0,2012-01-01-d,7,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0,0,0
1,2012-01-01-n,32,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0,0,0
2,2012-01-02-d,12,1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0,0,0
3,2012-01-02-n,12,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0,0,0
4,2012-01-03-d,24,1,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0,0,0


In [5]:
# Split the frame into features (X) and target (y).
# The 'date' column is removed from the features; 'count' is moved out of X
# and becomes the target series.
# NOTE: column '19' stays in X - the statement that removed it was commented out.
del X['date']
y = X.pop('count')

# Give both objects a fresh 0..n-1 index, discarding the previous one
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

# Show the first rows of the remaining feature columns
X.head(5)

Unnamed: 0,day,night,DI,LU,MA,ME,JE,VE,SA,1,2,3,...,5,6,7,8,9,10,11,12,11-12,13-14-15,16-17-18,19
0,1,0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0
1,0,1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0
2,1,0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0
3,0,1,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0
4,1,0,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0


In [6]:
# Collect the feature column names into a plain list and report how many there are.
# list() over the column index replaces the manual append loop and no longer
# leaves a stray loop variable behind in the notebook namespace.
list_columns = list(X.columns)
print(len(list_columns))

25


In [7]:
# 10-fold cross-validation of a 500-tree random forest on all features.
# With no `scoring` argument the estimator's own default scorer is used
# (R^2 for scikit-learn regressors).
# random_state pins the forest's internal randomness so that the reported mean
# score is the same after every Restart & Run All; 42 matches the seed already
# used for the train/test split in this notebook.
cv_model = RandomForestRegressor(n_estimators=500, random_state=42)
scores = cross_val_score(cv_model, X, y, cv=10)
print('cross_val_score', np.mean(scores))

cross_val_score 0.6002623473356052


In [8]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print('X_train: ', X_train.shape, 'y_train:', y_train.shape, '\nX_test: ', X_test.shape, 'y_test: ', y_test.shape)

# Fit a 500-tree random forest on the training part and predict the held-out rows.
# The forest is seeded as well: a repeatable split alone is not enough, because
# an unseeded forest yields different predictions and feature importances on
# every run.
model = RandomForestRegressor(n_estimators=500, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

X_train:  (1005, 25) y_train: (1005,) 
X_test:  (495, 25) y_test:  (495,)


In [9]:
# Rank the features by their importance in the fitted forest.
# feature_importances_ is read once up front rather than on every loop
# iteration: for forest estimators scikit-learn derives it from the trees each
# time the attribute is accessed, so the per-iteration lookup repeated that work.
importances = model.feature_importances_
fi = [[col, importance] for col, importance in zip(X_test.columns, importances)]

# Resulting frame: column 0 holds the feature name, column 1 its importance;
# rows are ordered from most to least important.
features = pd.DataFrame(fi).sort_values(1, ascending=False)
print(type(features))
print(features)

# Names of the features whose importance is below 0.0003
useless = list(features.loc[features[1] < 0.0003, 0])
print(useless)

<class 'pandas.core.frame.DataFrame'>
           0     1
1      night 0.305
0        day 0.278
2         DI 0.082
8         SA 0.073
23  16-17-18 0.042
9          1 0.029
20        12 0.022
7         VE 0.021
19        11 0.017
4         MA 0.014
21     11-12 0.013
17         9 0.012
3         LU 0.012
14         6 0.012
15         7 0.009
13         5 0.009
11         3 0.008
6         JE 0.008
18        10 0.008
5         ME 0.007
10         2 0.006
16         8 0.005
22  13-14-15 0.004
12         4 0.004
24        19 0.000
['19']


In [13]:
# Convert the pandas importance table into a Spark DataFrame
df_features = spark.createDataFrame(features)

# Append the rows through the MongoDB Spark data source. No target is given
# here, so the destination comes from the session configuration
# (spark.mongodb.output.uri set when the session was built).
(
    df_features.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("append")
    .save()
)

In [4]:
# Stop the Spark session now that the notebook no longer needs it
spark.stop()