# Part 01 - EDA with Pyspark

Gradient Boosted Trees applied to Fraud detection

#### Pyspark libraries

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col, countDistinct
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, array, lit
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.sql.functions import pow, col
import datetime
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import col, countDistinct

#### Python libraries

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Start (or reuse) the Spark session used by every DataFrame operation below.
spark = (
    SparkSession.builder
    .appName('FraudTreeMethods')
    .getOrCreate()
)

## Read Data

In [None]:
# `sys` is not imported by the library cells above, so import it here;
# without this the cell raises NameError on a fresh kernel.
import sys

# Insert the dataset directory at position 1 of the module search path.
sys.path.insert(1, '../work/data_set')

In [None]:
# Build the path of the training sample and load it, using the first row as
# the header and letting Spark infer the column types.
dataset_address = '../work/data_set/'
data_name = 'train_sample.csv'
path = dataset_address + data_name
RDD = spark.read.csv(path, header=True, inferSchema=True)

In [6]:
# Preview the first five rows of the loaded data.
RDD.show(5)

+------+---+------+---+-------+-------------------+---------------+-------------+
|    ip|app|device| os|channel|         click_time|attributed_time|is_attributed|
+------+---+------+---+-------+-------------------+---------------+-------------+
| 87540| 12|     1| 13|    497|2017-11-07 09:30:38|           null|            0|
|105560| 25|     1| 17|    259|2017-11-07 13:40:27|           null|            0|
|101424| 12|     1| 19|    212|2017-11-07 18:05:24|           null|            0|
| 94584| 13|     1| 13|    477|2017-11-07 04:58:08|           null|            0|
| 68413| 12|     1|  1|    178|2017-11-09 09:00:09|           null|            0|
+------+---+------+---+-------+-------------------+---------------+-------------+
only showing top 5 rows



In [None]:
# Print a label line, then the column names and inferred types.
print('RDD.printSchema is \n')
RDD.printSchema()

## Convert the click time to day and hour and add it to data.

In [None]:
# `hour` and `minute` are not part of the library cell at the top.
from pyspark.sql.functions import hour, minute, dayofmonth

# Derive hour-of-day and day-of-month columns from the click timestamp.
RDD = (
    RDD
    .withColumn('hour', hour(RDD.click_time))
    .withColumn('day', dayofmonth(RDD.click_time))
)

RDD.show(5)

## Feature engineering

Feature engineering by grouping and merging, as follows.

In the Python (pandas) EDA we did the following:
```python
gp = df[['ip','day','hour','channel']]\
    .groupby(by=['ip','day','hour'])[['channel']]\
    .count().reset_index()\
    .rename(index=str, columns={'channel': '*ip_day_hour_count_channel'})
df = df.merge(gp, on=['ip','day','hour'], how='left')

```
We translate it to PySpark as follows.

In [None]:
# Count clicks per (ip, day, hour). The dict-style agg yields a column named
# "count(channel)", which is renamed to the feature name.
gp = (
    RDD.select("ip", "day", "hour", "channel")
    .groupBy("ip", "day", "hour")
    .agg({"channel": "count"})
    .withColumnRenamed("count(channel)", "*ip_day_hour_count_channel")
    .sort(col("ip"))
)

# Attach the per-group count to every row. No `how` is passed, so Spark's
# default join type applies (the pandas version used how='left').
RDD = RDD.join(gp, ["ip", "day", "hour"]).sort(col("ip"))

In [None]:
# List the column names to confirm the new feature column is present.
print("RDD Columns name = \n", RDD.columns)

In the Python (pandas) EDA we did the following:
```python
gp = df[['ip', 'app', 'channel']].groupby(by=['ip', 'app'])[['channel']].\
            count().reset_index().\
            rename(index=str, columns={'channel': '*ip_app_count_channel'})
df = df.merge(gp, on=['ip','app'], how='left')

```
We translate it to PySpark as follows.

In [None]:
# Count clicks per (ip, app) and rename the generated "count(channel)" column.
gp = (
    RDD.select("ip", "app", "channel")
    .groupBy("ip", "app")
    .agg({"channel": "count"})
    .withColumnRenamed("count(channel)", "*ip_app_count_channel")
    .sort(col("ip"))
)

# Join the counts back onto the rows by the grouping keys.
RDD = RDD.join(gp, ["ip", "app"]).sort(col("ip"))

In [None]:
# List the column names to confirm the new feature column is present.
print("RDD Columns name = \n", RDD.columns)

In the Python (pandas) EDA we did the following:
```python
gp = df[['ip','app', 'os', 'channel']].\
            groupby(by=['ip', 'app', 'os'])[['channel']].\
            count().reset_index().\
            rename(index=str, columns={'channel': '*ip_app_os_count_channel'})
df = df.merge(gp, on=['ip','app', 'os'], how='left')

```
We translate it to PySpark as follows.

In [None]:
# Count clicks per (ip, app, os) and rename the generated "count(channel)" column.
gp = (
    RDD.select('ip', 'app', 'os', 'channel')
    .groupBy('ip', 'app', 'os')
    .agg({"channel": "count"})
    .withColumnRenamed("count(channel)", "*ip_app_os_count_channel")
    .sort(col("ip"))
)

# Join the counts back onto the rows by the grouping keys.
RDD = RDD.join(gp, ['ip', 'app', 'os']).sort(col("ip"))

In [None]:
# List the column names to confirm the new feature column is present.
print("RDD Columns name = \n", RDD.columns)

In the Python (pandas) EDA we did the following:
```python
gp = df[['ip','day','hour','channel']].\
            groupby(by=['ip','day','channel'])[['hour']].\
            var().reset_index().\
            rename(index=str, columns={'hour': '*ip_day_chan_var_hour'})
df = df.merge(gp, on=['ip','day','channel'], how='left')

```
We translate it to PySpark as follows.

In [None]:
# Variance of `hour` per (ip, day, channel). The dict-style agg yields a column
# named "variance(hour)", which is renamed to the feature name.
# This frame is only inspected below; it is not joined back into RDD here.
gp = (
    RDD.select('ip', 'day', 'hour', 'channel')
    .groupBy('ip', 'day', 'channel')
    .agg({"hour": "variance"})
    .withColumnRenamed("variance(hour)", "*ip_day_chan_var_hour")
    .sort(col("ip"))
)

Count the NaN and null values in each column of `gp`.

In [None]:
# For every column of gp, count the rows whose value is NaN or null
# (`when` without `otherwise` gives null for non-matching rows, which `count` skips).
gp.select(
    [
        count(when(isnan(c) | col(c).isNull(), c)).alias(c)
        for c in gp.columns
    ]
).show()

Recall the per-column null counts from the Python EDA:

```python
ip                                0
app                               0
device                            0
os                                0
channel                           0
click_time                        0
is_attributed                     0
hour                              0
day                               0
*ip_day_hour_count_channel        0
*ip_app_count_channel             0
*ip_app_os_count_channel          0
*ip_tchan_count               89123
*ip_app_os_var                89715
*ip_app_channel_var_day       84834
*ip_app_channel_mean_hour         0
dtype: int64

```
These three features are almost entirely null, so we skip the corresponding groupings:

```python
*ip_tchan_count               10877 non-null float64
*ip_app_os_var                10285 non-null float64
*ip_app_channel_var_day       15166 non-null float64

```
Note that the last `gp` (`*ip_day_chan_var_hour`) was computed only to inspect its nulls; it was not joined into the data.

**Let's Keep going:**

In the Python (pandas) EDA we did the following:
```python
gp = df[['ip','app', 'channel','hour']].\
            groupby(by=['ip', 'app', 'channel'])[['hour']].\
            mean().reset_index().\
            rename(index=str, columns={'hour': '*ip_app_channel_mean_hour'})

df = df.merge(gp, on=['ip','app', 'channel'], how='left')

```
We translate it to PySpark as follows.

In [None]:
# Mean of `hour` per (ip, app, channel). The dict-style "mean" agg yields a
# column named "avg(hour)", which is renamed to the feature name.
gp = (
    RDD.select('ip', 'app', 'channel', 'hour')
    .groupBy('ip', 'app', 'channel')
    .agg({"hour": "mean"})
    .withColumnRenamed("avg(hour)", "*ip_app_channel_mean_hour")
    .sort(col("ip"))
)

# Join the means back onto the rows by the grouping keys.
RDD = RDD.join(gp, ['ip', 'app', 'channel']).sort(col("ip"))

In [None]:
# List the column names to confirm the new feature column is present.
print("RDD Columns name = \n", RDD.columns)

In [None]:
# Preview five rows with all engineered feature columns attached.
RDD.show(5)

### Get summary

In [None]:
# Summary statistics for the id-like columns and the derived time parts.
cols1 = ['ip', 'app', 'channel', 'os', 'day', 'hour']
RDD.describe(cols1).show()

In [None]:
# Summary statistics for the device, the two timestamp columns and the label.
cols2 = ['device', 'click_time', 'attributed_time', 'is_attributed']
RDD.describe(cols2).show()

In [None]:
# Summary statistics for the three engineered count features.
cols3 = [
    '*ip_day_hour_count_channel',
    '*ip_app_count_channel',
    '*ip_app_os_count_channel',
]
RDD.describe(cols3).show()

Count the distinct values in each column of the data.

In [None]:
# Distinct-value count for every column in cols1 and cols2, one output column each.
cols4 = cols1 + cols2
RDD.agg(
    *[countDistinct(col(c)).alias(c) for c in cols4]
).show()

In [None]:
# Distinct-value count for each engineered count feature.
RDD.agg(
    *[countDistinct(col(c)).alias(c) for c in cols3]
).show()

## Over sampling the data

* Over sampling
* Duplicate the minority rows
* Combine both oversampled minority rows and previous majority rows

In [None]:
# Separate the rows by label and measure how many times larger the
# majority class (label 0) is than the minority class (label 1).
major_df = RDD.filter(col("is_attributed") == 0)
minor_df = RDD.filter(col("is_attributed") == 1)

ratio = int(major_df.count() / minor_df.count())
print("ratio: {}".format(ratio))

# Replication indices consumed by the duplication step in the next cell.
a = range(ratio)

In [None]:
# Duplicate each minority row once per element of `a`: attach an array of
# literals, explode it into one row per element, then drop the helper column.
oversampled_df = (
    minor_df
    .withColumn("dummy", explode(array([lit(x) for x in a])))
    .drop('dummy')
)

In [None]:
# Combine the oversampled minority rows with the original majority rows.
RDD = major_df.unionAll(oversampled_df)

In [None]:
# List the column names after the union.
print("RDD Columns name = \n", RDD.columns)

### Convert a sample of the data to pandas for visualization

* First take a sample from big RDD
* Pass the sample into the pandas data frame

In [None]:
# Draw a 0.01-fraction sample without replacement (seed 42) and collect it
# into a pandas DataFrame for plotting.
sub_RDD = RDD.sample(withReplacement=False, fraction=0.01, seed=42)
data_pd = sub_RDD.toPandas()

In [None]:
# One 50-bin histogram per column of the sampled frame.
data_pd.hist(facecolor='green', figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Scatter of channel against app; low alpha so dense regions stand out.
data_pd.plot.scatter(x="app", y="channel", alpha=0.1, figsize=(8, 5))

In [None]:
# Per-feature density histograms, attributed vs. not-attributed rows.
plt.figure(figsize=(20,24))

cols = ['app','device','os', 
        'channel', 'hour', 'day',
        '*ip_day_hour_count_channel', '*ip_app_count_channel',
        '*ip_app_os_count_channel', '*ip_app_channel_mean_hour']

sub_attributed_mask = data_pd["is_attributed"] == 1
sub_Not_attributed_mask = data_pd["is_attributed"] == 0

# The loop variables are deliberately NOT named `count` / `col`: those names
# are the pyspark.sql.functions imported at the top of the notebook, and
# rebinding them here would break any Spark cell that is (re-)run afterwards.
for plot_idx, feature in enumerate(cols, 1):

    # 4x3 grid; subplot positions are 1-based, hence enumerate(..., 1).
    plt.subplot(4, 3, plot_idx)
    # .loc selects rows and column in one step instead of chained indexing.
    plt.hist([data_pd.loc[sub_attributed_mask, feature],
              data_pd.loc[sub_Not_attributed_mask, feature]],
             color=['goldenrod', 'grey'],
             bins=20, ec='k', density=True)

    plt.title('Count distribution by {}'.format(feature), fontsize=12)
    plt.legend(['attributed', 'Not_attributed'])
    plt.xlabel(feature); plt.ylabel('density')

# To save the figure, uncomment:
# path = '../Figures/'
# file_name = 'hist_dens_by_par.png'
# plt.savefig(path+file_name)

### Transformation

Apply the transformation chosen in the previous EDA.

In [None]:
# Columns that receive the power transform.
trans_colmns = [
    'app', 'device', 'os', 'day',
    '*ip_day_hour_count_channel',
    '*ip_app_count_channel',
    '*ip_app_os_count_channel',
]

In [None]:
def transformer(x):
    """Return `x` raised to the power 0.05.

    `pow` is resolved at call time. This notebook imports `pow` from
    pyspark.sql.functions in its first cell, and the cells below pass a
    column name as `x`.
    """
    return pow(x, 0.05)

Apply the function defined above to each column, as follows.

In [None]:
# Replace each column listed in `trans_colmns` with its power-transformed
# value. Driving the loop from the list keeps the declared column set and the
# columns actually transformed from drifting apart.
# Note: the columns are overwritten in place, so re-running this cell applies
# the transform a second time.
for column_name in trans_colmns:
    RDD = RDD.withColumn(column_name, transformer(column_name))

RDD.show()

In [None]:
# Display the current column names.
RDD.columns

Drop the click time and attributed time

In [None]:
# The two timestamp columns are not in the feature list; drop them.
RDD = RDD.drop('click_time','attributed_time')

# Split the data into training and test sets (30% held out for testing).
# A fixed seed makes the split reproducible across runs.
# NOTE(review): the minority rows were duplicated before this split, so copies
# of the same row can fall on both sides and inflate the held-out score —
# consider splitting first and oversampling only the training part.
(trainingData, testData) = RDD.randomSplit([0.7, 0.3], seed=42)

# Feature columns only. The label `is_attributed` must NOT be listed here:
# putting it in the feature vector leaks the target into the model, and the
# same list is reused later to assemble data that is being predicted.
cols = ['ip',
 'app',
 'channel',
 'os',
 'day',
 'hour',
 'device',
 '*ip_day_hour_count_channel',
 '*ip_app_count_channel',
 '*ip_app_os_count_channel',
 '*ip_app_channel_mean_hour']

# Pack the feature columns into a single "features" vector column; the label
# column stays in the DataFrame alongside it.
assembler = VectorAssembler(inputCols = cols,outputCol="features")
trainingData = assembler.transform(trainingData)
testData = assembler.transform(testData)

## Train the model

In [None]:
# Configure the gradient-boosted tree classifier (maxIter=20, maxDepth=4).
gbt = GBTClassifier(
    labelCol="is_attributed",
    featuresCol="features",
    maxIter=20,
    maxDepth=4,
)

# Fit on the assembled training rows.
model = gbt.fit(trainingData)

# Score the held-out rows.
predictions = model.transform(testData)

# Show a few predictions next to the true label and the feature vector.
predictions.select("prediction", "is_attributed", "features").show(5)

In [None]:
# Compare predictions with the true label and report accuracy and its complement.
evaluator = MulticlassClassificationEvaluator(
    labelCol="is_attributed",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

print("Test Error = %g" % (1.0 - accuracy))
print("Test accuracy = %g" % (accuracy))

In [None]:
# Count how many rows received each predicted value.
predictions.groupBy('prediction').count().show()

# Apply to test, predict

In [None]:
# Load the test file the same way as the training sample
# (this rebinds data_name / dataset_address / path).
dataset_address = '../work/data_set/'
data_name = 'test.csv'
path = dataset_address + data_name
test = spark.read.csv(path, header=True, inferSchema=True)
test.show(5)

Compare the training-data schema (below) with the test schema to make sure the columns match.


```python
RDD.printSchema is 

root
 |-- ip: integer (nullable = true)
 |-- app: integer (nullable = true)
 |-- device: integer (nullable = true)
 |-- os: integer (nullable = true)
 |-- channel: integer (nullable = true)
 |-- click_time: string (nullable = true)
 |-- attributed_time: string (nullable = true)
 |-- is_attributed: integer (nullable = true)

```

In [None]:
# Print a label line, then the test frame's column names and inferred types.
print('test.printSchema is \n')
test.printSchema()

In [None]:
from pyspark.sql.functions import hour, minute, dayofmonth

# Derive hour-of-day and day-of-month columns from the test click timestamp.
test = (
    test
    .withColumn('hour', hour(test.click_time))
    .withColumn('day', dayofmonth(test.click_time))
)

test.show(5)

Apply the same feature engineering to the test data.

In [None]:
# Count test clicks per (ip, day, hour) and rename the generated column.
gp = (
    test.select("ip", "day", "hour", "channel")
    .groupBy("ip", "day", "hour")
    .agg({"channel": "count"})
    .withColumnRenamed("count(channel)", "*ip_day_hour_count_channel")
)

# Join the counts back onto the test rows by the grouping keys.
test = test.join(gp, ["ip", "day", "hour"])

In [None]:
# Count test clicks per (ip, app) and rename the generated column.
gp = (
    test.select("ip", "app", "channel")
    .groupBy("ip", "app")
    .agg({"channel": "count"})
    .withColumnRenamed("count(channel)", "*ip_app_count_channel")
)

# Join the counts back onto the test rows by the grouping keys.
test = test.join(gp, ["ip", "app"])

In [None]:
# Count test clicks per (ip, app, os) and rename the generated column.
gp = (
    test.select('ip', 'app', 'os', 'channel')
    .groupBy('ip', 'app', 'os')
    .agg({"channel": "count"})
    .withColumnRenamed("count(channel)", "*ip_app_os_count_channel")
)

# Join the counts back onto the test rows by the grouping keys.
test = test.join(gp, ['ip', 'app', 'os'])

In [None]:
# Mean of `hour` per (ip, app, channel) on the test data; the "mean" agg
# yields a column named "avg(hour)", which is renamed to the feature name.
gp = (
    test.select('ip', 'app', 'channel', 'hour')
    .groupBy('ip', 'app', 'channel')
    .agg({"hour": "mean"})
    .withColumnRenamed("avg(hour)", "*ip_app_channel_mean_hour")
)

# Join the means back onto the test rows by the grouping keys.
test = test.join(gp, ['ip', 'app', 'channel'])

In [None]:
# Preview five test rows with the engineered feature columns attached.
test.show(5)

In [None]:
# Apply the same power transform to the test data, driven by the same
# `trans_colmns` list used for training so both sets are treated identically.
# Note: the columns are overwritten in place, so re-running this cell applies
# the transform a second time.
for column_name in trans_colmns:
    test = test.withColumn(column_name, transformer(column_name))

In [None]:
# Preview five test rows after the power transform.
test.show(5)

In [None]:
# Assemble the feature vector on the test data from the columns in `cols`.
# NOTE(review): every column named in `cols` must exist in `test`; a label
# column such as `is_attributed` would not normally be present in an
# unlabeled test file — confirm before running.
assembler = VectorAssembler(outputCol="features", inputCols=cols)
test = assembler.transform(test)

# Score the test rows with the trained model.
predictions = model.transform(test)

# Keep only the row id and the predicted value.
data_to_submit = predictions.select('click_id', 'prediction')
data_to_submit.show(3)

# Rename the prediction column to `is_attributed`.
data_to_submit = data_to_submit.withColumnRenamed('prediction', 'is_attributed')
data_to_submit.show(3)

# Distribution of predicted values.
data_to_submit.groupBy('is_attributed').count().show()