### **Mise En Garde: Certaines méthodes sont exploitables en fonction de la version de Python que vous ustilisez**

In [None]:
!pip install pyspark

In [2]:
import pyspark
print(f"{pyspark.__version__}")

3.5.0


In [3]:
import os
os.listdir(os.getcwd())

['.config', 'census_income.csv', 'sample_data']

### **Point d'entrée : SparkContext**

In [4]:
sc = pyspark.SparkContext()

# Verifier si SparkContext est valide
print(sc)

<SparkContext master=local[*] appName=pyspark-shell>


In [5]:
# Check the version of SparkContext in PySpark Shell
print("La version de Spark Context dans PySpark Shell est : ", sc.version)

# Display the Python version of SparkContext
print("La version Python de Spark Context dans PySpark Shell est : ", sc.pythonVer)

# Display the master of SparkContext
print("Le maitre de Spark Context dans PySpark Shell est : ", sc.master)

La version de Spark Context dans PySpark Shell est :  3.5.0
La version Python de Spark Context dans PySpark Shell est :  3.10
Le maitre de Spark Context dans PySpark Shell est :  local[*]


### **Loading Data**

In [6]:
from pyspark.sql.session import SparkSession
spark = SparkSession(sc)

In [7]:
census_path = 'census_income.csv'

census = spark.read.csv(census_path, header=True, inferSchema=True)

In [8]:
import pyspark.sql.functions as func

for col, typ in census.dtypes:
  if typ == 'string':
    census = census.withColumn(
        col,
        func.ltrim(func.rtrim(census[col]))
    )
census.count()

32561

In [9]:
census.show()

+---+----------------+------+------------+-------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+-----+
|age|       workclass|fnlwgt|   education|education-num|      marital-status|       occupation| relationship|              race|   sex|capital-gain|capital-loss|hours-per-week|native-country|label|
+---+----------------+------+------------+-------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+-----+
| 39|       State-gov| 77516|   Bachelors|           13|       Never-married|     Adm-clerical|Not-in-family|             White|  Male|        2174|           0|            40| United-States|<=50K|
| 50|Self-emp-not-inc| 83311|   Bachelors|           13|  Married-civ-spouse|  Exec-managerial|      Husband|             White|  Male|           0|           0|            13| United-States|<=50K|
| 38|     

In [10]:
census.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- label: string (nullable = true)



## **Exploring Data**

### **Data prep**



+ **List of columns to keep**

In [11]:
cols_to_keeps = census.dtypes

cols_to_keeps = (
    ['label', 'age'
    ,'capital-gain'
    ,'capital-loss'
    ,'hours-per-week'
    ] + [
        e[0] for e in cols_to_keeps[:-1]
        if e[1] == 'string'
    ]
)

cols_to_keeps

['label',
 'age',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

+ **Get numeric and categorical columns**

In [12]:
import pyspark.mllib.stat as st
import numpy as np

census_subset = census.select(cols_to_keeps)

cols_num = [
    e[0] for e in census_subset.dtypes
    if e[1] == 'int'
]

cols_cat = [
    e[0] for e in census_subset.dtypes[1:]
    if e[1] == 'string'
]

cols_num, cols_cat

(['age', 'capital-gain', 'capital-loss', 'hours-per-week'],
 ['workclass',
  'education',
  'marital-status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'native-country'])

### **Numerical Data**

In [13]:
rdd_num = (
    census_subset
    .select(cols_num)
    .rdd
    .map(lambda row: [e for e in row])
)

stats_num = st.Statistics.colStats(rdd_num)

In [14]:
for col, min_, mean_, max_, var_ in zip(
    cols_num,
    stats_num.min(),
    stats_num.max(),
    stats_num.max(),
    stats_num.variance()
):
  print('{0}: min -> {1:.1f}, mean -> {2:.1f}, max -> {3:.1f}, stdev -> {4:.1f}'
        .format(col, min_, mean_, max_, np.sqrt(var_)))

age: min -> 17.0, mean -> 90.0, max -> 90.0, stdev -> 13.6
capital-gain: min -> 0.0, mean -> 99999.0, max -> 99999.0, stdev -> 7385.3
capital-loss: min -> 0.0, mean -> 4356.0, max -> 4356.0, stdev -> 403.0
hours-per-week: min -> 1.0, mean -> 99.0, max -> 99.0, stdev -> 12.3


### **Categorical Data**

In [15]:
rdd_cat = (
    census_subset
    .select(cols_cat + ['label'])
    .rdd
    .map(lambda row: [e for e in row])
)

results_cat = {}

for i, col in enumerate(cols_cat + ['label']):
  results_cat[col]= (
      rdd_cat
      .groupBy(lambda row: row[i])
      .map(lambda e1: (e1[0], len(e1[1])))
      .collect()
  )

for k in results_cat:
  print(
      k,
      sorted(
          results_cat[k],
          key = lambda e1: e1[1],
          reverse = True),
          '\n'
  )

workclass [('Private', 22696), ('Self-emp-not-inc', 2541), ('Local-gov', 2093), ('?', 1836), ('State-gov', 1298), ('Self-emp-inc', 1116), ('Federal-gov', 960), ('Without-pay', 14), ('Never-worked', 7)] 

education [('HS-grad', 10501), ('Some-college', 7291), ('Bachelors', 5355), ('Masters', 1723), ('Assoc-voc', 1382), ('11th', 1175), ('Assoc-acdm', 1067), ('10th', 933), ('7th-8th', 646), ('Prof-school', 576), ('9th', 514), ('12th', 433), ('Doctorate', 413), ('5th-6th', 333), ('1st-4th', 168), ('Preschool', 51)] 

marital-status [('Married-civ-spouse', 14976), ('Never-married', 10683), ('Divorced', 4443), ('Separated', 1025), ('Widowed', 993), ('Married-spouse-absent', 418), ('Married-AF-spouse', 23)] 

occupation [('Prof-specialty', 4140), ('Craft-repair', 4099), ('Exec-managerial', 4066), ('Adm-clerical', 3770), ('Sales', 3650), ('Other-service', 3295), ('Machine-op-inspct', 2002), ('?', 1843), ('Transport-moving', 1597), ('Handlers-cleaners', 1370), ('Farming-fishing', 994), ('Tech-s

### **Correlations**


In [16]:
correlations = st.Statistics.corr(rdd_num)
correlations

array([[ 1.        ,  0.0776745 ,  0.05777454,  0.06875571],
       [ 0.0776745 ,  1.        , -0.03161506,  0.07840862],
       [ 0.05777454, -0.03161506,  1.        ,  0.05425636],
       [ 0.06875571,  0.07840862,  0.05425636,  1.        ]])

In [17]:
for i, e1_i in enumerate(abs(correlations) > 0.05):
  print(cols_num[i])

  for j, e1_j in enumerate(e1_i):
    if e1_j and j != 1 :
      print(
          '   ',
          cols_num[j] + ' : ',
          correlations[i][j]
      )
  print()

age
    age :  1.0
    capital-loss :  0.05777453947896792
    hours-per-week :  0.0687557075095634

capital-gain
    age :  0.077674498166008
    hours-per-week :  0.078408615390123

capital-loss
    age :  0.05777453947896792
    capital-loss :  1.0
    hours-per-week :  0.05425636227262224

hours-per-week
    age :  0.0687557075095634
    capital-loss :  0.05425636227262224
    hours-per-week :  1.0



### **Statistical Testing**

In [18]:
import pyspark.mllib.linalg as ln

census_occupation = (
    census
    .groupBy('occupation')
    .pivot('label')
    .count()
)

census_occupation_coll = (
    census_occupation
    .rdd
    .map(lambda row: row[1:])
    .flatMap(lambda row: row)
    .collect()
)

len_row = len(census_occupation.collect())


dense_mat = ln.DenseMatrix(
    len_row,
    2,
    census_occupation_coll,
    True
)

chi_sq = st.Statistics.chiSqTest(dense_mat)

print(chi_sq.pValue)
print(chi_sq.nullHypothesis)

0.0
the occurrence of the outcomes is statistically independent.


In [19]:
dense_mat.toArray()

array([[2.667e+03, 9.830e+02],
       [2.098e+03, 1.968e+03],
       [2.281e+03, 1.859e+03],
       [1.284e+03, 8.600e+01],
       [8.790e+02, 1.150e+02],
       [3.170e+03, 9.290e+02],
       [1.277e+03, 3.200e+02],
       [1.480e+02, 1.000e+00],
       [4.380e+02, 2.110e+02],
       [3.158e+03, 1.370e+02],
       [6.450e+02, 2.830e+02],
       [1.752e+03, 2.500e+02],
       [8.000e+00, 1.000e+00],
       [1.652e+03, 1.910e+02],
       [3.263e+03, 5.070e+02]])

### **Transforming the Data**

+ Number of distinct values

In [20]:
len_ftrs = []

for col in cols_cat:
  (
      len_ftrs
      .append (
          ( col,
          census
          .select(col)
          .distinct()
          .count()
      )
    )
  )

len_ftrs = dict(len_ftrs)
len_ftrs

{'workclass': 9,
 'education': 16,
 'marital-status': 7,
 'occupation': 15,
 'relationship': 6,
 'race': 5,
 'sex': 2,
 'native-country': 42}

+ **Using hashing trick**

In [21]:
import pyspark.mllib.feature as feat

final_data = (
    census
    .select(cols_to_keeps)
    .rdd
    .map(lambda row: [
        list(
            feat.HashingTF(int(len_ftrs[col] / 2.0))
            .transform(row[i])
            .toArray()
        ) if i >= 5
        else [row[i]]
        for i, col in enumerate(cols_to_keeps)
    ])
)

final_data.take(5)

[[['<=50K'],
  [39],
  [2174],
  [0],
  [40],
  [1.0, 2.0, 1.0, 5.0],
  [3.0, 3.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0],
  [2.0, 3.0, 8.0],
  [0.0, 3.0, 3.0, 1.0, 4.0, 1.0, 0.0],
  [5.0, 5.0, 3.0],
  [3.0, 2.0],
  [4.0],
  [1.0,
   0.0,
   0.0,
   3.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   2.0,
   0.0,
   0.0,
   0.0,
   1.0,
   1.0,
   2.0,
   1.0,
   1.0,
   0.0]],
 [['<=50K'],
  [50],
  [0],
  [0],
  [13],
  [4.0, 3.0, 1.0, 8.0],
  [3.0, 3.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0],
  [5.0, 5.0, 8.0],
  [0.0, 1.0, 2.0, 2.0, 8.0, 1.0, 1.0],
  [4.0, 2.0, 1.0],
  [3.0, 2.0],
  [4.0],
  [1.0,
   0.0,
   0.0,
   3.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   2.0,
   0.0,
   0.0,
   0.0,
   1.0,
   1.0,
   2.0,
   1.0,
   1.0,
   0.0]],
 [['<=50K'],
  [38],
  [0],
  [0],
  [40],
  [2.0, 2.0, 0.0, 3.0],
  [2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0],
  [3.0, 2.0, 3.0],
  [2.0, 3.0, 1.0, 3.0, 7.0, 0.0, 1.0],
  [5.0, 5.0, 3.0],
  [3.0, 2.0],
  [4.0],
  [1.0,
   0.0,
   0.0,


In [22]:
def labelEncode(label):
  return [int(label[0] == '>50K')]

final_data = (
    final_data
    .map(lambda row: labelEncode(row[0])
        + [
            item
            for sublist in row[1:]
            for item in sublist
        ]
         )
)

### **Standardizing Data**

In [23]:
standardizer = feat.StandardScaler(True, True)
sModel = standardizer.fit(final_data.map(lambda row: row[1:]))
final_data_scaled = sModel.transform(final_data.map(lambda row: row[1:]))

final_data = (
    final_data
    .map(lambda row: row[0])
    .zipWithIndex()
    .map(lambda row: (row[1], row[0]))
    .join(
        final_data_scaled
        .zipWithIndex()
        .map(lambda row: (row[1], row[0]))
    )
    .map(lambda row: row[1])
)

final_data.take(3)

[(0,
  DenseVector([0.0307, 0.1485, -0.2167, -0.0354, -1.2635, 0.008, 1.7796, 1.0001, 0.83, 0.5743, -0.3473, -0.443, 0.6826, -0.4007, -0.3862, -0.4685, -1.1369, -0.4555, 0.4551, -1.1329, 2.0776, 1.8713, -1.0381, -0.3381, -0.2381, -0.775, 0.9805, 1.5207, 0.3083, -0.0634, -0.2574, -0.7031, 0.3208, -0.0901, -0.1263, 0.3355, -0.1223, 0.3378, -0.0853, -0.0937, -0.0887, -0.2104, 0.0, 0.1286, -0.1976, -0.1433, -0.1419, 0.1895, 0.298, 0.2896, 0.1638, 0.1221, 0.0])),
 (0,
  DenseVector([-0.0426, -0.1459, -0.2167, -0.0354, -0.2368, 0.008, -0.5593, -0.2503, -0.2076, -0.1937, -0.3473, -0.443, -0.9076, -0.4007, 1.1933, -0.4685, -0.4505, -1.1655, -2.1192, 0.935, 2.0776, -0.5029, 0.383, 1.0477, -1.306, 1.0328, 0.9805, 1.5207, 0.3083, -0.0634, -0.2574, -0.7031, 0.3208, -0.0901, -0.1263, 0.3355, -0.1223, 0.3378, -0.0853, -0.0937, -0.0887, -0.2104, 0.0, 0.1286, -0.1976, -0.1433, -0.1419, 0.1895, 0.298, 0.2896, 0.1638, 0.1221, 0.0])),
 (0,
  DenseVector([-0.7758, -0.1459, -0.2167, -0.0354, -0.2368, 0.008

In [24]:
final_data.take(1)

[(0,
  DenseVector([0.0307, 0.1485, -0.2167, -0.0354, -1.2635, 0.008, 1.7796, 1.0001, 0.83, 0.5743, -0.3473, -0.443, 0.6826, -0.4007, -0.3862, -0.4685, -1.1369, -0.4555, 0.4551, -1.1329, 2.0776, 1.8713, -1.0381, -0.3381, -0.2381, -0.775, 0.9805, 1.5207, 0.3083, -0.0634, -0.2574, -0.7031, 0.3208, -0.0901, -0.1263, 0.3355, -0.1223, 0.3378, -0.0853, -0.0937, -0.0887, -0.2104, 0.0, 0.1286, -0.1976, -0.1433, -0.1419, 0.1895, 0.298, 0.2896, 0.1638, 0.1221, 0.0]))]

In [25]:
sModel.mean, sModel.std

(DenseVector([38.5816, 1077.6488, 87.3038, 40.4375, 2.2307, 1.9942, 0.2391, 3.4004, 2.2001, 2.2522, 0.1836, 0.2528, 0.5707, 0.1384, 1.2445, 1.5914, 3.6564, 3.6415, 7.1161, 1.0957, 0.838, 1.4236, 2.461, 4.7319, 1.223, 0.4287, 3.8206, 2.8271, 2.472, 3.0508, 2.4882, 4.6616, 0.9066, 0.008, 0.0157, 2.7153, 0.0157, 0.8967, 0.0072, 0.0108, 0.0078, 0.0432, 0.0, 1.9416, 0.0445, 0.0238, 0.0197, 0.9444, 0.9136, 1.8534, 0.9618, 0.964, 0.0]),
 DenseVector([13.6404, 7385.2921, 402.9602, 12.3474, 0.974, 0.722, 0.4276, 1.5994, 0.9637, 1.3021, 0.5285, 0.5706, 0.6288, 0.3453, 0.6331, 1.2623, 1.457, 1.4084, 1.9423, 0.9671, 1.0407, 0.8424, 1.4074, 2.1649, 0.9365, 0.5532, 1.2029, 1.4289, 1.7124, 0.8013, 1.8969, 0.941, 0.2911, 0.0893, 0.1243, 0.8486, 0.1281, 0.3057, 0.0846, 0.1151, 0.088, 0.2055, 0.0, 0.4543, 0.2251, 0.1658, 0.1391, 0.2935, 0.29, 0.5062, 0.2331, 0.2945, 0.0]))

### **Creating an RDD for training**

In [26]:
import pyspark.mllib.regression as reg

final_data_income = (
    final_data
    .map(lambda row: reg.LabeledPoint(
        row[0]
        , row[1]
        )
    )
)
final_data_income.take(2)

[LabeledPoint(0.0, [0.03067008637999638,0.14845061558793698,-0.2166562000280365,-0.035428902921321385,-1.2634955069616076,0.007996473732148601,1.7795863357831183,1.0001208369009675,0.8300118030815685,0.5743180912245708,-0.3473011663518568,-0.44299296056248205,0.6826214082675854,-0.40070785156169997,-0.3862275949058694,-0.46854980553952796,-1.1368735829205445,-0.4555118517327555,0.45507885276723403,-1.1329160643506915,2.0775754968328823,1.871297515327448,-1.0380868217125299,-0.3380809425105878,-0.238127083048188,-0.7750139160707564,0.9804554188514951,1.5207190561281545,0.3083409364172059,-0.06339046129246152,-0.2573678505758229,-0.7030605487269502,0.3207798471547602,-0.09006362818252112,-0.12626697150459487,0.33547741813186394,-0.12230357893453811,0.33781064530670735,-0.085261168002025,-0.09367074945140887,-0.08866697117953624,-0.21041541274675812,0.0,0.12864878811001576,-0.19757081643280736,-0.1433270085322854,-0.14193221558442817,0.18951895277276812,0.2979804796353739,0.28962659119488

In [27]:
mu, std = sModel.mean[3], sModel.std[3]

final_data_hours = (
    final_data
    .map(lambda row: reg.LabeledPoint(
        row[1][3] * std + mu,
        ln.Vectors.dense([row[0]] + list(row[1][0:3]) + list(row[1][4:]))
    )
  )
)

final_data_hours.take(3)

[LabeledPoint(40.0, [0.0,0.03067008637999638,0.14845061558793698,-0.2166562000280365,-1.2634955069616076,0.007996473732148601,1.7795863357831183,1.0001208369009675,0.8300118030815685,0.5743180912245708,-0.3473011663518568,-0.44299296056248205,0.6826214082675854,-0.40070785156169997,-0.3862275949058694,-0.46854980553952796,-1.1368735829205445,-0.4555118517327555,0.45507885276723403,-1.1329160643506915,2.0775754968328823,1.871297515327448,-1.0380868217125299,-0.3380809425105878,-0.238127083048188,-0.7750139160707564,0.9804554188514951,1.5207190561281545,0.3083409364172059,-0.06339046129246152,-0.2573678505758229,-0.7030605487269502,0.3207798471547602,-0.09006362818252112,-0.12626697150459487,0.33547741813186394,-0.12230357893453811,0.33781064530670735,-0.085261168002025,-0.09367074945140887,-0.08866697117953624,-0.21041541274675812,0.0,0.12864878811001576,-0.19757081643280736,-0.1433270085322854,-0.14193221558442817,0.18951895277276812,0.2979804796353739,0.28962659119488887,0.16375112206

### **Spàlitting Into Training and Testing**

In [29]:
(
    final_data_income_train,
    final_data_income_test
) = (
    final_data_income.randomSplit([0.7, 0.3])
)

In [30]:
(
    final_data_hours_train,
    final_data_hours_test
) = (
    final_data_hours.randomSplit([0.7, 0.3])
)

### **Predicting hours of work for census respondents**

+ Linear regression(benchmark)

In [32]:
workhours_model_lm = reg.LinearRegressionWithSGD.train(final_data_hours_train)

In [33]:
small_sample_hours = sc.parallelize(final_data_hours_test.take(10))

for t,p in zip(
    small_sample_hours
      .map(lambda row: row.label)
      .collect(),
    workhours_model_lm.predict(
        small_sample_hours
          .map(lambda row: row.features)
    ).collect()
): print(t,p)

40.0 -17.710351989405652
16.0 0.7884451034867767
40.0 48.143135206619576
35.0 16.361899132112132
40.0 -1.9939925483191634
40.0 8.64595997739316
52.0 12.563907579525969
40.0 8.838883743158187
35.0 10.070621988838832
40.0 -7.355920705470115


In [34]:
workhours_model_lm.weights

DenseVector([53.7726, -2.9522, -3.3197, -1.7322, -0.2149, -1.027, -0.2633, 2.0059, -3.0387, 1.6083, 1.133, -1.2973, -0.4067, 1.4962, 3.2463, 1.4535, -1.4024, -1.4187, -0.9019, 0.6091, -0.5662, -0.1162, -1.8664, 1.3544, 1.7149, -1.3445, 4.0175, -0.2138, 4.0577, -0.4991, 0.6701, -0.379, -0.0973, -0.3107, 0.1058, -0.2289, 0.0126, -0.0957, -0.3685, 0.2505, -0.4255, 0.824, 0.0, -0.3055, 0.75, -0.4783, 0.3906, 0.2477, 0.2761, -0.0296, 0.2436, 0.3394, 0.0])

### **Forecasting income levels of census respondents**

+ Logistic Regression

In [None]:
import pyspark.mllib.classification as cl
income_model_lr = cl.LogisticRegressionWithSGD.train(final_data_income_train)

In [39]:
small_sample_income = sc.parallelize(final_data_income_test.take(10))

for t,p in zip(
    small_sample_income
      .map(lambda row: row.label)
      .collect(),
    income_model_lr.predict(
        small_sample_income
          .map(lambda row: row.features)
    ).collect()
): print(t,p)

0.0 0
0.0 0
1.0 1
0.0 1
1.0 1
0.0 1
0.0 0
0.0 1
0.0 1
0.0 1


In [40]:
income_model_lr.threshold

0.5

In [41]:
income_model_lr.weights

DenseVector([0.2319, 0.7546, 0.2253, 0.2327, 0.0477, 0.0975, -0.004, -0.119, 0.2396, -0.0497, -0.1348, 0.0904, 0.0204, -0.0303, -0.2769, -0.0849, 0.2171, 0.2217, 0.0224, -0.0209, 0.0507, -0.0262, 0.0666, -0.045, -0.0601, 0.1454, -0.127, -0.0978, -0.1871, 0.0484, -0.0532, -0.0777, 0.0123, 0.0062, -0.0111, 0.008, -0.0101, 0.0001, 0.0091, 0.0017, 0.0187, -0.0328, 0.0, -0.001, -0.0423, -0.0061, -0.0571, -0.0166, -0.0133, 0.005, 0.0125, -0.0224, 0.0])

+ Support Vector Machines

In [42]:
income_model_svm = cl.SVMWithSGD.train(
    final_data_income,
    # step = 0.95,
    miniBatchFraction=1/2.0
)

In [43]:
for t,p in zip(
    small_sample_income
      .map(lambda row: row.label)
      .collect(),
    income_model_svm.predict(
        small_sample_income
          .map(lambda row: row.features)
    ).collect()
): print(t,p)

0.0 0
0.0 1
1.0 1
0.0 1
1.0 1
0.0 1
0.0 1
0.0 1
0.0 1
0.0 1


In [44]:
income_model_svm.weights

DenseVector([0.1286, 1.0619, 0.1582, 0.0946, 0.0399, 0.0806, -0.0108, -0.0744, 0.1618, -0.1196, -0.0836, 0.0338, -0.0226, -0.0673, -0.177, -0.0568, 0.4101, 0.2562, 0.0735, -0.068, 0.0038, -0.0055, 0.0624, -0.0707, -0.0389, 0.0952, -0.123, -0.038, -0.1559, 0.0202, -0.0343, -0.0369, 0.0106, -0.0069, 0.0096, 0.0056, 0.0101, 0.0023, 0.0095, -0.0063, 0.0039, -0.0119, 0.0, 0.002, -0.0651, 0.0102, -0.1191, 0.0032, -0.009, 0.0068, 0.0006, -0.025, 0.0])

### **Building Clustering Models**

In [47]:
import pyspark.mllib.clustering as clu

model_km = clu.KMeans.train(
    final_data.map(lambda row: row[1]),
    2,
    initializationMode='random',
    seed=665
)

In [48]:
import sklearn.metrics as m

predicted = (
    model_km
      .predict(
          final_data.map(lambda row: row[1])
      )
)

predicted = predicted.collect()

true = final_data.map(lambda row: row[0]).collect()

print(m.homogeneity_score(true,predicted))
print(m.completeness_score(true,predicted))

0.18401866145343315
0.14684418754735365


### **Computing Performance Statistics**

In [49]:
import pyspark.mllib.evaluation as ev

+ Regression metrics

In [50]:
true_pred_reg = (
    final_data_hours_test
    .map(lambda row: (
        float(workhours_model_lm.predict(row.features)),
        row.label))
)

metrics_lm = ev.RegressionMetrics(true_pred_reg)



In [52]:
print('R^2 : ', metrics_lm.r2)
print('Explained Variance: ', metrics_lm.explainedVariance)
print('meanAbsoluteError: ', metrics_lm.meanAbsoluteError)

R^2 :  -6.799361371432148
Explained Variance:  1128.5622756591451
meanAbsoluteError:  29.629969048799808


+ **Classification metrics**

In [53]:
true_pred_class_lr = (
    final_data_income_test
    .map(lambda row: (
        float(income_model_lr.predict(row.features))
        , row.label))
)

metrics_lr = ev.BinaryClassificationMetrics(true_pred_class_lr)

print('areaUnderPR: ', metrics_lr.areaUnderPR)
print('areaUnderROC: ', metrics_lr.areaUnderROC)

areaUnderPR:  0.45621828995529357
areaUnderROC:  0.7915930212376715


In [54]:
trainErr = (
    true_pred_class_lr
    .filter(lambda lp: lp[0] != lp[1]).count()
    / float(true_pred_class_lr.count())
)
print("Training Error = " + str(trainErr))

Training Error = 0.272375127420999


In [55]:
true_pred_class_svm = (
    final_data_income_test
    .map(lambda row: (
        float(income_model_svm.predict(row.features))
        , row.label))
)

metrics_svm = ev.BinaryClassificationMetrics(true_pred_class_svm)

print('areaUnderPR: ', metrics_svm.areaUnderPR)
print('areaUnderROC: ', metrics_svm.areaUnderROC)

areaUnderPR:  0.43845379317246325
areaUnderROC:  0.7755340850026171


In [56]:
trainErr = (
    true_pred_class_svm
    .filter(lambda lp: lp[0] != lp[1]).count()
    / float(true_pred_class_svm.count())
)

print("Training Error = " + str(trainErr))

Training Error = 0.28950050968399593


+ Confusion matrix

In [57]:
(
    true_pred_class_lr
    .map(lambda el: ((el), 1))
    .reduceByKey(lambda x,y: x+y)
    .take(4)
)

[((0.0, 0.0), 4983), ((1.0, 1.0), 2155), ((1.0, 0.0), 2471), ((0.0, 1.0), 201)]

In [58]:
(
    true_pred_class_svm
    .map(lambda el: ((el), 1))
    .reduceByKey(lambda x,y: x+y)
    .take(4)
)

[((0.0, 0.0), 4848), ((1.0, 1.0), 2122), ((1.0, 0.0), 2606), ((0.0, 1.0), 234)]