In [844]:
#----------------------------------------------------STAGING/EXTRACTION------------------------------------------

In [1390]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import datetime


#Configure spark session
spark = SparkSession.builder.master('local[2]').appName('etl_app').config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:2.4.1').getOrCreate()

# Extract the dataset "studentdatabase.json" and add it to dataframe
df_load = spark.read.csv(r"/Users/Elimane/SPARK/data/5_DATA_dataset.csv", header=True)
df_load.show(2)

+----+-----+-------+-----------+----------+--------------------+-----------------+-----------------+-----------+----------+-----------+--------------+--------------+-------+--------+----+--------+------------+-------------+-------------+----------+
|  id|  nom| prenom|       sexe| birthdate|               email|        telephone|pays_de_residence|nationalite|universite|     campus|annee_en_cours|date_admission|diplome|activite|ects|   etude|type_contrat|debut_contrat|duree_contrat|entreprise|
+----+-----+-------+-----------+----------+--------------------+-----------------+-----------------+-----------+----------+-----------+--------------+--------------+-------+--------+----+--------+------------+-------------+-------------+----------+
|null| null|   null|       null|      null|                null|             null|             null|       null|      null|       null|          null|          null|   null|    null|null|    null|        null|         null|         null|      null|
|   

In [1391]:
#----------------------------------------------------CLEANSING/TRANSFORMATION------------------------------------------

In [1392]:
# Remove null values
df_load = df_load.dropna()
df_load.show(2)

+---+------+-------+-----------+----------+--------------------+-----------------+-----------------+-----------+----------+-----------+--------------+--------------+-------+--------+----+--------+------------+-------------+-------------+----------+
| id|   nom| prenom|       sexe| birthdate|               email|        telephone|pays_de_residence|nationalite|universite|     campus|annee_en_cours|date_admission|diplome|activite|ects|   etude|type_contrat|debut_contrat|duree_contrat|entreprise|
+---+------+-------+-----------+----------+--------------------+-----------------+-----------------+-----------+----------+-----------+--------------+--------------+-------+--------+----+--------+------------+-------------+-------------+----------+
|  1| Wilie|Budgett|Genderqueer|11/22/2000|wbudgett0@guardia...|+380 564 767 6866|          MAURICE|    Ukraine|     ISIMA|CLERMONT-FD|             2|    14/11/2019|   VRAI|     BDS|  14|en cours|       Stage|    2/24/2019|           22|    SOGETI|
|  2

In [1393]:
# Print schema
df_load.printSchema()

root
 |-- id: string (nullable = true)
 |-- nom: string (nullable = true)
 |-- prenom: string (nullable = true)
 |-- sexe: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- email: string (nullable = true)
 |-- telephone: string (nullable = true)
 |-- pays_de_residence: string (nullable = true)
 |-- nationalite: string (nullable = true)
 |-- universite: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- annee_en_cours: string (nullable = true)
 |-- date_admission: string (nullable = true)
 |-- diplome: string (nullable = true)
 |-- activite: string (nullable = true)
 |-- ects: string (nullable = true)
 |-- etude: string (nullable = true)
 |-- type_contrat: string (nullable = true)
 |-- debut_contrat: string (nullable = true)
 |-- duree_contrat: string (nullable = true)
 |-- entreprise: string (nullable = true)



In [1394]:
# Convert some string fields into Date
df_load = df_load['nom','prenom','sexe','birthdate','pays_de_residence','nationalite','universite','campus','annee_en_cours','date_admission','diplome','activite','ects','etude','type_contrat','debut_contrat','duree_contrat','entreprise']
df_load = df_load.withColumn('birthdate',date_format(to_date(col("birthdate"), "M/d/y"),"dd/MM/y")).withColumn('date_admission',date_format(to_date(col("date_admission"), "d/M/y"),"dd/MM/y")).withColumn('debut_contrat',date_format(to_date(col("debut_contrat"), "M/d/y"),"dd/MM/y"))
df_load.printSchema()
df_load.show(6)

root
 |-- nom: string (nullable = true)
 |-- prenom: string (nullable = true)
 |-- sexe: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- pays_de_residence: string (nullable = true)
 |-- nationalite: string (nullable = true)
 |-- universite: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- annee_en_cours: string (nullable = true)
 |-- date_admission: string (nullable = true)
 |-- diplome: string (nullable = true)
 |-- activite: string (nullable = true)
 |-- ects: string (nullable = true)
 |-- etude: string (nullable = true)
 |-- type_contrat: string (nullable = true)
 |-- debut_contrat: string (nullable = true)
 |-- duree_contrat: string (nullable = true)
 |-- entreprise: string (nullable = true)

+--------+-----------+-----------+----------+-----------------+-----------+----------+-----------+--------------+--------------+-------+--------+----+--------+------------+-------------+-------------+----------+
|     nom|     prenom|       sexe| birth

In [1395]:
# Convert some string fields into numeric(Double)
df_load = df_load.withColumn('ects', df_load['ects'].cast(DoubleType()))\
    .withColumn('duree_contrat', df_load['duree_contrat'].cast(DoubleType()))\
    .withColumn('annee_en_cours', df_load['annee_en_cours'].cast(DoubleType()))


# Print schema
df_load.show(5)

+-------+-----------+-----------+----------+-----------------+-----------+----------+-----------+--------------+--------------+-------+--------+-----+--------+------------+-------------+-------------+----------+
|    nom|     prenom|       sexe| birthdate|pays_de_residence|nationalite|universite|     campus|annee_en_cours|date_admission|diplome|activite| ects|   etude|type_contrat|debut_contrat|duree_contrat|entreprise|
+-------+-----------+-----------+----------+-----------------+-----------+----------+-----------+--------------+--------------+-------+--------+-----+--------+------------+-------------+-------------+----------+
|  Wilie|    Budgett|Genderqueer|22/11/2000|          MAURICE|    Ukraine|     ISIMA|CLERMONT-FD|           2.0|    14/11/2019|   VRAI|     BDS| 14.0|en cours|       Stage|   24/02/2019|         22.0|    SOGETI|
| Orelie|    Andover|   Bigender|09/02/2002|           ITALIE| Kazakhstan|     ISIMA|       LYON|           3.0|    30/09/2020|   FAUX|     BDS|  2.0|  

In [1396]:
# 1-The most successful students, depending on the region / institution of origin
students_tmp_df = df_load['nom','prenom','birthdate','ects','diplome','type_contrat','duree_contrat','etude','campus','activite','entreprise','universite']
students_tmp_df = students_tmp_df.withColumn('diplome',when(col('ects') >= 300 , 'VRAI').otherwise('FAUX'))
successful_df = students_tmp_df.filter((col('diplome') == 'VRAI'))
successful_df.show()

+---------+--------+----------+-----+-------+--------------------+-------------+-------+------+--------+--------------+----------+
|      nom|  prenom| birthdate| ects|diplome|        type_contrat|duree_contrat|  etude|campus|activite|    entreprise|universite|
+---------+--------+----------+-----+-------+--------------------+-------------+-------+------+--------+--------------+----------+
|      Nap| Satford|28/11/2000|320.0|   VRAI|               Stage|          4.0|termine|  CAEN|     SPR|AFRIK-SOLUTION|  POLYTECH|
|Cleveland|   Matis|16/06/1999|300.0|   VRAI|Contrat d'apprent...|          3.0|  stope| PARIS|     BDE|     CAPGEMINI|  POLYTECH|
|  Carmine|Spellard|11/11/1993|300.0|   VRAI|         Contrat Pro|          2.0|  stope| LILLE|     BDS|      MICHELIN|        42|
+---------+--------+----------+-----+-------+--------------------+-------------+-------+------+--------+--------------+----------+



In [1397]:
# 2-the students who stop their studies
students_tmp_df = students_tmp_df.withColumn('etude',when(col('etude') == 'stope' , 'stop').otherwise(col('etude')))
students_tmp_df = students_tmp_df.withColumn('campus',when(col('campus') == 'CLERMONT-FD' , 'CLERMONT').when(col('campus') == 'QUAND' , 'CAEN')\
                       .when(col('campus') == 'Clermon' , 'CLERMONT')\
                       .when(col('campus') == 'CLF' , 'CLERMONT')\
                       .when(col('campus') == 'cancan' , 'CAEN')\
                       .when(col('campus') == 'can' , 'CAEN')\
                       .when(col('campus') == 'lion' , 'LYON')\
                       .otherwise(col('campus')))
student_df = students_tmp_df

student_stopped = students_tmp_df.filter((col('etude') == 'stop'))
student_stopped.show(6)

+--------+-----------+----------+-----+-------+--------------------+-------------+-----+--------+--------+--------------+----------+
|     nom|     prenom| birthdate| ects|diplome|        type_contrat|duree_contrat|etude|  campus|activite|    entreprise|universite|
+--------+-----------+----------+-----+-------+--------------------+-------------+-----+--------+--------+--------------+----------+
|  Orelie|    Andover|09/02/2002|  2.0|   FAUX|         Contrat Pro|         24.0| stop|    LYON|     BDS|           CGI|     ISIMA|
| Hermann|Littlefield|19/02/1993|160.0|   FAUX|               Stage|         13.0| stop|   LILLE|     BDS|           MRC|  POLYTECH|
|  Elwood|    Pettman|26/04/1997| 24.0|   FAUX|               Stage|          6.0| stop|   LILLE|     BDE|     SOFTTEAMS|  POLYTECH|
|     Zia|    Cockton|17/08/1999| 15.0|   FAUX|         Contrat Pro|          3.0| stop|CLERMONT|     BDS|           MRC|     ISIMA|
|    Milo|     Marien|15/12/1997|201.0|   FAUX|Contrat d'apprent...| 

In [1398]:
# 3- Number of students by region
nb_students_by_region = students_tmp_df.select('nom','prenom','campus').groupBy('campus').count().withColumnRenamed('count','nombre etudiants')
nb_students_by_region.show()

+----------+----------------+
|    campus|nombre etudiants|
+----------+----------------+
|     LILLE|              72|
|     PARIS|              67|
|      LYON|              73|
|MONTPELIER|              84|
|      CAEN|              75|
|  CLERMONT|              80|
+----------+----------------+



In [1399]:
# 4- Number of students by activity (To know how revitalize campuses)
nb_students_by_activity = students_tmp_df.select('nom','prenom','activite').groupBy('activite').count().withColumnRenamed('count','nombre etudiants')
nb_students_by_activity.show()

+--------+----------------+
|activite|nombre etudiants|
+--------+----------------+
|     SPR|             152|
|     BDS|             156|
|     BDE|             143|
+--------+----------------+



In [1400]:
# Average length of time graduates are hired
avg_length_time_hired = students_tmp_df.select((sum('duree_contrat')/count('nom')).alias('Moyenne du temps de contrat'))
avg_length_time_hired.show()

+---------------------------+
|Moyenne du temps de contrat|
+---------------------------+
|         12.403547671840355|
+---------------------------+



In [1401]:
nb_total_students = students_tmp_df.select(count('nom'))

In [1404]:
# Number of students by companies for which recruit the most students from supinfo
nb_students_by_enterprise = students_tmp_df.select('nom','prenom','entreprise','universite').filter(col('universite') == 'SUPINFO').groupBy('entreprise').count().withColumnRenamed('count','nombre etudiants')
nb_students_by_enterprise = nb_students_by_enterprise.select(col('entreprise'),col('nombre etudiants')).orderBy(col('nombre etudiants').desc())

nb_students_by_enterprise.show()

+----------------+----------------+
|      entreprise|nombre etudiants|
+----------------+----------------+
|            OPEN|               3|
|         ABYSTER|               3|
|          ORACLE|               3|
|            AKKA|               2|
|             MRC|               2|
|          ALTRAM|               2|
|             IUC|               2|
|       CDISCOUNT|               2|
|          SAFRAM|               2|
|            AXXA|               1|
|       SOFTTEAMS|               1|
|       CAPGEMINI|               1|
|             CGI|               1|
|ADVENCE CAMEROUN|               1|
|          SHINKA|               1|
|          SOGETI|               1|
|        ALL4TEST|               1|
|       AFROLOGIX|               1|
|             ASI|               1|
|        MICHELIN|               1|
+----------------+----------------+
only showing top 20 rows



In [1405]:
# SUPINFO's competitors who are poaching  students
competitors_list = students_tmp_df.select('universite').filter(col('universite') != 'SUPINFO').dropDuplicates()
competitors_list.show()

+---------------+
|     universite|
+---------------+
|            101|
|             42|
|            ESI|
|        EPYTECH|
|           INSA|
|ECOLE DE LA VIE|
|          MINES|
|          ISIMA|
| UNIVERSITY 365|
|        EPITECH|
|       POLYTECH|
+---------------+



In [1406]:
# regions which have more Pro contracts and why
nb_contrat_pro_by_region = students_tmp_df.select('nom','prenom','campus','type_contrat').groupBy('campus','type_contrat').count().filter(col('type_contrat') == 'Contrat Pro').withColumnRenamed('count','nombre contrats pro')
nb_contrat_pro_by_region.show()

+----------+------------+-------------------+
|    campus|type_contrat|nombre contrats pro|
+----------+------------+-------------------+
|      CAEN| Contrat Pro|                 27|
|  CLERMONT| Contrat Pro|                 29|
|MONTPELIER| Contrat Pro|                 21|
|      LYON| Contrat Pro|                 20|
|     LILLE| Contrat Pro|                 23|
|     PARIS| Contrat Pro|                 25|
+----------+------------+-------------------+



In [1407]:
# Students age
students_age = students_tmp_df.select('nom','prenom','birthdate').withColumn("age",year(current_date()) - year(to_date(col('birthdate'),"d/M/y")))
students_age.show()

+---------+-----------+----------+---+
|      nom|     prenom| birthdate|age|
+---------+-----------+----------+---+
|    Wilie|    Budgett|22/11/2000| 22|
|   Orelie|    Andover|09/02/2002| 20|
|    Rurik|   Prodrick|05/05/1996| 26|
|  Hermann|Littlefield|19/02/1993| 29|
|  Laraine|   Trigwell|07/01/1996| 26|
| Katheryn|    Moreinu|15/05/1993| 29|
|    Spike|     Fearon|09/11/2000| 22|
| Brantley|       Okie|03/09/1998| 24|
|   Elwood|    Pettman|26/04/1997| 25|
|      Zia|    Cockton|17/08/1999| 23|
|     Milo|     Marien|15/12/1997| 25|
| Mariette|     Catton|11/05/1997| 25|
|Bridgette|  Whitehead|09/01/1998| 24|
|  Stephen|          $|22/01/1994| 28|
| Hernando|      Lortz|23/12/1997| 25|
|   Cordey|  Muggleton|07/09/1993| 29|
|Patrizius|       Garz|05/01/1998| 24|
|    Ashly|  Rittmeyer|24/12/1999| 23|
|   Jessee|     Willan|24/06/2000| 22|
|  Yanaton|    Barnaby|24/05/2000| 22|
+---------+-----------+----------+---+
only showing top 20 rows



In [1408]:
# SUPINFO students
students_of_SUPINFO = students_tmp_df.select('nom','prenom','campus','universite').where(col('universite') == 'SUPINFO').dropDuplicates()
students_of_SUPINFO.show()

+---------+---------+----------+----------+
|      nom|   prenom|    campus|universite|
+---------+---------+----------+----------+
|    Welsh|   Gluyas|      CAEN|   SUPINFO|
|      Wit|Dukesbury|MONTPELIER|   SUPINFO|
|Frederica|   Eilert|MONTPELIER|   SUPINFO|
|     Boyd|     Lush|MONTPELIER|   SUPINFO|
|    Addie|  Gowlett|  CLERMONT|   SUPINFO|
| Henrieta|  Aleksic|     PARIS|   SUPINFO|
| Harmonie|   Celand|MONTPELIER|   SUPINFO|
|Evangelia|  Kermitt|MONTPELIER|   SUPINFO|
|    Ronna| Radbourn|MONTPELIER|   SUPINFO|
|  Cherice|Englishby|     PARIS|   SUPINFO|
|   Sydney|Glencross|MONTPELIER|   SUPINFO|
|    Moyna|  Mucklow|     LILLE|   SUPINFO|
| Christel| Taunton.|MONTPELIER|   SUPINFO|
|      Gav|   Kilpin|      LYON|   SUPINFO|
|     Susy| Landells|     LILLE|   SUPINFO|
|     Iver|  Camblin|      CAEN|   SUPINFO|
|  Charlot|  Harower|      CAEN|   SUPINFO|
|    Artie| Harborow|      LYON|   SUPINFO|
|  Kathlin|   Durgan|     PARIS|   SUPINFO|
|   Alford| Trosdall|     LILLE|

In [1409]:
# number of students by SUPINFO campuses
nb_students_of_SUPINFO_by_campuses = students_of_SUPINFO.select('nom','prenom','campus').groupBy('campus').count().withColumnRenamed('count','nombre etudiants')
nb_students_of_SUPINFO_by_campuses.show()

+----------+----------------+
|    campus|nombre etudiants|
+----------+----------------+
|     LILLE|               5|
|     PARIS|               7|
|      LYON|               7|
|MONTPELIER|              10|
|      CAEN|               5|
|  CLERMONT|               3|
+----------+----------------+



In [1410]:
# Total number of SUPINFO's students
total_supinfo_students = nb_students_of_SUPINFO_by_campuses.select(sum('nombre etudiants').alias('total supinfo students'))
total_supinfo_students.show()

+----------------------+
|total supinfo students|
+----------------------+
|                    37|
+----------------------+



In [1411]:
# campus geographic coordinates
students_tmp_df = students_tmp_df.select('campus')\
         
campus_geo_location = students_tmp_df.select('campus')\
         .withColumn('longitude',when(col('campus') == 'CLERMONT' , 45.783100)\
                       .when(col('campus') == 'CAEN' , -4.490000)\
                       .when(col('campus') == 'LYON' , 4.834277)\
                       .when(col('campus') == 'MONTPELIER' , 3.877200)\
                       .when(col('campus') == 'LILLE' , 3.057256)\
                       .when(col('campus') == 'PARIS' , 2.349014)\
                       .otherwise(0)).withColumn('latitude',when(col('campus') == 'CLERMONT' , 3.082400)\
                       .when(col('campus') == 'CAEN' , 49.180000)\
                       .when(col('campus') == 'LYON' , 45.763420)\
                       .when(col('campus') == 'MONTPELIER' , 43.611900)\
                       .when(col('campus') == 'LILLE' , 50.629250)\
                       .when(col('campus') == 'PARIS' , 48.864716)\
                       .otherwise(0)).dropDuplicates()
                     


campus_geo_location.show()

+----------+---------+---------+
|    campus|longitude| latitude|
+----------+---------+---------+
|      CAEN|    -4.49|    49.18|
|     LILLE| 3.057256| 50.62925|
|      LYON| 4.834277| 45.76342|
|     PARIS| 2.349014|48.864716|
|MONTPELIER|   3.8772|  43.6119|
|  CLERMONT|  45.7831|   3.0824|
+----------+---------+---------+



In [1412]:
#----------------------------------------------------STORING/LOADING------------------------------------------
# Build the tables or collections
# Write dataframes to mongodb

In [1413]:

nb_contrat_pro_by_region.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.nb_contrat_pro_by_region').save()



In [1414]:
competitors_list.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.competitors_list').save()


In [1415]:
nb_students_by_enterprise.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.nb_students_by_enterprise').save()


In [1416]:

nb_total_students.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.nb_total_students').save()


In [1417]:

avg_length_time_hired.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.avg_length_time_hired').save()

In [1418]:
nb_students_by_activity.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.nb_students_by_activity').save()


In [1419]:
nb_students_by_region.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.nb_students_by_region').save()


In [1420]:
student_stopped.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.nb_students_by_region').save()


In [1421]:
successful_df.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.successful_df').save()


In [1422]:
students_age.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.students_age').save()


In [1423]:
campus_geo_location.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.campus_geo_location').save()

In [1424]:
student_df.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.students_df').save()

In [1425]:
students_of_SUPINFO.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.students_of_SUPINFO').save()

In [1426]:
nb_students_of_SUPINFO_by_campuses.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.nb_students_of_SUPINFO_by_campuses').save()

In [1427]:
total_supinfo_students.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.total_supinfo_students').save()

In [1376]:
#-----------------------------------------------Machine Learning Part---------------------------------------------------------

In [1377]:
#----------------------------------------------ML MODELS------------------------------------------------------------

In [1378]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [1383]:
# Load students data from mongodb
#students_cleaned_loaded_df = spark.read.format('mongo').option('spark.mongodb.input.uri', 'mongodb://127.0.0.1:27017/studentsdb.students_df').load()
nb_students_by_region_loaded = spark.read.format('mongo').option('spark.mongodb.input.uri', 'mongodb://127.0.0.1:27017/studentsdb.nb_students_by_region').load()
total_supinfo_students_loaded = spark.read.format('mongo').option('spark.mongodb.input.uri', 'mongodb://127.0.0.1:27017/studentsdb.total_supinfo_students').load()

nb_students_by_region_loaded.show()
total_supinfo_students_loaded.show()
nb_students_by_region_loaded.printSchema()

+--------------------+----------+----------------+
|                 _id|    campus|nombre etudiants|
+--------------------+----------+----------------+
|{61db677cc81e1457...|     LILLE|              72|
|{61db677cc81e1457...|     PARIS|              67|
|{61db677cc81e1457...|      LYON|              73|
|{61db677cc81e1457...|MONTPELIER|              84|
|{61db677cc81e1457...|      CAEN|              75|
|{61db677cc81e1457...|  CLERMONT|              80|
+--------------------+----------+----------------+

+--------------------+----------------------+
|                 _id|total supinfo students|
+--------------------+----------------------+
|{61db6753c81e1457...|                    37|
+--------------------+----------------------+

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- nombre etudiants: long (nullable = true)



In [1384]:

# school's growth forecasts (To predict total number of students by campus)

# Select features to parse into our model and then create the feature vector
assembler = VectorAssembler(inputCols=['nombre etudiants'], outputCol='features')

# Create the model
model_reg = RandomForestRegressor(featuresCol='features', labelCol='nombre etudiants')


# Chain the assembler with the model in a pipeline
pipeline = Pipeline(stages=[assembler, model_reg])

# Train the model
model = pipeline.fit(nb_students_by_region_loaded)

# Make the prediction of number of students by regions
pred_results_forecast = model.transform(nb_students_by_region_loaded)
pred_results_forecast = pred_results_forecast.withColumnRenamed('prediction','nombre etudiants(Prediction)')
pred_results_forecast.show(5)

22/01/09 22:53:55 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 6 (= number of training instances)


+--------------------+----------+----------------+--------+----------------------------+
|                 _id|    campus|nombre etudiants|features|nombre etudiants(Prediction)|
+--------------------+----------+----------------+--------+----------------------------+
|{61db677cc81e1457...|     LILLE|              72|  [72.0]|                        73.0|
|{61db677cc81e1457...|     PARIS|              67|  [67.0]|                        69.1|
|{61db677cc81e1457...|      LYON|              73|  [73.0]|                        73.7|
|{61db677cc81e1457...|MONTPELIER|              84|  [84.0]|                        81.1|
|{61db677cc81e1457...|      CAEN|              75|  [75.0]|                        76.1|
+--------------------+----------+----------------+--------+----------------------------+
only showing top 5 rows



In [1387]:
# Load the prediction dataset into mongodb
# Write df_pred_results
nb_supinfo_students_by_region_prediction = pred_results_forecast.drop('features').withColumn("nombre etudiants(Prediction)",col('nombre etudiants(Prediction)').cast(IntegerType()))
nb_supinfo_students_by_region_prediction.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/studentsdb.nb_supinfo_students_by_region_prediction').save()