In [19]:
#----------------------------------------------------STAGING/EXTRACTION------------------------------------------

In [3]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import datetime


#Configure spark session
spark = SparkSession.builder.master('local[2]').appName('testJson').config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:2.4.1').getOrCreate()

# Extract the dataset "studentdatabase.json" and add it to dataframe

extracted_df = spark.read.option("multiline","true").json(r"/Users/Elimane/SPARK/data/studentdatabase.json")
#multiline_df.show(2)  

#df_load = spark.read.csv(r"/Users/Elimane/SPARK/data/database.csv", header=True)
#Preview extracted_df
extracted_df.take(1)


                                                                                

[Row(data=[Row(attendance=['Sport Event', 'Forums', 'Meetings', 'Student Fair'], birthdate='12-01-1992', credits=305, curriculum=[Row(EndDate=None, business=None, contractType=None, experiences=[Row(EndDate='PRESENT', business='Worldline', contractType='Professional_Contract', position='Data engineer', startDate='01-01-2016')], position=None, skills=None, startDate=None), Row(EndDate=None, business=None, contractType=None, experiences=None, position=None, skills=['QlikView', 'HTML', 'CSS', 'Big Data', 'Python'], startDate=None)], firstname='John', graduated=True, institutionOfOrigin='SUPINFO_LYON', lastname='Doe', mobility=[], regionOfOrigin=Row(latitude=45.1695797, longitude=5.4502821, name='Rhône-Alpes'), skills=None, studies='FINISHED'), Row(attendance=['Tech Conference', 'Forums', 'Training'], birthdate='26-04-1988', credits=280, curriculum=[Row(EndDate=None, business=None, contractType=None, experiences=[Row(EndDate='30-09-2019', business='Atos', contractType='Internship', positio

In [4]:
#----------------------------------------------------CLEANSING/TRANSFORMATION------------------------------------------

In [5]:

# read all the students name
readData = extracted_df.withColumn('res',explode('data'))
students_df = readData.withColumn('res_curriculum',explode('res.curriculum')).select('res.firstname','res.lastname',col("res.birthdate"),'res.graduated','res.regionOfOrigin','res.institutionOfOrigin','res.attendance','res.credits','res_curriculum.experiences.business','res_curriculum.experiences.position','res_curriculum.experiences.startDate','res_curriculum.experiences.EndDate','res_curriculum.experiences.contractType','res.studies')

# Preview schemas
students_df.printSchema()


root
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- graduated: boolean (nullable = true)
 |-- regionOfOrigin: struct (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |    |-- name: string (nullable = true)
 |-- institutionOfOrigin: string (nullable = true)
 |-- attendance: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- credits: long (nullable = true)
 |-- business: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- position: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- startDate: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- EndDate: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- contractType: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- studies: string (nullable = true

In [6]:
# 1- Most successful students [They got 300 credits or more]
successful_df = students_df.dropDuplicates(["firstname","lastname"]).select('firstname','lastname','regionOfOrigin.name','institutionOfOrigin','credits').withColumnRenamed('name', 'regionName').filter(students_df['credits'] >= 300)
successful_number_df = successful_df.dropna().count()
students_cleaned_df = students_df.dropDuplicates(["firstname","lastname"]).select('firstname','lastname','regionOfOrigin.name','regionOfOrigin.latitude','regionOfOrigin.longitude','institutionOfOrigin','credits').withColumnRenamed('name', 'regionName').drop('latitude','longitude')
students_cleaned_df
# preview 
finalResult = students_cleaned_df

finalResult.show()


+---------+--------+---------------+-------------------+-------+
|firstname|lastname|     regionName|institutionOfOrigin|credits|
+---------+--------+---------------+-------------------+-------+
|    Boris|   Baldy|      Normandie|       SUPINFO_CAEN|    310|
|     John|     Doe|    Rhône-Alpes|       SUPINFO_LYON|    305|
|   Julien|  Balade|  Île-de-France|      SUPINFO_PARIS|    322|
|   Marine|  Gascon|Hauts-de-France|      SUPINFO_LILLE|    300|
|   Martin|  Bucher|Hauts-de-France|      SUPINFO_LILLE|    210|
|    Mehdi| Haraoui|  Île-de-France|      SUPINFO_PARIS|    230|
|   Mylene| Elsinky|  Île-de-France|      SUPINFO_PARIS|    278|
|  Natalie|    Koeh|  Île-de-France|      SUPINFO_PARIS|    300|
|    Scott|   Tiger|Alpes-Maritimes|       SUPINFO_NICE|    280|
+---------+--------+---------------+-------------------+-------+



In [7]:
# 2- Students who stop their studies
students_studies_stopped = students_df.dropDuplicates(["firstname","lastname"]).select(concat('firstname',lit(" "),'lastname').alias('student_name'),'regionOfOrigin.name','institutionOfOrigin','credits','studies').withColumnRenamed('name', 'regionName').filter((students_df['studies'] == 'FINISHED') & (students_df['credits'] < 300))

# preview 
finalResult = students_studies_stopped

finalResult.show()

+--------------+-------------+-------------------+-------+--------+
|  student_name|   regionName|institutionOfOrigin|credits| studies|
+--------------+-------------+-------------------+-------+--------+
| Mehdi Haraoui|Île-de-France|      SUPINFO_PARIS|    230|FINISHED|
|Mylene Elsinky|Île-de-France|      SUPINFO_PARIS|    278|FINISHED|
+--------------+-------------+-------------------+-------+--------+



In [8]:
# 3- Students by regions and institutions 
number_students_by_regions = students_df.dropna().groupBy('regionOfOrigin.name','regionOfOrigin.latitude','regionOfOrigin.longitude','institutionOfOrigin').count().withColumnRenamed('count', 'Number_of_students').withColumnRenamed('name', 'regionName').withColumnRenamed('latitude', 'regionLatitude').withColumnRenamed('longitude', 'regionLongitude')

# preview 
finalResult = number_students_by_regions

finalResult.show()

+---------------+--------------+---------------+-------------------+------------------+
|     regionName|regionLatitude|regionLongitude|institutionOfOrigin|Number_of_students|
+---------------+--------------+---------------+-------------------+------------------+
|Alpes-Maritimes|    43.9466791|       7.179026|       SUPINFO_NICE|                 1|
|Hauts-de-France|     49.847503|       2.763062|      SUPINFO_LILLE|                 2|
|      Normandie|      48.87987|       0.171253|       SUPINFO_CAEN|                 1|
|  Île-de-France|    48.8499198|      2.6370411|      SUPINFO_PARIS|                 3|
|    Rhône-Alpes|    45.1695797|      5.4502821|       SUPINFO_LYON|                 1|
+---------------+--------------+---------------+-------------------+------------------+



In [9]:
# Regions and  By attendances
regions_by_attendances = students_df.dropDuplicates(["firstname","lastname"]).select(concat('firstname',lit(" "),'lastname').alias('student_name'),'birthdate','regionOfOrigin.name','regionOfOrigin.latitude','regionOfOrigin.longitude','attendance').withColumn("number_of_attendances",size(col("attendance"))).withColumnRenamed('attendance', 'attendances').withColumnRenamed('name', 'regionName').withColumnRenamed('latitude', 'regionLatitude').withColumnRenamed('longitude', 'regionLongitude')

# preview 
finalResult = regions_by_attendances

finalResult.show()


+--------------+----------+---------------+--------------+---------------+--------------------+---------------------+
|  student_name| birthdate|     regionName|regionLatitude|regionLongitude|         attendances|number_of_attendances|
+--------------+----------+---------------+--------------+---------------+--------------------+---------------------+
|   Boris Baldy|24-10-1989|      Normandie|      48.87987|       0.171253|[Forums, Student ...|                    2|
|      John Doe|12-01-1992|    Rhône-Alpes|    45.1695797|      5.4502821|[Sport Event, For...|                    4|
| Julien Balade|14-04-1990|  Île-de-France|    48.8499198|      2.6370411|[Tech Conference,...|                    6|
| Marine Gascon|17-05-1988|Hauts-de-France|     49.847503|       2.763062|          [Meetings]|                    1|
| Martin Bucher|12-12-1991|Hauts-de-France|     49.847503|       2.763062|[Tech Conference,...|                    3|
| Mehdi Haraoui|31-03-1990|  Île-de-France|    48.849919

In [23]:
# Change birthDate format and add right values
students_with_date_format_changed = students_df.dropDuplicates(["firstname","lastname"]).select(concat('firstname',lit(" "),'lastname').alias('student_name'),date_format(to_date('birthdate', "dd-MM-yyyy"),"dd/MM/yyyy").alias('birthdate_new_format'),'regionOfOrigin.name','attendance').withColumnRenamed('attendance', 'attendances').withColumnRenamed('name', 'regionName')

# preview 
finalResult = students_with_date_format_changed

finalResult.show()

+--------------+--------------------+---------------+--------------------+
|  student_name|birthdate_new_format|     regionName|         attendances|
+--------------+--------------------+---------------+--------------------+
|   Boris Baldy|          24/10/1989|      Normandie|[Forums, Student ...|
|      John Doe|          12/01/1992|    Rhône-Alpes|[Sport Event, For...|
| Julien Balade|          14/04/1990|  Île-de-France|[Tech Conference,...|
| Marine Gascon|          17/05/1988|Hauts-de-France|          [Meetings]|
| Martin Bucher|          12/12/1991|Hauts-de-France|[Tech Conference,...|
| Mehdi Haraoui|          31/03/1990|  Île-de-France|[Student Fair, Te...|
|Mylene Elsinky|          20/09/1993|  Île-de-France|[Tech Conference,...|
|  Natalie Koeh|          30/10/1991|  Île-de-France|[Student Fair, Te...|
|   Scott Tiger|          26/04/1988|Alpes-Maritimes|[Tech Conference,...|
+--------------+--------------------+---------------+--------------------+



In [11]:
# Calculate students ages
students_age_df = students_df.dropDuplicates(["firstname","lastname"]).select(concat('firstname',lit(" "),'lastname').alias('student_name'),date_format(to_date('birthdate',"dd-MM-yyyy"),"dd/MM/yyyy").alias("birthdate")).withColumn("age",year(current_date()) - year(to_date(col('birthdate'),"dd/MM/yyyy"))).withColumnRenamed('attendance', 'attendances')

# preview 
finalResult = students_age_df

finalResult.show()

+--------------+----------+---+
|  student_name| birthdate|age|
+--------------+----------+---+
|   Boris Baldy|24/10/1989| 33|
|      John Doe|12/01/1992| 30|
| Julien Balade|14/04/1990| 32|
| Marine Gascon|17/05/1988| 34|
| Martin Bucher|12/12/1991| 31|
| Mehdi Haraoui|31/03/1990| 32|
|Mylene Elsinky|20/09/1993| 29|
|  Natalie Koeh|30/10/1991| 31|
|   Scott Tiger|26/04/1988| 34|
+--------------+----------+---+



In [12]:
# students curriculum
students_curriculum = students_df.dropDuplicates(["firstname","lastname"]).select(concat('firstname',lit(" "),'lastname').alias('student_name'),'graduated','business','position','institutionOfOrigin','regionOfOrigin.name','contractType','startDate','EndDate').withColumnRenamed('name', 'regionName')
students_curriculum_ordered_df = students_curriculum.orderBy(col("student_name")).dropna()

# preview 
finalResult = students_df

#finalResult.show(9)


#students currently hired and not
students_with_hiring_status = students_curriculum_ordered_df.withColumn("Currently_In_Hired",array_contains(col("EndDate"),"PRESENT"))
student_hired = students_with_hiring_status.filter(col('Currently_In_Hired') == 'true')
student_not_hired = students_with_hiring_status.select('student_name','startDate','EndDate','Currently_In_Hired').filter(col('Currently_In_Hired') == 'false')

# preview 
finalResult = student_hired

finalResult.show()

+-------------+---------+-----------------+--------------------+-------------------+---------------+--------------------+------------+---------+------------------+
| student_name|graduated|         business|            position|institutionOfOrigin|     regionName|        contractType|   startDate|  EndDate|Currently_In_Hired|
+-------------+---------+-----------------+--------------------+-------------------+---------------+--------------------+------------+---------+------------------+
|  Boris Baldy|     true|         [Oracle]|     [Data engineer]|       SUPINFO_CAEN|      Normandie|[Professional_Con...|[01-01-2020]|[PRESENT]|              true|
|     John Doe|     true|      [Worldline]|     [Data engineer]|       SUPINFO_LYON|    Rhône-Alpes|[Professional_Con...|[01-01-2016]|[PRESENT]|              true|
|Julien Balade|     true|        [Amadeus]|     [Data engineer]|      SUPINFO_PARIS|  Île-de-France|[Professional_Con...|[15-04-2021]|[PRESENT]|              true|
|Marine Gascon| 

In [13]:
# number of students hired and not by region
nb_students_not_hired = students_with_hiring_status.dropDuplicates(["student_name"]).select('regionName','Currently_In_Hired','institutionOfOrigin').filter(col('Currently_In_Hired') == 'false').groupBy("regionName","institutionOfOrigin").count().withColumnRenamed('count', 'Number_of_students_not_hired')
nb_students_hired = students_with_hiring_status.dropDuplicates(["student_name"]).select('regionName','Currently_In_Hired','institutionOfOrigin').filter(col('Currently_In_Hired') == 'true').groupBy("regionName","institutionOfOrigin").count().withColumnRenamed('count', 'Number_of_students_hired')

# preview 
finalResult = nb_students_hired

finalResult.show()

+---------------+-------------------+------------------------+
|     regionName|institutionOfOrigin|Number_of_students_hired|
+---------------+-------------------+------------------------+
|    Rhône-Alpes|       SUPINFO_LYON|                       1|
|      Normandie|       SUPINFO_CAEN|                       1|
|Hauts-de-France|      SUPINFO_LILLE|                       1|
|  Île-de-France|      SUPINFO_PARIS|                       2|
+---------------+-------------------+------------------------+



In [14]:
# Companies which recruit the most supinfo's students
companies_occurences_df = students_with_hiring_status.withColumn('businesses', explode('business')).groupBy('businesses').count()
most_reccurent_companies_df = companies_occurences_df.withColumnRenamed('count', 'companies_occurences').filter(col('companies_occurences') > 1)

# preview 
finalResult = most_reccurent_companies_df

finalResult.show()

+----------+--------------------+
|businesses|companies_occurences|
+----------+--------------------+
|   Amadeus|                   2|
+----------+--------------------+



In [15]:
# Graduated students
graduated_students = students_with_hiring_status['student_name','regionName','graduated','business','startDate','EndDate','contractType'].filter(col('graduated') == 'true')

# preview 
finalResult = graduated_students

finalResult.show()

+-------------+---------------+---------+-----------------+------------+---------+--------------------+
| student_name|     regionName|graduated|         business|   startDate|  EndDate|        contractType|
+-------------+---------------+---------+-----------------+------------+---------+--------------------+
|  Boris Baldy|      Normandie|     true|         [Oracle]|[01-01-2020]|[PRESENT]|[Professional_Con...|
|     John Doe|    Rhône-Alpes|     true|      [Worldline]|[01-01-2016]|[PRESENT]|[Professional_Con...|
|Julien Balade|  Île-de-France|     true|        [Amadeus]|[15-04-2021]|[PRESENT]|[Professional_Con...|
|Marine Gascon|Hauts-de-France|     true|[Societe General]|[12-05-2021]|[PRESENT]|[Professional_Con...|
| Natalie Koeh|  Île-de-France|     true|          [Extia]|[11-08-2021]|[PRESENT]|[Professional_Con...|
+-------------+---------------+---------+-----------------+------------+---------+--------------------+



In [16]:
# average length of time graduates are hired

hired_times = graduated_students['student_name','regionName','startDate','EndDate','contractType'].withColumn('EndDate',explode('EndDate')).withColumn('startDate',explode('startDate')).withColumn('contractType',explode('contractType')).withColumnRenamed("name","RegionName")
hired_times_cleaned = hired_times.where((col('EndDate') == "PRESENT") & (col('contractType') == "Professional_Contract")).withColumn('EndDate',when(col('EndDate') == "PRESENT",date_format(to_date(current_date(),"dd-MM-yyyy"),"dd-MM-yyyy")).otherwise(col('EndDate'))).withColumn('years', (datediff(to_date('EndDate',"dd-MM-yyyy"),to_date('startDate',"dd-MM-yyyy")))/365)
years = hired_times_cleaned.drop('student_name','startDate','EndDate','contractType')
avg = years.select(round((sum(years['years'])/count(years['years']))).alias('average_length_time_hired_years'))

hired_times_cleaned.show()

+-------------+---------------+----------+----------+--------------------+------------------+
| student_name|     regionName| startDate|   EndDate|        contractType|             years|
+-------------+---------------+----------+----------+--------------------+------------------+
|  Boris Baldy|      Normandie|01-01-2020|08-01-2022|Professional_Cont...| 2.021917808219178|
|     John Doe|    Rhône-Alpes|01-01-2016|08-01-2022|Professional_Cont...| 6.024657534246575|
|Julien Balade|  Île-de-France|15-04-2021|08-01-2022|Professional_Cont...|0.7342465753424657|
|Marine Gascon|Hauts-de-France|12-05-2021|08-01-2022|Professional_Cont...|0.6602739726027397|
| Natalie Koeh|  Île-de-France|11-08-2021|08-01-2022|Professional_Cont...| 0.410958904109589|
+-------------+---------------+----------+----------+--------------------+------------------+



In [17]:
#Pro contracts by regions
pro_contract_by_region = hired_times_cleaned['RegionName','contractType'].groupBy('RegionName','contractType').count().withColumnRenamed('count','number_of_pro_contracts').where(col('number_of_pro_contracts') > 1)
pro_contract_by_region.show()


+-------------+--------------------+-----------------------+
|   RegionName|        contractType|number_of_pro_contracts|
+-------------+--------------------+-----------------------+
|Île-de-France|Professional_Cont...|                      2|
+-------------+--------------------+-----------------------+



In [18]:
# competitors who are poaching SUPINFO's students

In [19]:
#----------------------------------------------------STORING/LOADING------------------------------------------

In [20]:
# Build the tables or collections
# Write dataframes to mongodb
successful_df.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.successfull_students').save()
successful_df.show(2)

+---------+--------+-----------+-------------------+-------+
|firstname|lastname| regionName|institutionOfOrigin|credits|
+---------+--------+-----------+-------------------+-------+
|    Boris|   Baldy|  Normandie|       SUPINFO_CAEN|    310|
|     John|     Doe|Rhône-Alpes|       SUPINFO_LYON|    305|
+---------+--------+-----------+-------------------+-------+
only showing top 2 rows



In [21]:
students_studies_stopped.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.studies_stopped').save()


In [22]:
number_students_by_regions.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.number_students_by_regions').save()

In [22]:
regions_by_attendances.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.regions_by_attendances').save()

In [23]:
students_with_date_format_changed.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.students_with_date_format_changed').save()

In [24]:
students_age_df.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.students_age_df').save()


In [25]:
students_curriculum_ordered_df.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.students_curriculum_ordered_df').save()

In [26]:
student_hired.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.student_hired').save()


In [27]:
nb_students_not_hired.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.nb_students_not_hired').save()


In [28]:
pro_contract_by_region.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.pro_contract_by_region').save()

In [29]:
most_reccurent_companies_df.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.most_reccurent_companies_df').save()

In [30]:
graduated_students.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.graduated_students').save()

In [31]:
avg.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.avg_graduated_hired').save()

In [40]:
students_cleaned_df.printSchema()
students_cleaned_df.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.students_df').save()
students_cleaned_df.show()

root
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- regionName: string (nullable = true)
 |-- institutionOfOrigin: string (nullable = true)
 |-- credits: long (nullable = true)

+---------+--------+---------------+-------------------+-------+
|firstname|lastname|     regionName|institutionOfOrigin|credits|
+---------+--------+---------------+-------------------+-------+
|    Boris|   Baldy|      Normandie|       SUPINFO_CAEN|    310|
|     John|     Doe|    Rhône-Alpes|       SUPINFO_LYON|    305|
|   Julien|  Balade|  Île-de-France|      SUPINFO_PARIS|    322|
|   Marine|  Gascon|Hauts-de-France|      SUPINFO_LILLE|    300|
|   Martin|  Bucher|Hauts-de-France|      SUPINFO_LILLE|    210|
|    Mehdi| Haraoui|  Île-de-France|      SUPINFO_PARIS|    230|
|   Mylene| Elsinky|  Île-de-France|      SUPINFO_PARIS|    278|
|  Natalie|    Koeh|  Île-de-France|      SUPINFO_PARIS|    300|
|    Scott|   Tiger|Alpes-Maritimes|       SUPINFO_NICE|    280|
+--

In [176]:
"""

Machine Learning Part

"""

'\n\nMachine Learning Part\n\n'

In [177]:
#----------------------------------------------ML MODELS------------------------------------------------------------

In [178]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [179]:
students_cleaned_loaded_df = spark.read.format('mongo').option('spark.mongodb.input.uri', 'mongodb://127.0.0.1:27017/Student.students_df').load()
students_cleaned_loaded_df.show()

+--------------------+-------+---------+-------------------+--------+--------------+---------------+---------------+
|                 _id|credits|firstname|institutionOfOrigin|lastname|regionLatitude|regionLongitude|     regionName|
+--------------------+-------+---------+-------------------+--------+--------------+---------------+---------------+
|{61d2b4d6eb71bb31...|    310|    Boris|       SUPINFO_CAEN|   Baldy|      48.87987|       0.171253|      Normandie|
|{61d2b4d6eb71bb31...|    305|     John|       SUPINFO_LYON|     Doe|    45.1695797|      5.4502821|    Rhône-Alpes|
|{61d2b4d6eb71bb31...|    322|   Julien|      SUPINFO_PARIS|  Balade|    48.8499198|      2.6370411|  Île-de-France|
|{61d2b4d6eb71bb31...|    300|   Marine|      SUPINFO_LILLE|  Gascon|     49.847503|       2.763062|Hauts-de-France|
|{61d2b4d6eb71bb31...|    210|   Martin|      SUPINFO_LILLE|  Bucher|     49.847503|       2.763062|Hauts-de-France|
|{61d2b4d6eb71bb31...|    230|    Mehdi|      SUPINFO_PARIS| Har

In [185]:
#school's growth forecasts
#Number of students by regions
students_by_regions = students_cleaned_loaded_df.select(concat('firstname',lit(" "),'lastname').alias('student_name'),'regionName','regionLatitude','regionLongitude','institutionOfOrigin')
nb_students_by_regions = students_by_regions.groupBy('regionName','regionLatitude','regionLongitude').count().withColumnRenamed('count','number_of_students')
nb_schools_by_regions = students_by_regions.groupBy('institutionOfOrigin').count().withColumnRenamed('count','number_of_schools')
nb_students_by_regions.show()

+---------------+--------------+---------------+------------------+
|     regionName|regionLatitude|regionLongitude|number_of_students|
+---------------+--------------+---------------+------------------+
|Alpes-Maritimes|    43.9466791|       7.179026|                 1|
|      Normandie|      48.87987|       0.171253|                 1|
|Hauts-de-France|     49.847503|       2.763062|                 2|
|    Rhône-Alpes|    45.1695797|      5.4502821|                 1|
|  Île-de-France|    48.8499198|      2.6370411|                 4|
+---------------+--------------+---------------+------------------+



In [186]:
# Select features to parse into our model and then create the feature vector
assembler = VectorAssembler(inputCols=['number_of_students'], outputCol='features')

# Create the model
model_reg = RandomForestRegressor(featuresCol='features', labelCol='number_of_students')


# Chain the assembler with the model in a pipeline
pipeline = Pipeline(stages=[assembler, model_reg])

# Train the model
model = pipeline.fit(nb_students_by_regions)

# Make the prediction of number of students by regions
pred_results = model.transform(nb_students_by_regions)
pred_results.show(5)

22/01/03 08:37:19 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 5 (= number of training instances)


+---------------+--------------+---------------+------------------+--------+----------+
|     regionName|regionLatitude|regionLongitude|number_of_students|features|prediction|
+---------------+--------------+---------------+------------------+--------+----------+
|Alpes-Maritimes|    43.9466791|       7.179026|                 1|   [1.0]|       1.3|
|      Normandie|      48.87987|       0.171253|                 1|   [1.0]|       1.3|
|Hauts-de-France|     49.847503|       2.763062|                 2|   [2.0]|       2.7|
|    Rhône-Alpes|    45.1695797|      5.4502821|                 1|   [1.0]|       1.3|
|  Île-de-France|    48.8499198|      2.6370411|                 4|   [4.0]|       3.5|
+---------------+--------------+---------------+------------------+--------+----------+



In [188]:
# Evaluate the model
# rmse should be less than 0.5 for the model to be useful
evaluator = RegressionEvaluator(labelCol='number_of_students', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(pred_results)
print('Root Mean Squared Error (RMSE) on test data = %g ' %rmse)

Root Mean Squared Error (RMSE) on test data = 0.449444 


In [42]:
"""

Create the prediction dataset
"""

'\n\nCreate the prediction dataset\n'

In [189]:
# Create the prediction dataset
df_pred_results = pred_results['regionName','number_of_students']

# Rename the prediction field
df_pred_results = df_pred_results.withColumnRenamed('prediction','Pred_number_of_students')

# Add more columns to our prediction dataset for 2022
df_pred_results = df_pred_results.withColumn('Year', lit(2022)).withColumn('RMSE', lit(rmse))

# Preview df_pred_results
df_pred_results.show(5)

+---------------+------------------+----+------------------+
|     regionName|number_of_students|Year|              RMSE|
+---------------+------------------+----+------------------+
|Alpes-Maritimes|                 1|2022|0.4494441010848847|
|      Normandie|                 1|2022|0.4494441010848847|
|Hauts-de-France|                 2|2022|0.4494441010848847|
|    Rhône-Alpes|                 1|2022|0.4494441010848847|
|  Île-de-France|                 4|2022|0.4494441010848847|
+---------------+------------------+----+------------------+



In [184]:
# Load the prediction dataset into mongodb
# Write df_pred_results
df_pred_results.write.format('mongo').mode('overwrite').option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Student.df_pred_results').save()