In [2]:
from pyspark.sql import SparkSession
# Create the Spark session for this part of the notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Missing")
    .getOrCreate()
)

In [3]:
## Reading the dataset
# header=True: the first CSV row supplies the column names.
# inferSchema=True: column types are inferred from the data rather than read as strings.
training = spark.read.csv(
    'filter.csv',
    header=True,
    inferSchema=True,
)
training.show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|   David| 21|        10| 30000|
|    Paul| 31|         3| 25000|
|Jonathan| 26|         6| 18000|
| Phillip| 19|         4| 16000|
|Suleiman| 23|         5| 21000|
+--------+---+----------+------+



In [4]:
# Print the name, inferred type and nullability of every column in `training`.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [5]:
# List the column names of the loaded DataFrame.
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [9]:
# [Age, Experience] -> one vector column -> the independent features for the model
from pyspark.ml.feature import VectorAssembler

# Pack the two numeric predictor columns into a single vector column named
# "Independent Features".
featureassembler = VectorAssembler(
    inputCols=["Age", "Experience"],
    outputCol="Independent Features",
)

In [10]:
# Apply the assembler: `output` is `training` plus the "Independent Features" vector column.
output = featureassembler.transform(training)
output.show()

+--------+---+----------+------+--------------------+
|    Name|Age|Experience|Salary|Independent Features|
+--------+---+----------+------+--------------------+
|   David| 21|        10| 30000|         [21.0,10.0]|
|    Paul| 31|         3| 25000|          [31.0,3.0]|
|Jonathan| 26|         6| 18000|          [26.0,6.0]|
| Phillip| 19|         4| 16000|          [19.0,4.0]|
|Suleiman| 23|         5| 21000|          [23.0,5.0]|
+--------+---+----------+------+--------------------+



In [11]:
# Confirm the assembled vector column was appended after the original columns.
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Features']

In [15]:
# Keep only what the regression needs: the feature vector and the Salary label.
finalized_data = output.select(
    "Independent Features",
    "Salary",
)
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [21.0,10.0]| 30000|
|          [31.0,3.0]| 25000|
|          [26.0,6.0]| 18000|
|          [19.0,4.0]| 16000|
|          [23.0,5.0]| 21000|
+--------------------+------+



In [23]:
from pyspark.ml.regression import LinearRegression

## Train test split
# A fixed seed makes the split, and therefore the fitted coefficients, repeatable
# across Restart & Run All. NOTE: the dataset shown above has only five rows, so a
# random 75/25 split can leave very few (or no) rows on one side.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name. `regressor` is the fitted model,
# which the following cells read (.coefficients, .intercept, .evaluate).
lr_estimator = LinearRegression(featuresCol="Independent Features", labelCol="Salary")
regressor = lr_estimator.fit(train_data)

24/06/17 22:58:54 WARN Instrumentation: [55de4f81] regParam is zero, which might cause numerical instability and overfitting.


In [24]:
### Coefficients
# One fitted weight per entry of the feature vector, in the assembler's inputCols order
# (Age, Experience).
regressor.coefficients

DenseVector([659.7622, 1729.9578])

In [25]:
### Intercept
# Constant term of the fitted linear model.
regressor.intercept

-3522.0560030688916

In [None]:
### Prediction
# Evaluate the fitted model on the held-out split, then display the rows together with
# the model's predictions.
pred_results = regressor.evaluate(test_data)
pred_results.predictions.show()

## Linear Regression with PySpark

In [2]:
from pyspark.sql import SparkSession
# Create the Spark session for the salary-regression part, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Test")
    .getOrCreate()
)

In [11]:
# Location of the salary dataset, kept in a variable so it is easy to change.
file_path = 'Dataset_salary.csv'

# header=True: the first row supplies column names; inferSchema=True: types are inferred.
df = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
)
df.show()

+----------------+--------------------+------+------------+------------------+------------+
|experience_level|           job_title|salary|company_size|employee_residence|remote_ratio|
+----------------+--------------------+------+------------+------------------+------------+
|              SE|         AI Engineer|202730|           M|                US|          90|
|              SE|         AI Engineer| 92118|           M|                US|          40|
|              SE|       Data Engineer|130500|           M|                US|          36|
|              SE|       Data Engineer| 96000|           M|                US|          55|
|              SE|Machine Learning ...|190000|           M|                US|          26|
|              SE|Machine Learning ...|160000|           M|                US|          33|
|              MI|         ML Engineer|400000|           M|                US|          16|
|              MI|         ML Engineer| 65000|           M|                US|  

In [None]:
# Print the name, inferred type and nullability of every column in `df`.
df.printSchema()

In [None]:
# List the column names of the salary dataset.
df.columns

In [8]:
### Handling Categorical values
from pyspark.ml.feature import StringIndexer


In [25]:
# Map each distinct job_title string to a numeric index stored in "Job_t_indexed".
indexer = StringIndexer(
    inputCol='job_title',
    outputCol="Job_t_indexed",
)
# Fit on the raw frame to learn the label -> index mapping, then apply it to the same frame.
job_title_model = indexer.fit(df)
df_r = job_title_model.transform(df)
df_r.show()

+----------------+--------------------+------+------------+------------------+------------+-------------+
|experience_level|           job_title|salary|company_size|employee_residence|remote_ratio|Job_t_indexed|
+----------------+--------------------+------+------------+------------------+------------+-------------+
|              SE|         AI Engineer|202730|           M|                US|          90|          8.0|
|              SE|         AI Engineer| 92118|           M|                US|          40|          8.0|
|              SE|       Data Engineer|130500|           M|                US|          36|          2.0|
|              SE|       Data Engineer| 96000|           M|                US|          55|          2.0|
|              SE|Machine Learning ...|190000|           M|                US|          26|          3.0|
|              SE|Machine Learning ...|160000|           M|                US|          33|          3.0|
|              MI|         ML Engineer|400000|

In [26]:
# Index the remaining categorical columns in one pass.
# The output names (including the misspelled 'size_inexed' / 'residence_inedexed') are
# kept exactly as they are because later cells reference them by these names.
categorical_cols = ['experience_level', 'company_size', 'employee_residence']
indexed_cols = ["experience_index", 'size_inexed', 'residence_inedexed']
indexer = StringIndexer(inputCols=categorical_cols, outputCols=indexed_cols)

# Make the cell safe to re-run: this cell rebinds df_r, so on a second run df_r already
# holds the index columns and StringIndexer would reject them as existing output columns.
# DataFrame.drop ignores names that are not present, so the first run is unaffected.
df_to_index = df_r.drop(*indexed_cols)
df_r = indexer.fit(df_to_index).transform(df_to_index)
df_r.show()

+----------------+--------------------+------+------------+------------------+------------+-------------+----------------+-----------+------------------+
|experience_level|           job_title|salary|company_size|employee_residence|remote_ratio|Job_t_indexed|experience_index|size_inexed|residence_inedexed|
+----------------+--------------------+------+------------+------------------+------------+-------------+----------------+-----------+------------------+
|              SE|         AI Engineer|202730|           M|                US|          90|          8.0|             0.0|        0.0|               0.0|
|              SE|         AI Engineer| 92118|           M|                US|          40|          8.0|             0.0|        0.0|               0.0|
|              SE|       Data Engineer|130500|           M|                US|          36|          2.0|             0.0|        0.0|               0.0|
|              SE|       Data Engineer| 96000|           M|                U

In [29]:
# Confirm the original columns plus the four index columns are present.
df_r.columns

['experience_level',
 'job_title',
 'salary',
 'company_size',
 'employee_residence',
 'remote_ratio',
 'Job_t_indexed',
 'experience_index',
 'size_inexed',
 'residence_inedexed']

In [37]:
from pyspark.ml.feature import VectorAssembler

# Columns packed into the feature vector, in this order.
# NOTE(review): 'experience_index' is computed by the indexing cell but is not listed
# here — confirm that leaving it out of the model is intentional.
feature_cols = ['remote_ratio', 'Job_t_indexed', 'size_inexed', 'residence_inedexed']

featureassembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="Independent Features",
)
output = featureassembler.transform(df_r)

In [38]:
# Display the indexed frame together with the assembled "Independent Features" column.
output.show()

+----------------+--------------------+------+------------+------------------+------------+-------------+----------------+-----------+------------------+--------------------+
|experience_level|           job_title|salary|company_size|employee_residence|remote_ratio|Job_t_indexed|experience_index|size_inexed|residence_inedexed|Independent Features|
+----------------+--------------------+------+------------+------------------+------------+-------------+----------------+-----------+------------------+--------------------+
|              SE|         AI Engineer|202730|           M|                US|          90|          8.0|             0.0|        0.0|               0.0|  [90.0,8.0,0.0,0.0]|
|              SE|         AI Engineer| 92118|           M|                US|          40|          8.0|             0.0|        0.0|               0.0|  [40.0,8.0,0.0,0.0]|
|              SE|       Data Engineer|130500|           M|                US|          36|          2.0|             0.0|   

In [39]:
# Show only the assembled vectors. Rows printed as (4,[0],[x]) are sparse vectors of
# size 4 whose only non-zero entry is index 0; the bracketed rows are dense vectors.
output.select("Independent Features").show()

+--------------------+
|Independent Features|
+--------------------+
|  [90.0,8.0,0.0,0.0]|
|  [40.0,8.0,0.0,0.0]|
|  [36.0,2.0,0.0,0.0]|
|  [55.0,2.0,0.0,0.0]|
|  [26.0,3.0,0.0,0.0]|
|  [33.0,3.0,0.0,0.0]|
|  [16.0,7.0,0.0,0.0]|
|  [30.0,7.0,0.0,0.0]|
|      (4,[0],[82.0])|
|      (4,[0],[81.0])|
|  [64.0,2.0,0.0,0.0]|
|  [78.0,2.0,0.0,0.0]|
| [88.0,18.0,0.0,0.0]|
| [95.0,18.0,0.0,0.0]|
|  [22.0,1.0,0.0,0.0]|
|  [25.0,1.0,0.0,0.0]|
|      (4,[0],[65.0])|
|      (4,[0],[89.0])|
|  [82.0,9.0,2.0,0.0]|
|  [12.0,9.0,2.0,0.0]|
+--------------------+
only showing top 20 rows



In [40]:
# Keep only the feature vector and the label for modelling.
# NOTE: the source column is named 'salary' (lower case); selecting "Salary" still
# resolves and the resulting column is shown as "Salary" in the output below —
# presumably due to Spark's default case-insensitive column resolution; confirm.
finalized_data = output.select("Independent Features", "Salary")

In [41]:
# Preview the two-column modelling frame (feature vector, label).
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|  [90.0,8.0,0.0,0.0]|202730|
|  [40.0,8.0,0.0,0.0]| 92118|
|  [36.0,2.0,0.0,0.0]|130500|
|  [55.0,2.0,0.0,0.0]| 96000|
|  [26.0,3.0,0.0,0.0]|190000|
|  [33.0,3.0,0.0,0.0]|160000|
|  [16.0,7.0,0.0,0.0]|400000|
|  [30.0,7.0,0.0,0.0]| 65000|
|      (4,[0],[82.0])|101520|
|      (4,[0],[81.0])| 45864|
|  [64.0,2.0,0.0,0.0]|172469|
|  [78.0,2.0,0.0,0.0]|114945|
| [88.0,18.0,0.0,0.0]|200000|
| [95.0,18.0,0.0,0.0]|150000|
|  [22.0,1.0,0.0,0.0]|156450|
|  [25.0,1.0,0.0,0.0]|119200|
|      (4,[0],[65.0])|170000|
|      (4,[0],[89.0])|130000|
|  [82.0,9.0,2.0,0.0]|222200|
|  [12.0,9.0,2.0,0.0]|136000|
+--------------------+------+
only showing top 20 rows



In [43]:
from pyspark.ml.regression import LinearRegression

## Training test split
# A fixed seed makes the 75/25 split, and therefore the fitted coefficients and the
# evaluation below, repeatable across Restart & Run All.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name. `regressor` is the fitted model,
# which the following cells read (.coefficients, .intercept, .evaluate).
lr_estimator = LinearRegression(featuresCol="Independent Features", labelCol="Salary")
regressor = lr_estimator.fit(train_data)


24/06/18 12:04:17 WARN Instrumentation: [e9febbaa] regParam is zero, which might cause numerical instability and overfitting.


In [44]:
# One fitted weight per entry of the feature vector, in the assembler's inputCols order
# (remote_ratio, Job_t_indexed, size_inexed, residence_inedexed).
regressor.coefficients

DenseVector([660.6937, 299.1765, 32542.9769, -22904.3268])

In [47]:
# Constant term of the fitted linear model.
regressor.intercept

97690.5581123637

In [48]:
### Predictions
# Evaluate the fitted model on the held-out split; the result exposes the per-row
# predictions used in the next cell.
pred_results = regressor.evaluate(test_data)

In [51]:
## Final comparison
# Show features, actual Salary and the model's prediction for each held-out row.
final = pred_results.predictions
final.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|      (4,[0],[49.0])|112300| 130064.5493129252|
|      (4,[0],[65.0])|170000| 140635.6484804555|
|      (4,[0],[81.0])| 45864| 151206.7476479858|
|      (4,[0],[82.0])|101520| 151867.4413459564|
|      (4,[0],[87.0])| 80000|155170.90983580964|
|  [12.0,9.0,2.0,0.0]|136000|173397.42479499098|
|  [16.0,7.0,0.0,0.0]|400000|110355.89273508193|
|  [17.0,2.0,0.0,0.0]|155000|109520.70396506118|
|  [20.0,4.0,0.0,0.0]|123800|112101.13804616967|
|  [24.0,5.0,0.0,3.0]| 45000| 46330.10905944172|
|  [26.0,3.0,0.0,0.0]|190000|115766.12374039524|
|  [27.0,5.0,0.0,0.0]| 93838|117025.17042556245|
| [30.0,12.0,0.0,0.0]| 95500|121101.48697466232|
|  [35.0,4.0,0.0,0.0]|195000|122011.54351572931|
|  [41.0,2.0,0.0,0.0]|100000|125377.35271635662|
|  [46.0,5.0,0.0,0.0]| 69535|129578.35068700465|
| [55.0,16.0,0.0,0.0]|240000| 138815.5353983215|
|  [56.0,4.0,0.0,0.0